1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/PostOrderIterator.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallPtrSet.h"
30#include "llvm/ADT/TypeSwitch.h"
31#include "llvm/Analysis/IVDescriptors.h"
32#include "llvm/Analysis/InstSimplifyFolder.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/Analysis/MemoryLocation.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/ScopedNoAliasAA.h"
37#include "llvm/Analysis/VectorUtils.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/TypeSize.h"
43#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
49bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
50 VPlan &Plan, const TargetLibraryInfo &TLI) {
51
52 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
53 Plan.getVectorLoopRegion());
54 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
55 // Skip blocks outside region
56 if (!VPBB->getParent())
57 break;
58 VPRecipeBase *Term = VPBB->getTerminator();
59 auto EndIter = Term ? Term->getIterator() : VPBB->end();
60 // Introduce each ingredient into VPlan.
61 for (VPRecipeBase &Ingredient :
62 make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) {
63
64 VPValue *VPV = Ingredient.getVPSingleValue();
65 if (!VPV->getUnderlyingValue())
66 continue;
67
68 Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue());
69
70 VPRecipeBase *NewRecipe = nullptr;
71 if (auto *PhiR = dyn_cast<VPPhi>(Val: &Ingredient)) {
72 auto *Phi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
73 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
74 for (VPValue *Op : PhiR->operands())
75 NewRecipe->addOperand(Operand: Op);
76 } else if (auto *VPI = dyn_cast<VPInstruction>(Val: &Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(N: 0), nullptr /*Mask*/,
82 false /*Consecutive*/, false /*Reverse*/, *VPI,
83 Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(N: 1), Ingredient.getOperand(N: 0),
87 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
88 Ingredient.getDebugLoc());
89 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
90 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc());
92 } else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96 NewRecipe = new VPWidenIntrinsicRecipe(
97 *CI, getVectorIntrinsicIDForCall(CI, TLI: &TLI),
98 drop_end(RangeOrContainer: Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
99 *VPI, CI->getDebugLoc());
100 } else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
101 NewRecipe = new VPWidenCastRecipe(
102 CI->getOpcode(), Ingredient.getOperand(N: 0), CI->getType(), CI,
103 VPIRFlags(*CI), VPIRMetadata(*CI));
104 } else {
105 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
106 *VPI, Ingredient.getDebugLoc());
107 }
108 } else {
109 assert(isa<VPWidenIntOrFpInductionRecipe>(&Ingredient) &&
110 "inductions must be created earlier");
111 continue;
112 }
113
114 NewRecipe->insertBefore(InsertPos: &Ingredient);
115 if (NewRecipe->getNumDefinedValues() == 1)
116 VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue());
117 else
118 assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
120 Ingredient.eraseFromParent();
121 }
122 }
123 return true;
124}
125
/// Helper for performing extra no-alias checks when sinking stores: recipes
/// in a known-safe exclude set are skipped, and SCEV is used to prove no-alias
/// between the group leader and other replicating stores.
127class SinkStoreInfo {
128 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
129 VPReplicateRecipe &GroupLeader;
130 PredicatedScalarEvolution &PSE;
131 const Loop &L;
132 VPTypeAnalysis &TypeInfo;
133
  // Return true if \p A and \p B are known not to alias for all VFs in the
  // plan, checked via the distance between the accesses.
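  //
  // Illustrative example (numbers chosen for exposition only): for two i64
  // stores (8 bytes each) whose addresses are a constant 64 bytes apart, and
  // a plan with VFs {2, 4}, MaxVF * MaxStoreSize = 4 * 8 = 32 <= 64, so the
  // accesses cannot overlap for any VF in the plan.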
136 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
137 if (A->getOpcode() != Instruction::Store ||
138 B->getOpcode() != Instruction::Store)
139 return false;
140
141 VPValue *AddrA = A->getOperand(N: 1);
142 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(V: AddrA, PSE, L: &L);
143 VPValue *AddrB = B->getOperand(N: 1);
144 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(V: AddrB, PSE, L: &L);
145 if (isa<SCEVCouldNotCompute>(Val: SCEVA) || isa<SCEVCouldNotCompute>(Val: SCEVB))
146 return false;
147
148 const APInt *Distance;
149 ScalarEvolution &SE = *PSE.getSE();
150 if (!match(S: SE.getMinusSCEV(LHS: SCEVA, RHS: SCEVB), P: m_scev_APInt(C&: Distance)))
151 return false;
152
153 const DataLayout &DL = SE.getDataLayout();
154 Type *TyA = TypeInfo.inferScalarType(V: A->getOperand(N: 0));
155 uint64_t SizeA = DL.getTypeStoreSize(Ty: TyA);
156 Type *TyB = TypeInfo.inferScalarType(V: B->getOperand(N: 0));
157 uint64_t SizeB = DL.getTypeStoreSize(Ty: TyB);
158
    // Use the maximum store size to ensure no overlap from either direction.
    // Currently only fixed sizes are handled, as this is only used for
    // replicated stores (VPReplicateRecipes).
162 uint64_t MaxStoreSize = std::max(a: SizeA, b: SizeB);
163
164 auto VFs = B->getParent()->getPlan()->vectorFactors();
165 ElementCount MaxVF = *max_element(Range&: VFs, C: ElementCount::isKnownLT);
166 if (MaxVF.isScalable())
167 return false;
168 return Distance->abs().uge(
169 RHS: MaxVF.multiplyCoefficientBy(RHS: MaxStoreSize).getFixedValue());
170 }
171
172public:
173 SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
174 VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
175 const Loop &L, VPTypeAnalysis &TypeInfo)
176 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
177 L(L), TypeInfo(TypeInfo) {}
178
179 /// Return true if \p R should be skipped during alias checking, either
180 /// because it's in the exclude set or because no-alias can be proven via
181 /// SCEV.
182 bool shouldSkip(VPRecipeBase &R) const {
183 auto *Store = dyn_cast<VPReplicateRecipe>(Val: &R);
184 return ExcludeRecipes.contains(Ptr: &R) ||
185 (Store && isNoAliasViaDistance(A: Store, B: &GroupLeader));
186 }
187};
188
189/// Check if a memory operation doesn't alias with memory operations in blocks
190/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
191/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
192/// checked (for load hoisting). Otherwise recipes that both read and write
193/// memory are checked, and SCEV is used to prove no-alias between the group
194/// leader and other replicate recipes (for store sinking).
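///
/// For illustration (conceptual sketch): when hoisting a load whose location
/// carries !alias.scope metadata for scope !A, an intervening store annotated
/// with !noalias covering !A is known not to alias the load and the walk
/// continues past it; a write without such metadata conservatively blocks the
/// transform.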
195static bool
196canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
197 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
198 std::optional<SinkStoreInfo> SinkInfo = {}) {
199 bool CheckReads = SinkInfo.has_value();
200 if (!MemLoc.AATags.Scope)
201 return false;
202
203 const AAMDNodes &MemAA = MemLoc.AATags;
204
205 for (VPBlockBase *Block = FirstBB; Block;
206 Block = Block->getSingleSuccessor()) {
207 assert(Block->getNumSuccessors() <= 1 &&
208 "Expected at most one successor in block chain");
209 auto *VPBB = cast<VPBasicBlock>(Val: Block);
210 for (VPRecipeBase &R : *VPBB) {
211 if (SinkInfo && SinkInfo->shouldSkip(R))
212 continue;
213
214 // Skip recipes that don't need checking.
215 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
216 continue;
217
218 auto Loc = vputils::getMemoryLocation(R);
219 if (!Loc)
220 // Conservatively assume aliasing for memory operations without
221 // location.
222 return false;
223
224 // For reads, check if they don't alias in the reverse direction and
225 // skip if so.
226 if (CheckReads && R.mayReadFromMemory() &&
227 !ScopedNoAliasAAResult::mayAliasInScopes(Scopes: Loc->AATags.Scope,
228 NoAlias: MemAA.NoAlias))
229 continue;
230
231 // Check if the memory operations may alias in the forward direction.
232 if (ScopedNoAliasAAResult::mayAliasInScopes(Scopes: MemAA.Scope,
233 NoAlias: Loc->AATags.NoAlias))
234 return false;
235 }
236
237 if (Block == LastBB)
238 break;
239 }
240 return true;
241}
242
243/// Return true if we do not know how to (mechanically) hoist or sink \p R out
244/// of a loop region.
245static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
246 // Assumes don't alias anything or throw; as long as they're guaranteed to
247 // execute, they're safe to hoist.
248 if (match(V: &R, P: m_Intrinsic<Intrinsic::assume>()))
249 return false;
250
251 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
252 // memory location is not modified in the vector loop.
253 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
254 return true;
255
256 // Allocas cannot be hoisted.
257 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
258 return RepR && RepR->getOpcode() == Instruction::Alloca;
259}
260
261static bool sinkScalarOperands(VPlan &Plan) {
262 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
263 bool ScalarVFOnly = Plan.hasScalarVFOnly();
264 bool Changed = false;
265
266 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
267 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
268 VPBasicBlock *SinkTo, VPValue *Op) {
269 auto *Candidate =
270 dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe());
271 if (!Candidate)
272 return;
273
274 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
275 // for now.
276 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: Candidate))
277 return;
278
279 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(R: *Candidate))
280 return;
281
282 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: Candidate))
283 if (!ScalarVFOnly && RepR->isSingleScalar())
284 return;
285
286 WorkList.insert(X: {SinkTo, Candidate});
287 };
288
289 // First, collect the operands of all recipes in replicate blocks as seeds for
290 // sinking.
291 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Range: Iter)) {
292 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
293 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
294 continue;
295 VPBasicBlock *VPBB = cast<VPBasicBlock>(Val: EntryVPBB->getSuccessors().front());
296 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
297 continue;
298 for (auto &Recipe : *VPBB)
299 for (VPValue *Op : Recipe.operands())
300 InsertIfValidSinkCandidate(VPBB, Op);
301 }
302
303 // Try to sink each replicate or scalar IV steps recipe in the worklist.
304 for (unsigned I = 0; I != WorkList.size(); ++I) {
305 VPBasicBlock *SinkTo;
306 VPSingleDefRecipe *SinkCandidate;
307 std::tie(args&: SinkTo, args&: SinkCandidate) = WorkList[I];
308
309 // All recipe users of SinkCandidate must be in the same block SinkTo or all
310 // users outside of SinkTo must only use the first lane of SinkCandidate. In
311 // the latter case, we need to duplicate SinkCandidate.
312 auto UsersOutsideSinkTo =
313 make_filter_range(Range: SinkCandidate->users(), Pred: [SinkTo](VPUser *U) {
314 return cast<VPRecipeBase>(Val: U)->getParent() != SinkTo;
315 });
316 if (any_of(Range&: UsersOutsideSinkTo, P: [SinkCandidate](VPUser *U) {
317 return !U->usesFirstLaneOnly(Op: SinkCandidate);
318 }))
319 continue;
320 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
321
322 if (NeedsDuplicating) {
323 if (ScalarVFOnly)
324 continue;
325 VPSingleDefRecipe *Clone;
326 if (auto *SinkCandidateRepR =
327 dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as a separate transform,
329 // then cloning should be sufficient here.
330 Instruction *I = SinkCandidate->getUnderlyingInstr();
331 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
332 nullptr /*Mask*/, *SinkCandidateRepR,
333 *SinkCandidateRepR);
334 // TODO: add ".cloned" suffix to name of Clone's VPValue.
335 } else {
336 Clone = SinkCandidate->clone();
337 }
338
339 Clone->insertBefore(InsertPos: SinkCandidate);
340 SinkCandidate->replaceUsesWithIf(New: Clone, ShouldReplace: [SinkTo](VPUser &U, unsigned) {
341 return cast<VPRecipeBase>(Val: &U)->getParent() != SinkTo;
342 });
343 }
344 SinkCandidate->moveBefore(BB&: *SinkTo, I: SinkTo->getFirstNonPhi());
345 for (VPValue *Op : SinkCandidate->operands())
346 InsertIfValidSinkCandidate(SinkTo, Op);
347 Changed = true;
348 }
349 return Changed;
350}
351
352/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
353/// the mask.
354static VPValue *getPredicatedMask(VPRegionBlock *R) {
355 auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry());
356 if (!EntryBB || EntryBB->size() != 1 ||
357 !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin()))
358 return nullptr;
359
360 return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0);
361}
362
363/// If \p R is a triangle region, return the 'then' block of the triangle.
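///
/// Conceptually (illustrative sketch), a triangle region has the shape
///
///   entry (BranchOnMask)
///    |   \
///    |  then
///    |   /
///  exiting
///
/// where either successor of the entry may be the 'then' block.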
364static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
365 auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry());
366 if (EntryBB->getNumSuccessors() != 2)
367 return nullptr;
368
369 auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]);
370 auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]);
371 if (!Succ0 || !Succ1)
372 return nullptr;
373
374 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
375 return nullptr;
376 if (Succ0->getSingleSuccessor() == Succ1)
377 return Succ0;
378 if (Succ1->getSingleSuccessor() == Succ0)
379 return Succ1;
380 return nullptr;
381}
382
// Merge replicate regions into their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
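//
// Conceptually (illustrative sketch), this turns
//
//   [region1: entry1 - then1 - merge1] -> empty-bb ->
//   [region2: entry2 - then2 - merge2]
//
// into a single replicate region guarded by the shared mask, by moving the
// recipes from then1/merge1 into then2/merge2 and removing region1.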
386static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
387 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
388
  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks, to process up front. This is to
  // avoid iterator invalidation issues while merging regions.
392 SmallVector<VPRegionBlock *, 8> WorkList;
393 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
394 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
395 if (!Region1->isReplicator())
396 continue;
397 auto *MiddleBasicBlock =
398 dyn_cast_or_null<VPBasicBlock>(Val: Region1->getSingleSuccessor());
399 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
400 continue;
401
402 auto *Region2 =
403 dyn_cast_or_null<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
404 if (!Region2 || !Region2->isReplicator())
405 continue;
406
407 VPValue *Mask1 = getPredicatedMask(R: Region1);
408 VPValue *Mask2 = getPredicatedMask(R: Region2);
409 if (!Mask1 || Mask1 != Mask2)
410 continue;
411
    assert(Mask1 && Mask2 && "both regions must have conditions");
413 WorkList.push_back(Elt: Region1);
414 }
415
416 // Move recipes from Region1 to its successor region, if both are triangles.
417 for (VPRegionBlock *Region1 : WorkList) {
418 if (TransformedRegions.contains(Ptr: Region1))
419 continue;
420 auto *MiddleBasicBlock = cast<VPBasicBlock>(Val: Region1->getSingleSuccessor());
421 auto *Region2 = cast<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
422
423 VPBasicBlock *Then1 = getPredicatedThenBlock(R: Region1);
424 VPBasicBlock *Then2 = getPredicatedThenBlock(R: Region2);
425 if (!Then1 || !Then2)
426 continue;
427
428 // Note: No fusion-preventing memory dependencies are expected in either
429 // region. Such dependencies should be rejected during earlier dependence
430 // checks, which guarantee accesses can be re-ordered for vectorization.
431 //
432 // Move recipes to the successor region.
433 for (VPRecipeBase &ToMove : make_early_inc_range(Range: reverse(C&: *Then1)))
434 ToMove.moveBefore(BB&: *Then2, I: Then2->getFirstNonPhi());
435
436 auto *Merge1 = cast<VPBasicBlock>(Val: Then1->getSingleSuccessor());
437 auto *Merge2 = cast<VPBasicBlock>(Val: Then2->getSingleSuccessor());
438
439 // Move VPPredInstPHIRecipes from the merge block to the successor region's
440 // merge block. Update all users inside the successor region to use the
441 // original values.
442 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(Range: reverse(C&: *Merge1))) {
443 VPValue *PredInst1 =
444 cast<VPPredInstPHIRecipe>(Val: &Phi1ToMove)->getOperand(N: 0);
445 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
446 Phi1ToMoveV->replaceUsesWithIf(New: PredInst1, ShouldReplace: [Then2](VPUser &U, unsigned) {
447 return cast<VPRecipeBase>(Val: &U)->getParent() == Then2;
448 });
449
450 // Remove phi recipes that are unused after merging the regions.
451 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
452 Phi1ToMove.eraseFromParent();
453 continue;
454 }
455 Phi1ToMove.moveBefore(BB&: *Merge2, I: Merge2->begin());
456 }
457
458 // Remove the dead recipes in Region1's entry block.
459 for (VPRecipeBase &R :
460 make_early_inc_range(Range: reverse(C&: *Region1->getEntryBasicBlock())))
461 R.eraseFromParent();
462
463 // Finally, remove the first region.
464 for (VPBlockBase *Pred : make_early_inc_range(Range&: Region1->getPredecessors())) {
465 VPBlockUtils::disconnectBlocks(From: Pred, To: Region1);
466 VPBlockUtils::connectBlocks(From: Pred, To: MiddleBasicBlock);
467 }
468 VPBlockUtils::disconnectBlocks(From: Region1, To: MiddleBasicBlock);
469 TransformedRegions.insert(Ptr: Region1);
470 }
471
472 return !TransformedRegions.empty();
473}
474
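/// Build a replicate region for the predicated recipe \p PredRecipe. The
/// resulting region has the shape (illustrative sketch)
///
///   pred.<opcode>.entry      (VPBranchOnMaskRecipe on the mask)
///     |        \
///     |   pred.<opcode>.if       (unpredicated clone of \p PredRecipe)
///     |        /
///   pred.<opcode>.continue   (VPPredInstPHIRecipe, if \p PredRecipe has users)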
475static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
476 VPlan &Plan) {
477 Instruction *Instr = PredRecipe->getUnderlyingInstr();
478 // Build the triangular if-then region.
479 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
480 assert(Instr->getParent() && "Predicated instruction not in any basic block");
481 auto *BlockInMask = PredRecipe->getMask();
482 auto *MaskDef = BlockInMask->getDefiningRecipe();
483 auto *BOMRecipe = new VPBranchOnMaskRecipe(
484 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
485 auto *Entry =
486 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".entry", Recipe: BOMRecipe);
487
488 // Replace predicated replicate recipe with a replicate recipe without a
489 // mask but in the replicate region.
490 auto *RecipeWithoutMask = new VPReplicateRecipe(
491 PredRecipe->getUnderlyingInstr(), drop_end(RangeOrContainer: PredRecipe->operands()),
492 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
493 PredRecipe->getDebugLoc());
494 auto *Pred =
495 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".if", Recipe: RecipeWithoutMask);
496
497 VPPredInstPHIRecipe *PHIRecipe = nullptr;
498 if (PredRecipe->getNumUsers() != 0) {
499 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
500 RecipeWithoutMask->getDebugLoc());
501 PredRecipe->replaceAllUsesWith(New: PHIRecipe);
502 PHIRecipe->setOperand(I: 0, New: RecipeWithoutMask);
503 }
504 PredRecipe->eraseFromParent();
505 auto *Exiting =
506 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".continue", Recipe: PHIRecipe);
507 VPRegionBlock *Region =
508 Plan.createReplicateRegion(Entry, Exiting, Name: RegionName);
509
510 // Note: first set Entry as region entry and then connect successors starting
511 // from it in order, to propagate the "parent" of each VPBasicBlock.
512 VPBlockUtils::insertTwoBlocksAfter(IfTrue: Pred, IfFalse: Exiting, BlockPtr: Entry);
513 VPBlockUtils::connectBlocks(From: Pred, To: Exiting);
514
515 return Region;
516}
517
518static void addReplicateRegions(VPlan &Plan) {
519 SmallVector<VPReplicateRecipe *> WorkList;
520 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
521 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
522 for (VPRecipeBase &R : *VPBB)
523 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
524 if (RepR->isPredicated())
525 WorkList.push_back(Elt: RepR);
526 }
527 }
528
529 unsigned BBNum = 0;
530 for (VPReplicateRecipe *RepR : WorkList) {
531 VPBasicBlock *CurrentBlock = RepR->getParent();
532 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(SplitAt: RepR->getIterator());
533
534 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
535 SplitBlock->setName(
536 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
537 // Record predicated instructions for above packing optimizations.
538 VPRegionBlock *Region = createReplicateRegion(PredRecipe: RepR, Plan);
539 Region->setParent(CurrentBlock->getParent());
540 VPBlockUtils::insertOnEdge(From: CurrentBlock, To: SplitBlock, BlockPtr: Region);
541
542 VPRegionBlock *ParentRegion = Region->getParent();
543 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
544 ParentRegion->setExiting(SplitBlock);
545 }
546}
547
548/// Remove redundant VPBasicBlocks by merging them into their predecessor if
549/// the predecessor has a single successor.
550static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
551 SmallVector<VPBasicBlock *> WorkList;
552 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
553 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
554 // Don't fold the blocks in the skeleton of the Plan into their single
555 // predecessors for now.
556 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
557 if (!VPBB->getParent())
558 continue;
559 auto *PredVPBB =
560 dyn_cast_or_null<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
561 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
562 isa<VPIRBasicBlock>(Val: PredVPBB))
563 continue;
564 WorkList.push_back(Elt: VPBB);
565 }
566
567 for (VPBasicBlock *VPBB : WorkList) {
568 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
569 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
570 R.moveBefore(BB&: *PredVPBB, I: PredVPBB->end());
571 VPBlockUtils::disconnectBlocks(From: PredVPBB, To: VPBB);
572 auto *ParentRegion = VPBB->getParent();
573 if (ParentRegion && ParentRegion->getExiting() == VPBB)
574 ParentRegion->setExiting(PredVPBB);
575 for (auto *Succ : to_vector(Range: VPBB->successors())) {
576 VPBlockUtils::disconnectBlocks(From: VPBB, To: Succ);
577 VPBlockUtils::connectBlocks(From: PredVPBB, To: Succ);
578 }
579 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
580 }
581 return !WorkList.empty();
582}
583
584void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
585 // Convert masked VPReplicateRecipes to if-then region blocks.
586 addReplicateRegions(Plan);
587
588 bool ShouldSimplify = true;
589 while (ShouldSimplify) {
590 ShouldSimplify = sinkScalarOperands(Plan);
591 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
592 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
593 }
594}
595
596/// Remove redundant casts of inductions.
597///
598/// Such redundant casts are casts of induction variables that can be ignored,
599/// because we already proved that the casted phi is equal to the uncasted phi
600/// in the vectorized loop. There is no need to vectorize the cast - the same
601/// value can be used for both the phi and casts in the vector loop.
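///
/// For example (conceptual IR sketch): given
///   %iv    = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
///   %trunc = trunc i64 %iv to i32
///   %sext  = sext i32 %trunc to i64   ; recorded cast chain of the induction
/// a SCEV predicate has already proven %sext equal to %iv, so the recipe for
/// the final cast is replaced by the induction recipe and the remaining dead
/// casts are cleaned up later.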
602static void removeRedundantInductionCasts(VPlan &Plan) {
603 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
604 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
605 if (!IV || IV->getTruncInst())
606 continue;
607
608 // A sequence of IR Casts has potentially been recorded for IV, which
609 // *must be bypassed* when the IV is vectorized, because the vectorized IV
610 // will produce the desired casted value. This sequence forms a def-use
611 // chain and is provided in reverse order, ending with the cast that uses
612 // the IV phi. Search for the recipe of the last cast in the chain and
613 // replace it with the original IV. Note that only the final cast is
614 // expected to have users outside the cast-chain and the dead casts left
615 // over will be cleaned up later.
616 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
617 VPValue *FindMyCast = IV;
618 for (Instruction *IRCast : reverse(C&: Casts)) {
619 VPSingleDefRecipe *FoundUserCast = nullptr;
620 for (auto *U : FindMyCast->users()) {
621 auto *UserCast = dyn_cast<VPSingleDefRecipe>(Val: U);
622 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
623 FoundUserCast = UserCast;
624 break;
625 }
626 }
627 FindMyCast = FoundUserCast;
628 }
629 FindMyCast->replaceAllUsesWith(New: IV);
630 }
631}
632
633/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
634/// recipe, if it exists.
635static void removeRedundantCanonicalIVs(VPlan &Plan) {
636 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
637 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
638 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
639 for (VPUser *U : CanonicalIV->users()) {
640 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: U);
641 if (WidenNewIV)
642 break;
643 }
644
645 if (!WidenNewIV)
646 return;
647
648 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
649 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
650 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
651
652 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
653 continue;
654
655 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
656 // everything WidenNewIV's users need. That is, WidenOriginalIV will
657 // generate a vector phi or all users of WidenNewIV demand the first lane
658 // only.
659 if (!vputils::onlyScalarValuesUsed(Def: WidenOriginalIV) ||
660 vputils::onlyFirstLaneUsed(Def: WidenNewIV)) {
661 // We are replacing a wide canonical iv with a suitable wide induction.
      // It is used to compute the header mask, hence all lanes will be used
      // and we need to drop wrap flags that only apply to lanes guaranteed to
      // execute in the original scalar loop.
665 WidenOriginalIV->dropPoisonGeneratingFlags();
666 WidenNewIV->replaceAllUsesWith(New: WidenOriginalIV);
667 WidenNewIV->eraseFromParent();
668 return;
669 }
670 }
671}
672
673/// Returns true if \p R is dead and can be removed.
674static bool isDeadRecipe(VPRecipeBase &R) {
675 // Do remove conditional assume instructions as their conditions may be
676 // flattened.
677 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
678 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
679 match(V: RepR, P: m_Intrinsic<Intrinsic::assume>());
680 if (IsConditionalAssume)
681 return true;
682
683 if (R.mayHaveSideEffects())
684 return false;
685
686 // Recipe is dead if no user keeps the recipe alive.
687 return all_of(Range: R.definedValues(),
688 P: [](VPValue *V) { return V->getNumUsers() == 0; });
689}
690
691void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
692 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
693 Range: vp_post_order_deep(G: Plan.getEntry()))) {
694 // The recipes in the block are processed in reverse order, to catch chains
695 // of dead recipes.
696 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
697 if (isDeadRecipe(R)) {
698 R.eraseFromParent();
699 continue;
700 }
701
702 // Check if R is a dead VPPhi <-> update cycle and remove it.
703 auto *PhiR = dyn_cast<VPPhi>(Val: &R);
704 if (!PhiR || PhiR->getNumOperands() != 2)
705 continue;
706 VPUser *PhiUser = PhiR->getSingleUser();
707 if (!PhiUser)
708 continue;
709 VPValue *Incoming = PhiR->getOperand(N: 1);
710 if (PhiUser != Incoming->getDefiningRecipe() ||
711 Incoming->getNumUsers() != 1)
712 continue;
713 PhiR->replaceAllUsesWith(New: PhiR->getOperand(N: 0));
714 PhiR->eraseFromParent();
715 Incoming->getDefiningRecipe()->eraseFromParent();
716 }
717 }
718}
719
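/// Create recipes computing the scalar steps of an induction described by
/// (\p Kind, \p StartV, \p Step) on top of the canonical IV of \p Plan: a
/// derived IV provides the base value, which is truncated to the type of
/// \p TruncI if non-null, and a VPScalarIVStepsRecipe produces the per-lane
/// scalar values. The step is truncated in the preheader if it is wider than
/// the result type.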
720static VPScalarIVStepsRecipe *
721createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
722 Instruction::BinaryOps InductionOpcode,
723 FPMathOperator *FPBinOp, Instruction *TruncI,
724 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
725 VPBuilder &Builder) {
726 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
727 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
728 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
729 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
730 Kind, FPBinOp, Start: StartV, Current: CanonicalIV, Step, Name: "offset.idx");
731
732 // Truncate base induction if needed.
733 VPTypeAnalysis TypeInfo(Plan);
734 Type *ResultTy = TypeInfo.inferScalarType(V: BaseIV);
735 if (TruncI) {
736 Type *TruncTy = TruncI->getType();
737 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
738 "Not truncating.");
739 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
740 BaseIV = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: BaseIV, ResultTy: TruncTy, DL);
741 ResultTy = TruncTy;
742 }
743
744 // Truncate step if needed.
745 Type *StepTy = TypeInfo.inferScalarType(V: Step);
746 if (ResultTy != StepTy) {
747 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
748 "Not truncating.");
749 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
750 auto *VecPreheader =
751 cast<VPBasicBlock>(Val: HeaderVPBB->getSingleHierarchicalPredecessor());
752 VPBuilder::InsertPointGuard Guard(Builder);
753 Builder.setInsertPoint(VecPreheader);
754 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy, DL);
755 }
756 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, IV: BaseIV, Step,
757 VF: &Plan.getVF(), DL);
758}
759
760static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
761 SetVector<VPUser *> Users(llvm::from_range, V->users());
762 for (unsigned I = 0; I != Users.size(); ++I) {
763 VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users[I]);
764 if (isa<VPHeaderPHIRecipe>(Val: Cur))
765 continue;
766 for (VPValue *V : Cur->definedValues())
767 Users.insert_range(R: V->users());
768 }
769 return Users.takeVector();
770}
771
772/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
773/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
774/// generates scalar values.
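///
/// For example (conceptual sketch): a pointer induction starting at %start
/// with a step of 8 bytes becomes
///   %steps    = scalar-steps 0, 8
///   %next.gep = ptradd %start, %steps
/// producing scalar per-lane addresses instead of a vector of pointers.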
775static VPValue *
776scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
777 VPlan &Plan, VPBuilder &Builder) {
778 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
779 VPIRValue *StartV = Plan.getConstantInt(Ty: ID.getStep()->getType(), Val: 0);
780 VPValue *StepV = PtrIV->getOperand(N: 1);
781 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
782 Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
783 TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder);
784
785 return Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps,
786 DL: PtrIV->getDebugLoc(), Name: "next.gep");
787}
788
789/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
790/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
791/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this differs from
/// how int/fp inductions are handled). Legalize extract-from-ends using uniform
795/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
796/// the correct end value is available. Also optimize
797/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and updating
/// the original IV's users. This is an optional optimization to reduce the
/// need for vector extracts.
801static void legalizeAndOptimizeInductions(VPlan &Plan) {
802 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
803 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
804 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
805 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
806 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
807 if (!PhiR)
808 continue;
809
810 // Try to narrow wide and replicating recipes to uniform recipes, based on
811 // VPlan analysis.
812 // TODO: Apply to all recipes in the future, to replace legacy uniformity
813 // analysis.
814 auto Users = collectUsersRecursively(V: PhiR);
815 for (VPUser *U : reverse(C&: Users)) {
816 auto *Def = dyn_cast<VPRecipeWithIRFlags>(Val: U);
817 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
818 // Skip recipes that shouldn't be narrowed.
819 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) ||
820 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
821 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
822 continue;
823
      // Skip recipes whose lanes other than the first may be used.
825 if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def))
826 continue;
827
828 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
829 Def->operands(), /*IsUniform*/ true,
830 /*Mask*/ nullptr, /*Flags*/ *Def);
831 Clone->insertAfter(InsertPos: Def);
832 Def->replaceAllUsesWith(New: Clone);
833 }
834
835 // Replace wide pointer inductions which have only their scalars used by
836 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
837 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) {
838 if (!Plan.hasScalarVFOnly() &&
839 !PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF()))
840 continue;
841
842 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
843 PtrIV->replaceAllUsesWith(New: PtrAdd);
844 continue;
845 }
846
847 // Replace widened induction with scalar steps for users that only use
848 // scalars.
849 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
850 if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) {
851 return U->usesScalars(Op: WideIV);
852 }))
853 continue;
854
855 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
856 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
857 Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(),
858 FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
859 TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(),
860 DL: WideIV->getDebugLoc(), Builder);
861
862 // Update scalar users of IV to use Step instead.
863 if (!HasOnlyVectorVFs) {
864 assert(!Plan.hasScalableVF() &&
865 "plans containing a scalar VF cannot also include scalable VFs");
866 WideIV->replaceAllUsesWith(New: Steps);
867 } else {
868 bool HasScalableVF = Plan.hasScalableVF();
869 WideIV->replaceUsesWithIf(New: Steps,
870 ShouldReplace: [WideIV, HasScalableVF](VPUser &U, unsigned) {
871 if (HasScalableVF)
872 return U.usesFirstLaneOnly(Op: WideIV);
873 return U.usesScalars(Op: WideIV);
874 });
875 }
876 }
877}
878
879/// Check if \p VPV is an untruncated wide induction, either before or after the
880/// increment. If so return the header IV (before the increment), otherwise
881/// return null.
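///
/// For example (conceptual sketch): for a wide induction %wide.iv with step
/// %step, both %wide.iv itself and its increment
///   %iv.next = add %wide.iv, %step
/// are considered optimizable, and in both cases the recipe defining %wide.iv
/// is returned; a truncated int/fp induction yields nullptr.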
882static VPWidenInductionRecipe *
883getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
884 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV);
885 if (WideIV) {
886 // VPV itself is a wide induction, separately compute the end value for exit
887 // users if it is not a truncated IV.
888 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
889 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
890 }
891
892 // Check if VPV is an optimizable induction increment.
893 VPRecipeBase *Def = VPV->getDefiningRecipe();
894 if (!Def || Def->getNumOperands() != 2)
895 return nullptr;
896 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 0));
897 if (!WideIV)
898 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 1));
899 if (!WideIV)
900 return nullptr;
901
902 auto IsWideIVInc = [&]() {
903 auto &ID = WideIV->getInductionDescriptor();
904
905 // Check if VPV increments the induction by the induction step.
906 VPValue *IVStep = WideIV->getStepValue();
907 switch (ID.getInductionOpcode()) {
908 case Instruction::Add:
909 return match(V: VPV, P: m_c_Add(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
910 case Instruction::FAdd:
911 return match(V: VPV, P: m_c_FAdd(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
912 case Instruction::FSub:
913 return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV),
914 Op1: m_Specific(VPV: IVStep)));
915 case Instruction::Sub: {
916 // IVStep will be the negated step of the subtraction. Check if Step == -1
917 // * IVStep.
918 VPValue *Step;
919 if (!match(V: VPV, P: m_Sub(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))))
920 return false;
921 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(V: IVStep, PSE);
922 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(V: Step, PSE);
923 ScalarEvolution &SE = *PSE.getSE();
924 return !isa<SCEVCouldNotCompute>(Val: IVStepSCEV) &&
925 !isa<SCEVCouldNotCompute>(Val: StepSCEV) &&
926 IVStepSCEV == SE.getNegativeSCEV(V: StepSCEV);
927 }
928 default:
929 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
930 match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV),
931 Op1: m_Specific(VPV: WideIV->getStepValue())));
932 }
933 llvm_unreachable("should have been covered by switch above");
934 };
935 return IsWideIVInc() ? WideIV : nullptr;
936}
937
938/// Attempts to optimize the induction variable exit values for users in the
939/// early exit block.
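///
/// Conceptually (illustrative sketch), an exit value of the form
///   extract-lane (first-active-lane %mask), %wide.iv
/// is rewritten in terms of the canonical IV as
///   %end = %canonical.iv + first-active-lane(%mask)
/// (plus 1 if the incremented IV was used), followed by a derived-IV
/// computation if the wide induction is not the canonical one.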
940static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
941 VPTypeAnalysis &TypeInfo,
942 VPBlockBase *PredVPBB,
943 VPValue *Op,
944 PredicatedScalarEvolution &PSE) {
945 VPValue *Incoming, *Mask;
946 if (!match(V: Op, P: m_ExtractLane(Op0: m_FirstActiveLane(Op0: m_VPValue(V&: Mask)),
947 Op1: m_VPValue(V&: Incoming))))
948 return nullptr;
949
950 auto *WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
951 if (!WideIV)
952 return nullptr;
953
954 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
955 if (WideIntOrFp && WideIntOrFp->getTruncInst())
956 return nullptr;
957
958 // Calculate the final index.
959 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
960 auto *CanonicalIV = LoopRegion->getCanonicalIV();
961 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
962 VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB));
963
964 DebugLoc DL = cast<VPInstruction>(Val: Op)->getDebugLoc();
965 VPValue *FirstActiveLane =
966 B.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: Mask, DL);
967 Type *FirstActiveLaneType = TypeInfo.inferScalarType(V: FirstActiveLane);
968 FirstActiveLane = B.createScalarZExtOrTrunc(Op: FirstActiveLane, ResultTy: CanonicalIVType,
969 SrcTy: FirstActiveLaneType, DL);
970 VPValue *EndValue = B.createAdd(LHS: CanonicalIV, RHS: FirstActiveLane, DL);
971
  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
  // Incoming differs from it, the exit is using the incremented value and we
  // need to add the step.
975 if (Incoming != WideIV) {
976 VPValue *One = Plan.getConstantInt(Ty: CanonicalIVType, Val: 1);
977 EndValue = B.createAdd(LHS: EndValue, RHS: One, DL);
978 }
979
980 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
981 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
982 VPIRValue *Start = WideIV->getStartValue();
983 VPValue *Step = WideIV->getStepValue();
984 EndValue = B.createDerivedIV(
985 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
986 Start, Current: EndValue, Step);
987 }
988
989 return EndValue;
990}
991
992/// Attempts to optimize the induction variable exit values for users in the
993/// exit block coming from the latch in the original scalar loop.
994static VPValue *optimizeLatchExitInductionUser(
995 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
996 DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
997 VPValue *Incoming;
998 if (!match(V: Op, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Incoming))))
999 return nullptr;
1000
1001 auto *WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
1002 if (!WideIV)
1003 return nullptr;
1004
1005 VPValue *EndValue = EndValues.lookup(Val: WideIV);
1006 assert(EndValue && "end value must have been pre-computed");
1007
  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
  // Incoming differs from it, the exit is using the incremented value and we
  // don't need to subtract the step.
1011 if (Incoming != WideIV)
1012 return EndValue;
1013
1014 // Otherwise, subtract the step from the EndValue.
1015 VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)->getTerminator());
1016 VPValue *Step = WideIV->getStepValue();
1017 Type *ScalarTy = TypeInfo.inferScalarType(V: WideIV);
1018 if (ScalarTy->isIntegerTy())
1019 return B.createSub(LHS: EndValue, RHS: Step, DL: DebugLoc::getUnknown(), Name: "ind.escape");
1020 if (ScalarTy->isPointerTy()) {
1021 Type *StepTy = TypeInfo.inferScalarType(V: Step);
1022 auto *Zero = Plan.getConstantInt(Ty: StepTy, Val: 0);
1023 return B.createPtrAdd(Ptr: EndValue, Offset: B.createSub(LHS: Zero, RHS: Step),
1024 DL: DebugLoc::getUnknown(), Name: "ind.escape");
1025 }
1026 if (ScalarTy->isFloatingPointTy()) {
1027 const auto &ID = WideIV->getInductionDescriptor();
1028 return B.createNaryOp(
1029 Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1030 ? Instruction::FSub
1031 : Instruction::FAdd,
1032 Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()});
1033 }
1034 llvm_unreachable("all possible induction types must be handled");
1035 return nullptr;
1036}
1037
1038void VPlanTransforms::optimizeInductionExitUsers(
1039 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1040 PredicatedScalarEvolution &PSE) {
1041 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1042 VPTypeAnalysis TypeInfo(Plan);
1043 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1044 for (VPRecipeBase &R : ExitVPBB->phis()) {
1045 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
1046
1047 for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) {
1048 VPValue *Escape = nullptr;
1049 if (PredVPBB == MiddleVPBB)
1050 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1051 Op: ExitIRI->getOperand(N: Idx),
1052 EndValues, PSE);
1053 else
1054 Escape = optimizeEarlyExitInductionUser(
1055 Plan, TypeInfo, PredVPBB, Op: ExitIRI->getOperand(N: Idx), PSE);
1056 if (Escape)
1057 ExitIRI->setOperand(I: Idx, New: Escape);
1058 }
1059 }
1060 }
1061}
1062
/// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
1064/// them with already existing recipes expanding the same SCEV expression.
1065static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1066 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1067
1068 for (VPRecipeBase &R :
1069 make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) {
1070 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
1071 if (!ExpR)
1072 continue;
1073
1074 const auto &[V, Inserted] = SCEV2VPV.try_emplace(Key: ExpR->getSCEV(), Args&: ExpR);
1075 if (Inserted)
1076 continue;
1077 ExpR->replaceAllUsesWith(New: V->second);
1078 ExpR->eraseFromParent();
1079 }
1080}
1081
1082static void recursivelyDeleteDeadRecipes(VPValue *V) {
1083 SmallVector<VPValue *> WorkList;
1084 SmallPtrSet<VPValue *, 8> Seen;
1085 WorkList.push_back(Elt: V);
1086
1087 while (!WorkList.empty()) {
1088 VPValue *Cur = WorkList.pop_back_val();
1089 if (!Seen.insert(Ptr: Cur).second)
1090 continue;
1091 VPRecipeBase *R = Cur->getDefiningRecipe();
1092 if (!R)
1093 continue;
1094 if (!isDeadRecipe(R&: *R))
1095 continue;
1096 append_range(C&: WorkList, R: R->operands());
1097 R->eraseFromParent();
1098 }
1099}
1100
1101/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1102/// Returns an optional pair, where the first element indicates whether it is
1103/// an intrinsic ID.
1104static std::optional<std::pair<bool, unsigned>>
1105getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1106 return TypeSwitch<const VPSingleDefRecipe *,
1107 std::optional<std::pair<bool, unsigned>>>(R)
1108 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPWidenGEPRecipe,
1109 VPReplicateRecipe>(
1110 caseFn: [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1111 .Case(caseFn: [](const VPWidenIntrinsicRecipe *I) {
1112 return std::make_pair(x: true, y: I->getVectorIntrinsicID());
1113 })
1114 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>(caseFn: [](auto *I) {
1115 // For recipes that do not directly map to LLVM IR instructions,
1116 // assign opcodes after the last VPInstruction opcode (which is also
1117 // after the last IR Instruction opcode), based on the VPRecipeID.
1118 return std::make_pair(false,
1119 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1120 })
1121 .Default(defaultFn: [](auto *) { return std::nullopt; });
1122}
1123
1124/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1125/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1126/// Operands are foldable live-ins.
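///
/// For example (illustrative): an add of two constant live-ins such as
///   add i64 4, 8
/// folds to the live-in constant 12; compares, selects, GEPs and casts over
/// constant live-ins fold analogously via the corresponding folder entry
/// points.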
1127static VPIRValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
1128 ArrayRef<VPValue *> Operands,
1129 const DataLayout &DL,
1130 VPTypeAnalysis &TypeInfo) {
1131 auto OpcodeOrIID = getOpcodeOrIntrinsicID(R: &R);
1132 if (!OpcodeOrIID)
1133 return nullptr;
1134
1135 SmallVector<Value *, 4> Ops;
1136 for (VPValue *Op : Operands) {
1137 if (!match(V: Op, P: m_LiveIn()))
1138 return nullptr;
1139 Value *V = Op->getUnderlyingValue();
1140 if (!V)
1141 return nullptr;
1142 Ops.push_back(Elt: V);
1143 }
1144
1145 auto FoldToIRValue = [&]() -> Value * {
1146 InstSimplifyFolder Folder(DL);
1147 if (OpcodeOrIID->first) {
1148 if (R.getNumOperands() != 2)
1149 return nullptr;
1150 unsigned ID = OpcodeOrIID->second;
1151 return Folder.FoldBinaryIntrinsic(ID, LHS: Ops[0], RHS: Ops[1],
1152 Ty: TypeInfo.inferScalarType(V: &R));
1153 }
1154 unsigned Opcode = OpcodeOrIID->second;
1155 if (Instruction::isBinaryOp(Opcode))
1156 return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode),
1157 LHS: Ops[0], RHS: Ops[1]);
1158 if (Instruction::isCast(Opcode))
1159 return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops[0],
1160 DestTy: TypeInfo.inferScalarType(V: R.getVPSingleValue()));
1161 switch (Opcode) {
1162 case VPInstruction::LogicalAnd:
1163 return Folder.FoldSelect(C: Ops[0], True: Ops[1],
1164 False: ConstantInt::getNullValue(Ty: Ops[1]->getType()));
1165 case VPInstruction::Not:
1166 return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops[0],
1167 RHS: Constant::getAllOnesValue(Ty: Ops[0]->getType()));
1168 case Instruction::Select:
1169 return Folder.FoldSelect(C: Ops[0], True: Ops[1], False: Ops[2]);
1170 case Instruction::ICmp:
1171 case Instruction::FCmp:
1172 return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val&: R).getPredicate(), LHS: Ops[0],
1173 RHS: Ops[1]);
1174 case Instruction::GetElementPtr: {
1175 auto &RFlags = cast<VPRecipeWithIRFlags>(Val&: R);
1176 auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr());
1177 return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0],
1178 IdxList: drop_begin(RangeOrContainer&: Ops), NW: RFlags.getGEPNoWrapFlags());
1179 }
1180 case VPInstruction::PtrAdd:
1181 case VPInstruction::WidePtrAdd:
1182 return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: TypeInfo.getContext()),
1183 Ptr: Ops[0], IdxList: Ops[1],
1184 NW: cast<VPRecipeWithIRFlags>(Val&: R).getGEPNoWrapFlags());
1185 // An extract of a live-in is an extract of a broadcast, so return the
1186 // broadcasted element.
1187 case Instruction::ExtractElement:
1188 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1189 return Ops[0];
1190 }
1191 return nullptr;
1192 };
1193
1194 if (Value *V = FoldToIRValue())
1195 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1196 return nullptr;
1197}
1198
1199/// Try to simplify VPSingleDefRecipe \p Def.
1200static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1201 VPlan *Plan = Def->getParent()->getPlan();
1202
1203 // Simplification of live-in IR values for SingleDef recipes using
1204 // InstSimplifyFolder.
1205 const DataLayout &DL =
1206 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1207 if (VPValue *V = tryToFoldLiveIns(R&: *Def, Operands: Def->operands(), DL, TypeInfo))
1208 return Def->replaceAllUsesWith(New: V);
1209
1210 // Fold PredPHI LiveIn -> LiveIn.
1211 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: Def)) {
1212 VPValue *Op = PredPHI->getOperand(N: 0);
1213 if (isa<VPIRValue>(Val: Op))
1214 PredPHI->replaceAllUsesWith(New: Op);
1215 }
1216
1217 VPBuilder Builder(Def);
1218 VPValue *A;
1219 if (match(R: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) {
1220 Type *TruncTy = TypeInfo.inferScalarType(V: Def);
1221 Type *ATy = TypeInfo.inferScalarType(V: A);
1222 if (TruncTy == ATy) {
1223 Def->replaceAllUsesWith(New: A);
1224 } else {
1225 // Don't replace a scalarizing recipe with a widened cast.
1226 if (isa<VPReplicateRecipe>(Val: Def))
1227 return;
1228 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1229
1230 unsigned ExtOpcode = match(V: Def->getOperand(N: 0), P: m_SExt(Op0: m_VPValue()))
1231 ? Instruction::SExt
1232 : Instruction::ZExt;
1233 auto *Ext = Builder.createWidenCast(Opcode: Instruction::CastOps(ExtOpcode), Op: A,
1234 ResultTy: TruncTy);
1235 if (auto *UnderlyingExt = Def->getOperand(N: 0)->getUnderlyingValue()) {
          // UnderlyingExt has a distinct return type, used to retain the
          // legacy cost.
1237 Ext->setUnderlyingValue(UnderlyingExt);
1238 }
1239 Def->replaceAllUsesWith(New: Ext);
1240 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1241 auto *Trunc = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: A, ResultTy: TruncTy);
1242 Def->replaceAllUsesWith(New: Trunc);
1243 }
1244 }
1245#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
1248 VPTypeAnalysis TypeInfo2(*Plan);
1249 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1250 for (VPUser *U : A->users()) {
1251 auto *R = cast<VPRecipeBase>(U);
1252 for (VPValue *VPV : R->definedValues())
1253 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1254 }
1255#endif
1256 }
1257
1258 // Simplify (X && Y) || (X && !Y) -> X.
1259 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1260 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1261 // recipes to be visited during simplification.
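  // The identity follows from distributing the conjunction (shown here for
  // exposition): (X && Y) || (X && !Y) == X && (Y || !Y) == X && true == X.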
1262 VPValue *X, *Y, *Z;
1263 if (match(R: Def,
1264 P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1265 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) {
1266 Def->replaceAllUsesWith(New: X);
1267 Def->eraseFromParent();
1268 return;
1269 }
1270
1271 // x | 1 -> 1
1272 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes())))
1273 return Def->replaceAllUsesWith(New: Def->getOperand(N: Def->getOperand(N: 0) == X));
1274
1275 // x | 0 -> x
1276 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1277 return Def->replaceAllUsesWith(New: X);
1278
1279 // x | !x -> AllOnes
1280 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X))))) {
1281 return Def->replaceAllUsesWith(New: Plan->getOrAddLiveIn(
1282 V: ConstantInt::getAllOnesValue(Ty: TypeInfo.inferScalarType(V: Def))));
1283 }
1284
1285 // x & 0 -> 0
1286 if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1287 return Def->replaceAllUsesWith(New: Def->getOperand(N: Def->getOperand(N: 0) == X));
1288
1289 // x && false -> false
1290 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_False())))
1291 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1292
1293 // (x && y) || (x && z) -> x && (y || z)
1294 if (match(R: Def, P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1295 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue(V&: Z)))) &&
1296 // Simplify only if one of the operands has one use to avoid creating an
1297 // extra recipe.
1298 (!Def->getOperand(N: 0)->hasMoreThanOneUniqueUser() ||
1299 !Def->getOperand(N: 1)->hasMoreThanOneUniqueUser()))
1300 return Def->replaceAllUsesWith(
1301 New: Builder.createLogicalAnd(LHS: X, RHS: Builder.createOr(LHS: Y, RHS: Z)));
1302
1303 // x && !x -> 0
1304 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X)))))
1305 return Def->replaceAllUsesWith(New: Plan->getFalse());
1306
1307 if (match(R: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X))))
1308 return Def->replaceAllUsesWith(New: X);
1309
1310 // select c, false, true -> not c
1311 VPValue *C;
1312 if (match(R: Def, P: m_Select(Op0: m_VPValue(V&: C), Op1: m_False(), Op2: m_True())))
1313 return Def->replaceAllUsesWith(New: Builder.createNot(Operand: C));
1314
1315 // select !c, x, y -> select c, y, x
1316 if (match(R: Def, P: m_Select(Op0: m_Not(Op0: m_VPValue(V&: C)), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1317 Def->setOperand(I: 0, New: C);
1318 Def->setOperand(I: 1, New: Y);
1319 Def->setOperand(I: 2, New: X);
1320 return;
1321 }
1322
1323 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1324 // tail folding it is likely that x is a header mask and can be simplified
1325 // further.
1326 if (match(R: Def, P: m_LogicalAnd(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1327 Op1: m_VPValue(V&: Z))) &&
1328 X->hasMoreThanOneUniqueUser())
1329 return Def->replaceAllUsesWith(
1330 New: Builder.createLogicalAnd(LHS: X, RHS: Builder.createLogicalAnd(LHS: Y, RHS: Z)));
1331
1332 if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1333 return Def->replaceAllUsesWith(New: A);
1334
1335 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_One())))
1336 return Def->replaceAllUsesWith(New: A);
1337
1338 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1339 return Def->replaceAllUsesWith(
1340 New: Def->getOperand(N: 0) == A ? Def->getOperand(N: 1) : Def->getOperand(N: 0));
1341
1342 const APInt *APC;
1343 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) && APC->isPowerOf2())
1344 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1345 Opcode: Instruction::Shl,
1346 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1347 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1348
1349 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1350 // not allowed in them.
1351 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1352 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1353 if (!IsInReplicateRegion && match(R: Def, P: m_UDiv(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) &&
1354 APC->isPowerOf2())
1355 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1356 Opcode: Instruction::LShr,
1357 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1358 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1359
1360 if (match(R: Def, P: m_Not(Op0: m_VPValue(V&: A)))) {
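// not (not x) -> x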
1361 if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A))))
1362 return Def->replaceAllUsesWith(New: A);
1363
1364 // Try to fold Not into compares by adjusting the predicate in-place.
1365 CmpPredicate Pred;
1366 if (match(V: A, P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) {
1367 auto *Cmp = cast<VPRecipeWithIRFlags>(Val: A);
1368 if (all_of(Range: Cmp->users(),
1369 P: match_fn(P: m_CombineOr(
1370 L: m_Not(Op0: m_Specific(VPV: Cmp)),
1371 R: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(), Op2: m_VPValue()))))) {
1372 Cmp->setPredicate(CmpInst::getInversePredicate(pred: Pred));
1373 for (VPUser *U : to_vector(Range: Cmp->users())) {
1374 auto *R = cast<VPSingleDefRecipe>(Val: U);
1375 if (match(R, P: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1376 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1377 R->setOperand(I: 1, New: Y);
1378 R->setOperand(I: 2, New: X);
1379 } else {
1380 // not (cmp pred) -> cmp inv_pred
1381 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1382 R->replaceAllUsesWith(New: Cmp);
1383 }
1384 }
1385 // If Cmp doesn't have a debug location, use the one from the negation,
1386 // to preserve the location.
1387 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1388 Cmp->setDebugLoc(Def->getDebugLoc());
1389 }
1390 }
1391 }
1392
1393 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1394 // any-of (fcmp uno %A, %B), ...
1395 if (match(R: Def, P: m_AnyOf())) {
1396 SmallVector<VPValue *, 4> NewOps;
1397 VPRecipeBase *UnpairedCmp = nullptr;
1398 for (VPValue *Op : Def->operands()) {
1399 VPValue *X;
1400 if (Op->getNumUsers() > 1 ||
1401 !match(V: Op, P: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1402 Op1: m_Deferred(V: X)))) {
1403 NewOps.push_back(Elt: Op);
1404 } else if (!UnpairedCmp) {
1405 UnpairedCmp = Op->getDefiningRecipe();
1406 } else {
1407 NewOps.push_back(Elt: Builder.createFCmp(Pred: CmpInst::FCMP_UNO,
1408 A: UnpairedCmp->getOperand(N: 0), B: X));
1409 UnpairedCmp = nullptr;
1410 }
1411 }
1412
1413 if (UnpairedCmp)
1414 NewOps.push_back(Elt: UnpairedCmp->getVPSingleValue());
1415
1416 if (NewOps.size() < Def->getNumOperands()) {
1417 VPValue *NewAnyOf = Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: NewOps);
1418 return Def->replaceAllUsesWith(New: NewAnyOf);
1419 }
1420 }
1421
1422 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1423 // This is useful for fmax/fmin without fast-math flags, where we need to
1424 // check if any operand is NaN.
1425 if (match(R: Def, P: m_BinaryOr(Op0: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1426 Op1: m_Deferred(V: X)),
1427 Op1: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: Y),
1428 Op1: m_Deferred(V: Y))))) {
1429 VPValue *NewCmp = Builder.createFCmp(Pred: CmpInst::FCMP_UNO, A: X, B: Y);
1430 return Def->replaceAllUsesWith(New: NewCmp);
1431 }
1432
1433 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1434 if ((match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_VPValue(V&: A), Op2: m_One())) ||
1435 match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_ZeroInt(), Op2: m_VPValue()))) &&
1436 TypeInfo.inferScalarType(V: Def->getOperand(N: 1)) ==
1437 TypeInfo.inferScalarType(V: Def))
1438 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1439
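// wide-iv-step (x, 1) simplifies to x, truncated to the result type if
// necessary.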
1440 if (match(R: Def, P: m_VPInstruction<VPInstruction::WideIVStep>(Ops: m_VPValue(V&: X),
1441 Ops: m_One()))) {
1442 Type *WideStepTy = TypeInfo.inferScalarType(V: Def);
1443 if (TypeInfo.inferScalarType(V: X) != WideStepTy)
1444 X = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy);
1445 Def->replaceAllUsesWith(New: X);
1446 return;
1447 }
1448
1449 // For i1 vp.merges produced by AnyOf reductions:
1450 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1451 if (match(R: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A),
1452 Op2: m_VPValue(V&: X), Op3: m_VPValue())) &&
1453 match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) &&
1454 TypeInfo.inferScalarType(V: Def)->isIntegerTy(Bitwidth: 1)) {
1455 Def->setOperand(I: 1, New: Def->getOperand(N: 0));
1456 Def->setOperand(I: 0, New: Y);
1457 return;
1458 }
1459
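// A first-order recurrence phi whose start and backedge values are identical
// reduces to that value.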
1460 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) {
1461 if (Phi->getOperand(N: 0) == Phi->getOperand(N: 1))
1462 Phi->replaceAllUsesWith(New: Phi->getOperand(N: 0));
1463 return;
1464 }
1465
1466 // Look through ExtractLastLane.
1467 if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A)))) {
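// extract-last-lane (build-vector a0, ..., aN) -> aN.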
1468 if (match(V: A, P: m_BuildVector())) {
1469 auto *BuildVector = cast<VPInstruction>(Val: A);
1470 Def->replaceAllUsesWith(
1471 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 1));
1472 return;
1473 }
1474 if (Plan->hasScalarVFOnly())
1475 return Def->replaceAllUsesWith(New: A);
1476 }
1477
1478 // Look through ExtractPenultimateElement (BuildVector ....).
1479 if (match(R: Def, P: m_ExtractPenultimateElement(Op0: m_BuildVector()))) {
1480 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1481 Def->replaceAllUsesWith(
1482 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 2));
1483 return;
1484 }
1485
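// extract-element (build-vector a0, ..., aN), C -> aC for a constant index C.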
1486 uint64_t Idx;
1487 if (match(R: Def, P: m_ExtractElement(Op0: m_BuildVector(), Op1: m_ConstantInt(C&: Idx)))) {
1488 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1489 Def->replaceAllUsesWith(New: BuildVector->getOperand(N: Idx));
1490 return;
1491 }
1492
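// A build-vector with identical operands is just a broadcast of its first
// operand.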
1493 if (match(R: Def, P: m_BuildVector()) && all_equal(Range: Def->operands())) {
1494 Def->replaceAllUsesWith(
1495 New: Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Def->getOperand(N: 0)));
1496 return;
1497 }
1498
1499 // Look through a broadcast of a single-scalar when used as a select
1500 // condition; in that case the scalar condition can be used directly.
1501 if (match(R: Def,
1502 P: m_Select(Op0: m_Broadcast(Op0: m_VPValue(V&: C)), Op1: m_VPValue(), Op2: m_VPValue()))) {
1503 assert(vputils::isSingleScalar(C) &&
1504 "broadcast operand must be single-scalar");
1505 Def->setOperand(I: 0, New: C);
1506 return;
1507 }
1508
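// Phis with a single incoming value forward that value.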
1509 if (auto *Phi = dyn_cast<VPPhi>(Val: Def)) {
1510 if (Phi->getNumOperands() == 1)
1511 Phi->replaceAllUsesWith(New: Phi->getOperand(N: 0));
1512 return;
1513 }
1514
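// compute-reduction-result of a single live-in IR value is that value.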
1515 VPIRValue *IRV;
1516 if (Def->getNumOperands() == 1 &&
1517 match(R: Def, P: m_ComputeReductionResult(Op0: m_VPIRValue(V&: IRV))))
1518 return Def->replaceAllUsesWith(New: IRV);
1519
1520 // Some simplifications can only be applied after unrolling. Perform them
1521 // below.
1522 if (!Plan->isUnrolled())
1523 return;
1524
1525 // After unrolling, extract-lane may be used to extract values from multiple
1526 // scalar sources. Only simplify when extracting from a single scalar source.
1527 VPValue *LaneToExtract;
1528 if (match(R: Def, P: m_ExtractLane(Op0: m_VPValue(V&: LaneToExtract), Op1: m_VPValue(V&: A)))) {
1529 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1530 if (vputils::isSingleScalar(VPV: A))
1531 return Def->replaceAllUsesWith(New: A);
1532
1533 // Simplify extract-lane with single source to extract-element.
1534 Def->replaceAllUsesWith(New: Builder.createNaryOp(
1535 Opcode: Instruction::ExtractElement, Operands: {A, LaneToExtract}, DL: Def->getDebugLoc()));
1536 return;
1537 }
1538
1539 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1540 if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y))) && isa<VPIRValue>(Val: Y) &&
1541 isa<VPPhi>(Val: X)) {
1542 auto *Phi = cast<VPPhi>(Val: X);
1543 if (Phi->getOperand(N: 1) != Def && match(V: Phi->getOperand(N: 0), P: m_ZeroInt()) &&
1544 Phi->getSingleUser() == Def) {
1545 Phi->setOperand(I: 0, New: Y);
1546 Def->replaceAllUsesWith(New: Phi);
1547 return;
1548 }
1549 }
1550
1551 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1552 // just the pointer operand.
1553 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: Def))
1554 if (!VPR->getOffset() || match(V: VPR->getOffset(), P: m_ZeroInt()))
1555 return VPR->replaceAllUsesWith(New: VPR->getOperand(N: 0));
1556
1557 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1558 // the start index is zero and only the first lane is demanded.
1559 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) {
1560 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Def: Steps)) {
1561 Steps->replaceAllUsesWith(New: Steps->getOperand(N: 0));
1562 return;
1563 }
1564 }
1565 // Simplify redundant ReductionStartVector recipes after unrolling.
1566 VPValue *StartV;
1567 if (match(R: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>(
1568 Ops: m_VPValue(V&: StartV), Ops: m_VPValue(), Ops: m_VPValue()))) {
1569 Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) {
1570 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U);
1571 return PhiR && PhiR->isInLoop();
1572 });
1573 return;
1574 }
1575
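// extract-last-lane (broadcast x) -> x.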
1576 if (match(R: Def, P: m_ExtractLastLane(Op0: m_Broadcast(Op0: m_VPValue(V&: A))))) {
1577 Def->replaceAllUsesWith(New: A);
1578 return;
1579 }
1580
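// extract-last-lane of a single-scalar definition is the definition itself,
// provided all its other users already use only its scalar value.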
1581 if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A))) &&
1582 ((isa<VPInstruction>(Val: A) && vputils::isSingleScalar(VPV: A)) ||
1583 (isa<VPReplicateRecipe>(Val: A) &&
1584 cast<VPReplicateRecipe>(Val: A)->isSingleScalar())) &&
1585 all_of(Range: A->users(),
1586 P: [Def, A](VPUser *U) { return U->usesScalars(Op: A) || Def == U; })) {
1587 return Def->replaceAllUsesWith(New: A);
1588 }
1589
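// With a single unrolled part (UF = 1), extract-last-part is a no-op.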
1590 if (Plan->getUF() == 1 && match(R: Def, P: m_ExtractLastPart(Op0: m_VPValue(V&: A))))
1591 return Def->replaceAllUsesWith(New: A);
1592}
1593
1594void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1595 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1596 Plan.getEntry());
1597 VPTypeAnalysis TypeInfo(Plan);
1598 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
1599 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
1600 if (auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R))
1601 simplifyRecipe(Def, TypeInfo);
1602 }
1603}
1604
1605static void narrowToSingleScalarRecipes(VPlan &Plan) {
1606 if (Plan.hasScalarVFOnly())
1607 return;
1608
1609 // Try to narrow wide and replicating recipes to single scalar recipes,
1610 // based on VPlan analysis. Only process blocks in the loop region for now,
1611 // without traversing into nested regions, as recipes in replicate regions
1612 // cannot be converted yet.
1613 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1614 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1615 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
1616 if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe,
1617 VPWidenStoreRecipe>(Val: &R))
1618 continue;
1619 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1620 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1621 continue;
1622
1623 // Convert an unmasked scatter with a uniform address into
1624 // extract-last-lane + scalar store.
1625 // TODO: Add a profitability check comparing the cost of a scatter vs.
1626 // extract + scalar store.
1627 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(Val: &R);
1628 if (WidenStoreR && vputils::isSingleScalar(VPV: WidenStoreR->getAddr()) &&
1629 !WidenStoreR->isConsecutive()) {
1630 assert(!WidenStoreR->isReverse() &&
1631 "Non-consecutive memory recipes shouldn't be reversed");
1632 VPValue *Mask = WidenStoreR->getMask();
1633
1634 // Only convert the scatter to a scalar store if it is unmasked.
1635 // TODO: Support converting scatter masked by the header mask to scalar
1636 // store.
1637 if (Mask)
1638 continue;
1639
1640 auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane,
1641 {WidenStoreR->getOperand(N: 1)});
1642 Extract->insertBefore(InsertPos: WidenStoreR);
1643
1644 // TODO: Sink the scalar store recipe to middle block if possible.
1645 auto *ScalarStore = new VPReplicateRecipe(
1646 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1647 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1648 *WidenStoreR /*Metadata*/);
1649 ScalarStore->insertBefore(InsertPos: WidenStoreR);
1650 WidenStoreR->eraseFromParent();
1651 continue;
1652 }
1653
1654 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(Val: &R);
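// A replicated store to a single-scalar address can be narrowed to a single
// store of the last lane (of the last unrolled part, if the address is also
// uniform across VFs and UFs), since all lanes write to the same address.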
1655 if (RepR && isa<StoreInst>(Val: RepR->getUnderlyingInstr()) &&
1656 vputils::isSingleScalar(VPV: RepR->getOperand(N: 1))) {
1657 auto *Clone = new VPReplicateRecipe(
1658 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1659 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1660 *RepR /*Metadata*/, RepR->getDebugLoc());
1661 Clone->insertBefore(InsertPos: RepOrWidenR);
1662 VPBuilder Builder(Clone);
1663 VPValue *ExtractOp = Clone->getOperand(N: 0);
1664 if (vputils::isUniformAcrossVFsAndUFs(V: RepR->getOperand(N: 1)))
1665 ExtractOp =
1666 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: ExtractOp);
1667 ExtractOp =
1668 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: ExtractOp);
1669 Clone->setOperand(I: 0, New: ExtractOp);
1670 RepR->eraseFromParent();
1671 continue;
1672 }
1673
1674 // Skip recipes that aren't single scalars.
1675 if (!RepOrWidenR || !vputils::isSingleScalar(VPV: RepOrWidenR))
1676 continue;
1677
1678 // Skip recipes for which conversion to single-scalar would introduce
1679 // additional broadcasts. No extra broadcasts are needed if either only
1680 // the scalars of the recipe are used, or at least one of the operands
1681 // would require a broadcast. In the latter case, the single-scalar may
1682 // need to be broadcast, but another broadcast is removed.
1683 if (!all_of(Range: RepOrWidenR->users(),
1684 P: [RepOrWidenR](const VPUser *U) {
1685 if (auto *VPI = dyn_cast<VPInstruction>(Val: U)) {
1686 unsigned Opcode = VPI->getOpcode();
1687 if (Opcode == VPInstruction::ExtractLastLane ||
1688 Opcode == VPInstruction::ExtractLastPart ||
1689 Opcode == VPInstruction::ExtractPenultimateElement)
1690 return true;
1691 }
1692
1693 return U->usesScalars(Op: RepOrWidenR);
1694 }) &&
1695 none_of(Range: RepOrWidenR->operands(), P: [RepOrWidenR](VPValue *Op) {
1696 if (Op->getSingleUser() != RepOrWidenR)
1697 return false;
1698 // Non-constant live-ins require broadcasts, while constants do not
1699 // need explicit broadcasts.
1700 auto *IRV = dyn_cast<VPIRValue>(Val: Op);
1701 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(Val: IRV->getValue());
1702 auto *OpR = dyn_cast<VPReplicateRecipe>(Val: Op);
1703 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1704 }))
1705 continue;
1706
1707 auto *Clone = new VPReplicateRecipe(
1708 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1709 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1710 Clone->insertBefore(InsertPos: RepOrWidenR);
1711 RepOrWidenR->replaceAllUsesWith(New: Clone);
1712 if (isDeadRecipe(R&: *RepOrWidenR))
1713 RepOrWidenR->eraseFromParent();
1714 }
1715 }
1716}
1717
1718 /// Try to find a common value that is logically and'ed into all of \p Blend's
1719 /// masks and remove it from the masks.
1720static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1721 if (Blend->isNormalized())
1722 return;
1723 VPValue *CommonEdgeMask;
1724 if (!match(V: Blend->getMask(Idx: 0),
1725 P: m_LogicalAnd(Op0: m_VPValue(V&: CommonEdgeMask), Op1: m_VPValue())))
1726 return;
1727 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1728 if (!match(V: Blend->getMask(Idx: I),
1729 P: m_LogicalAnd(Op0: m_Specific(VPV: CommonEdgeMask), Op1: m_VPValue())))
1730 return;
1731 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1732 Blend->setMask(Idx: I, V: Blend->getMask(Idx: I)->getDefiningRecipe()->getOperand(N: 1));
1733}
1734
1735/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1736/// to make sure the masks are simplified.
1737static void simplifyBlends(VPlan &Plan) {
1738 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1739 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1740 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1741 auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R);
1742 if (!Blend)
1743 continue;
1744
1745 removeCommonBlendMask(Blend);
1746
1747 // Try to remove redundant blend recipes.
1748 SmallPtrSet<VPValue *, 4> UniqueValues;
1749 if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False()))
1750 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0));
1751 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1752 if (!match(V: Blend->getMask(Idx: I), P: m_False()))
1753 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I));
1754
1755 if (UniqueValues.size() == 1) {
1756 Blend->replaceAllUsesWith(New: *UniqueValues.begin());
1757 Blend->eraseFromParent();
1758 continue;
1759 }
1760
1761 if (Blend->isNormalized())
1762 continue;
1763
1764 // Normalize the blend so its first incoming value is used as the initial
1765 // value with the others blended into it.
1766
1767 unsigned StartIndex = 0;
1768 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1769 // If a value's mask is used only by the blend then it can be deadcoded.
1770 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1771 // that's used by multiple blends where it can be removed from them all.
1772 VPValue *Mask = Blend->getMask(Idx: I);
1773 if (Mask->getNumUsers() == 1 && !match(V: Mask, P: m_False())) {
1774 StartIndex = I;
1775 break;
1776 }
1777 }
1778
1779 SmallVector<VPValue *, 4> OperandsWithMask;
1780 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex));
1781
1782 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1783 if (I == StartIndex)
1784 continue;
1785 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I));
1786 OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I));
1787 }
1788
1789 auto *NewBlend =
1790 new VPBlendRecipe(cast_or_null<PHINode>(Val: Blend->getUnderlyingValue()),
1791 OperandsWithMask, Blend->getDebugLoc());
1792 NewBlend->insertBefore(InsertPos: &R);
1793
1794 VPValue *DeadMask = Blend->getMask(Idx: StartIndex);
1795 Blend->replaceAllUsesWith(New: NewBlend);
1796 Blend->eraseFromParent();
1797 recursivelyDeleteDeadRecipes(V: DeadMask);
1798
1799 // Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1800 VPValue *NewMask;
1801 if (NewBlend->getNumOperands() == 3 &&
1802 match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) {
1803 VPValue *Inc0 = NewBlend->getOperand(N: 0);
1804 VPValue *Inc1 = NewBlend->getOperand(N: 1);
1805 VPValue *OldMask = NewBlend->getOperand(N: 2);
1806 NewBlend->setOperand(I: 0, New: Inc1);
1807 NewBlend->setOperand(I: 1, New: Inc0);
1808 NewBlend->setOperand(I: 2, New: NewMask);
1809 if (OldMask->getNumUsers() == 0)
1810 cast<VPInstruction>(Val: OldMask)->eraseFromParent();
1811 }
1812 }
1813 }
1814}
1815
1816 /// Optimize the width of vector induction variables in \p Plan based on a
1817 /// known constant trip count, \p BestVF and \p BestUF.
1818static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1819 ElementCount BestVF,
1820 unsigned BestUF) {
1821 // Only proceed if we have not completely removed the vector region.
1822 if (!Plan.getVectorLoopRegion())
1823 return false;
1824
1825 const APInt *TC;
1826 if (!BestVF.isFixed() || !match(V: Plan.getTripCount(), P: m_APInt(C&: TC)))
1827 return false;
1828
1829 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1830 // and UF. Returns at least 8.
1831 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1832 APInt AlignedTC =
1833 Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align),
1834 RM: APInt::Rounding::UP);
1835 APInt MaxVal = AlignedTC - 1;
1836 return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8);
1837 };
1838 unsigned NewBitWidth =
1839 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1840
1841 LLVMContext &Ctx = Plan.getContext();
1842 auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth);
1843
1844 bool MadeChange = false;
1845
1846 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1847 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1848 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
1849
1850 // Currently only handle canonical IVs as it is trivial to replace the start
1851 // and stop values, and we currently only perform the optimization when the
1852 // IV has a single use.
1853 if (!WideIV || !WideIV->isCanonical() ||
1854 WideIV->hasMoreThanOneUniqueUser() ||
1855 NewIVTy == WideIV->getScalarType())
1856 continue;
1857
1858 // Currently only handle cases where the single user is a header-mask
1859 // comparison with the backedge-taken-count.
1860 VPUser *SingleUser = WideIV->getSingleUser();
1861 if (!SingleUser ||
1862 !match(U: SingleUser, P: m_ICmp(Op0: m_Specific(VPV: WideIV),
1863 Op1: m_Broadcast(Op0: m_Specific(
1864 VPV: Plan.getOrCreateBackedgeTakenCount())))))
1865 continue;
1866
1867 // Update IV operands and comparison bound to use new narrower type.
1868 auto *NewStart = Plan.getConstantInt(Ty: NewIVTy, Val: 0);
1869 WideIV->setStartValue(NewStart);
1870 auto *NewStep = Plan.getConstantInt(Ty: NewIVTy, Val: 1);
1871 WideIV->setStepValue(NewStep);
1872
1873 auto *NewBTC = new VPWidenCastRecipe(
1874 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
1875 nullptr, VPIRFlags::getDefaultFlags(Opcode: Instruction::Trunc));
1876 Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC);
1877 auto *Cmp = cast<VPInstruction>(Val: WideIV->getSingleUser());
1878 Cmp->setOperand(I: 1, New: NewBTC);
1879
1880 MadeChange = true;
1881 }
1882
1883 return MadeChange;
1884}
1885
1886/// Return true if \p Cond is known to be true for given \p BestVF and \p
1887/// BestUF.
1888static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1889 ElementCount BestVF, unsigned BestUF,
1890 PredicatedScalarEvolution &PSE) {
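// A disjunction is known true if any of its operands is known true for the
// given VF and UF.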
1891 if (match(V: Cond, P: m_BinaryOr(Op0: m_VPValue(), Op1: m_VPValue())))
1892 return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
1893 &PSE](VPValue *C) {
1894 return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, PSE);
1895 });
1896
1897 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1898 if (!match(V: Cond, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_EQ,
1899 Op0: m_Specific(VPV: CanIV->getBackedgeValue()),
1900 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))))
1901 return false;
1902
1903 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1904 // count is not conveniently available as SCEV so far, so we compare directly
1905 // against the original trip count. This is stricter than necessary, as we
1906 // will only return true if the trip count == vector trip count.
1907 const SCEV *VectorTripCount =
1908 vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
1909 if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
1910 VectorTripCount = vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
1911 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1912 "Trip count SCEV must be computable");
1913 ScalarEvolution &SE = *PSE.getSE();
1914 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
1915 const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
1916 return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: VectorTripCount, RHS: C);
1917}
1918
1919/// Try to replace multiple active lane masks used for control flow with
1920/// a single, wide active lane mask instruction followed by multiple
1921/// extract subvector intrinsics. This applies to the active lane mask
1922/// instructions both in the loop and in the preheader.
1923/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1924 /// new extracts from the first active lane mask, which has its last
1925/// operand (multiplier) set to UF.
1926static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1927 unsigned UF) {
1928 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1929 return false;
1930
1931 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1932 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1933 auto *Term = &ExitingVPBB->back();
1934
1935 using namespace llvm::VPlanPatternMatch;
1936 if (!match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
1937 Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))))
1938 return false;
1939
1940 auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
1941 LLVMContext &Ctx = Plan.getContext();
1942
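// Helper to extract UF subvectors of VF lanes each from a wide lane mask,
// one per unrolled part.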
1943 auto ExtractFromALM = [&](VPInstruction *ALM,
1944 SmallVectorImpl<VPValue *> &Extracts) {
1945 DebugLoc DL = ALM->getDebugLoc();
1946 for (unsigned Part = 0; Part < UF; ++Part) {
1947 SmallVector<VPValue *> Ops;
1948 Ops.append(IL: {ALM, Plan.getOrAddLiveIn(
1949 V: ConstantInt::get(Ty: IntegerType::getInt64Ty(C&: Ctx),
1950 V: VF.getKnownMinValue() * Part))});
1951 auto *Ext =
1952 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1953 IntegerType::getInt1Ty(C&: Ctx), {}, {}, DL);
1954 Extracts[Part] = Ext;
1955 Ext->insertAfter(InsertPos: ALM);
1956 }
1957 };
1958
1959 // Create a list of each active lane mask phi, ordered by unroll part.
1960 SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1961 for (VPRecipeBase &R : Header->phis()) {
1962 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(Val: &R);
1963 if (!Phi)
1964 continue;
1965 VPValue *Index = nullptr;
1966 match(V: Phi->getBackedgeValue(),
1967 P: m_ActiveLaneMask(Op0: m_VPValue(V&: Index), Op1: m_VPValue(), Op2: m_VPValue()));
1968 assert(Index && "Expected index from ActiveLaneMask instruction");
1969
1970 uint64_t Part;
1971 if (match(V: Index,
1972 P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
1973 Ops: m_VPValue(), Ops: m_ConstantInt(C&: Part))))
1974 Phis[Part] = Phi;
1975 else
1976 // Anything other than a CanonicalIVIncrementForPart is part 0
1977 Phis[0] = Phi;
1978 }
1979
1980 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1981 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1982
1983 auto *EntryALM = cast<VPInstruction>(Val: Phis[0]->getStartValue());
1984 auto *LoopALM = cast<VPInstruction>(Val: Phis[0]->getBackedgeValue());
1985
1986 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1987 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1988 "Expected incoming values of Phi to be ActiveLaneMasks");
1989
1990 // When using wide lane masks, the get.active.lane.mask intrinsic returns
1991 // VF x UF lanes; the multiplier (its last operand) is set to UF.
1992 VPValue *ALMMultiplier = Plan.getConstantInt(BitWidth: 64, Val: UF);
1993 EntryALM->setOperand(I: 2, New: ALMMultiplier);
1994 LoopALM->setOperand(I: 2, New: ALMMultiplier);
1995
1996 // Create UF x extract vectors and insert into preheader.
1997 SmallVector<VPValue *> EntryExtracts(UF);
1998 ExtractFromALM(EntryALM, EntryExtracts);
1999
2000 // Create UF x extract vectors and insert before the loop compare & branch,
2001 // updating the compare to use the first extract.
2002 SmallVector<VPValue *> LoopExtracts(UF);
2003 ExtractFromALM(LoopALM, LoopExtracts);
2004 VPInstruction *Not = cast<VPInstruction>(Val: Term->getOperand(N: 0));
2005 Not->setOperand(I: 0, New: LoopExtracts[0]);
2006
2007 // Update the incoming values of active lane mask phis.
2008 for (unsigned Part = 0; Part < UF; ++Part) {
2009 Phis[Part]->setStartValue(EntryExtracts[Part]);
2010 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2011 }
2012
2013 return true;
2014}
2015
2016/// Try to simplify the branch condition of \p Plan. This may restrict the
2017/// resulting plan to \p BestVF and \p BestUF.
2018static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
2019 unsigned BestUF,
2020 PredicatedScalarEvolution &PSE) {
2021 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2022 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2023 auto *Term = &ExitingVPBB->back();
2024 VPValue *Cond;
2025 if (match(V: Term, P: m_BranchOnCount()) ||
2026 match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
2027 Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()))))) {
2028 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2029 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2030 const SCEV *VectorTripCount =
2031 vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
2032 if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
2033 VectorTripCount =
2034 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
2035 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2036 "Trip count SCEV must be computable");
2037 ScalarEvolution &SE = *PSE.getSE();
2038 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
2039 const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
2040 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: VectorTripCount, RHS: C))
2041 return false;
2042 } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))) ||
2043 match(V: Term, P: m_BranchOnTwoConds(Op0: m_VPValue(), Op1: m_VPValue(V&: Cond)))) {
2044 // For BranchOnCond, check if we can prove the condition to be true using VF
2045 // and UF.
2046 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2047 return false;
2048 } else {
2049 return false;
2050 }
2051
2052 // The vector loop region only executes once. If possible, completely remove
2053 // the region, otherwise replace the terminator controlling the latch with
2054 // (BranchOnCond true).
2055 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2056 // support for other non-canonical widen induction recipes (e.g.,
2057 // VPWidenPointerInductionRecipe).
2058 // TODO: fold branch-on-constant after dissolving region.
2059 auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
2060 if (all_of(Range: Header->phis(), P: [](VPRecipeBase &Phi) {
2061 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi))
2062 return R->isCanonical();
2063 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2064 VPFirstOrderRecurrencePHIRecipe, VPPhi>(Val: &Phi);
2065 })) {
2066 for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) {
2067 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &HeaderR)) {
2068 VPBuilder Builder(Plan.getVectorPreheader());
2069 VPValue *StepV = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {},
2070 ResultTy: R->getScalarType());
2071 HeaderR.getVPSingleValue()->replaceAllUsesWith(New: StepV);
2072 HeaderR.eraseFromParent();
2073 continue;
2074 }
2075 auto *Phi = cast<VPPhiAccessors>(Val: &HeaderR);
2076 HeaderR.getVPSingleValue()->replaceAllUsesWith(New: Phi->getIncomingValue(Idx: 0));
2077 HeaderR.eraseFromParent();
2078 }
2079
2080 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2081 SmallVector<VPBlockBase *> Exits = to_vector(Range&: VectorRegion->getSuccessors());
2082 VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion);
2083 for (VPBlockBase *Exit : Exits)
2084 VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit);
2085
2086 for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry()))
2087 B->setParent(nullptr);
2088
2089 VPBlockUtils::connectBlocks(From: Preheader, To: Header);
2090
2091 for (VPBlockBase *Exit : Exits)
2092 VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit);
2093
2094 // Replace terminating branch-on-two-conds with branch-on-cond to early
2095 // exit.
2096 if (Exits.size() != 1) {
2097 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2098 "BranchOnTwoConds needs 2 remaining exits");
2099 VPBuilder(Term).createNaryOp(Opcode: VPInstruction::BranchOnCond,
2100 Operands: Term->getOperand(N: 0));
2101 }
2102 VPlanTransforms::simplifyRecipes(Plan);
2103 } else {
2104 // The vector region contains header phis for which we cannot remove the
2105 // loop region yet.
2106
2107 // For BranchOnTwoConds, set the latch exit condition to true directly.
2108 if (match(V: Term, P: m_BranchOnTwoConds())) {
2109 Term->setOperand(I: 1, New: Plan.getTrue());
2110 return true;
2111 }
2112
2113 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2114 {}, {}, Term->getDebugLoc());
2115 ExitingVPBB->appendRecipe(Recipe: BOC);
2116 }
2117
2118 Term->eraseFromParent();
2119
2120 return true;
2121}
2122
2123/// From the definition of llvm.experimental.get.vector.length,
2124/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2125static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
2126 PredicatedScalarEvolution &PSE) {
2127 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2128 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2129 for (VPRecipeBase &R : *VPBB) {
2130 VPValue *AVL;
2131 if (!match(V: &R, P: m_EVL(Op0: m_VPValue(V&: AVL))))
2132 continue;
2133
2134 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(V: AVL, PSE);
2135 if (isa<SCEVCouldNotCompute>(Val: AVLSCEV))
2136 continue;
2137 ScalarEvolution &SE = *PSE.getSE();
2138 const SCEV *VFSCEV = SE.getElementCount(Ty: AVLSCEV->getType(), EC: VF);
2139 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: AVLSCEV, RHS: VFSCEV))
2140 continue;
2141
2142 VPValue *Trunc = VPBuilder(&R).createScalarZExtOrTrunc(
2143 Op: AVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()), SrcTy: AVLSCEV->getType(),
2144 DL: R.getDebugLoc());
2145 R.getVPSingleValue()->replaceAllUsesWith(New: Trunc);
2146 return true;
2147 }
2148 }
2149 return false;
2150}
2151
2152void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
2153 unsigned BestUF,
2154 PredicatedScalarEvolution &PSE) {
2155 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2156 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2157
2158 bool MadeChange = tryToReplaceALMWithWideALM(Plan, VF: BestVF, UF: BestUF);
2159 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2160 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2161 MadeChange |= simplifyKnownEVL(Plan, VF: BestVF, PSE);
2162
2163 if (MadeChange) {
2164 Plan.setVF(BestVF);
2165 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2166 }
2167}
2168
2169/// Sink users of \p FOR after the recipe defining the previous value \p
2170/// Previous of the recurrence. \returns true if all users of \p FOR could be
2171/// re-arranged as needed or false if it is not possible.
2172static bool
2173sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
2174 VPRecipeBase *Previous,
2175 VPDominatorTree &VPDT) {
2176 // Collect recipes that need sinking.
2177 SmallVector<VPRecipeBase *> WorkList;
2178 SmallPtrSet<VPRecipeBase *, 8> Seen;
2179 Seen.insert(Ptr: Previous);
2180 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2181 // The previous value must not depend on the users of the recurrence phi;
2182 // otherwise, FOR would not be a fixed-order recurrence.
2183 if (SinkCandidate == Previous)
2184 return false;
2185
2186 if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) ||
2187 !Seen.insert(Ptr: SinkCandidate).second ||
2188 VPDT.properlyDominates(A: Previous, B: SinkCandidate))
2189 return true;
2190
2191 if (cannotHoistOrSinkRecipe(R: *SinkCandidate))
2192 return false;
2193
2194 WorkList.push_back(Elt: SinkCandidate);
2195 return true;
2196 };
2197
2198 // Recursively sink users of FOR after Previous.
2199 WorkList.push_back(Elt: FOR);
2200 for (unsigned I = 0; I != WorkList.size(); ++I) {
2201 VPRecipeBase *Current = WorkList[I];
2202 assert(Current->getNumDefinedValues() == 1 &&
2203 "only recipes with a single defined value expected");
2204
2205 for (VPUser *User : Current->getVPSingleValue()->users()) {
2206 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User)))
2207 return false;
2208 }
2209 }
2210
2211 // Keep recipes to sink ordered by dominance so earlier instructions are
2212 // processed first.
2213 sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2214 return VPDT.properlyDominates(A, B);
2215 });
2216
2217 for (VPRecipeBase *SinkCandidate : WorkList) {
2218 if (SinkCandidate == FOR)
2219 continue;
2220
2221 SinkCandidate->moveAfter(MovePos: Previous);
2222 Previous = SinkCandidate;
2223 }
2224 return true;
2225}
2226
2227/// Try to hoist \p Previous and its operands before all users of \p FOR.
2228static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
2229 VPRecipeBase *Previous,
2230 VPDominatorTree &VPDT) {
2231 if (cannotHoistOrSinkRecipe(R: *Previous))
2232 return false;
2233
2234 // Collect recipes that need hoisting.
2235 SmallVector<VPRecipeBase *> HoistCandidates;
2236 SmallPtrSet<VPRecipeBase *, 8> Visited;
2237 VPRecipeBase *HoistPoint = nullptr;
2238 // Find the closest hoist point by looking at all users of FOR and selecting
2239 // the recipe dominating all other users.
2240 for (VPUser *U : FOR->users()) {
2241 auto *R = cast<VPRecipeBase>(Val: U);
2242 if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint))
2243 HoistPoint = R;
2244 }
2245 assert(all_of(FOR->users(),
2246 [&VPDT, HoistPoint](VPUser *U) {
2247 auto *R = cast<VPRecipeBase>(U);
2248 return HoistPoint == R ||
2249 VPDT.properlyDominates(HoistPoint, R);
2250 }) &&
2251 "HoistPoint must dominate all users of FOR");
2252
2253 auto NeedsHoisting = [HoistPoint, &VPDT,
2254 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2255 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2256 if (!HoistCandidate)
2257 return nullptr;
2258 VPRegionBlock *EnclosingLoopRegion =
2259 HoistCandidate->getParent()->getEnclosingLoopRegion();
2260 assert((!HoistCandidate->getRegion() ||
2261 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2262 "CFG in VPlan should still be flat, without replicate regions");
2263 // Hoist candidate was already visited, no need to hoist.
2264 if (!Visited.insert(Ptr: HoistCandidate).second)
2265 return nullptr;
2266
2267 // The candidate is defined outside the loop region or is a header phi; it
2268 // already dominates the users of FOR without hoisting.
2269 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate))
2270 return nullptr;
2271
2272 // If we reached a recipe that dominates HoistPoint, we don't need to
2273 // hoist the recipe.
2274 if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
2275 return nullptr;
2276 return HoistCandidate;
2277 };
2278
2279 if (!NeedsHoisting(Previous->getVPSingleValue()))
2280 return true;
2281
2282 // Recursively try to hoist Previous and its operands before all users of FOR.
2283 HoistCandidates.push_back(Elt: Previous);
2284
2285 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2286 VPRecipeBase *Current = HoistCandidates[I];
2287 assert(Current->getNumDefinedValues() == 1 &&
2288 "only recipes with a single defined value expected");
2289 if (cannotHoistOrSinkRecipe(R: *Current))
2290 return false;
2291
2292 for (VPValue *Op : Current->operands()) {
2293 // If we reach FOR, it means the original Previous depends on some other
2294 // recurrence that in turn depends on FOR. If that is the case, we would
2295 // also need to hoist recipes involving the other FOR, which may break
2296 // dependencies.
2297 if (Op == FOR)
2298 return false;
2299
2300 if (auto *R = NeedsHoisting(Op)) {
2301 // Bail out if the recipe defines multiple values.
2302 // TODO: Hoisting such recipes requires additional handling.
2303 if (R->getNumDefinedValues() != 1)
2304 return false;
2305 HoistCandidates.push_back(Elt: R);
2306 }
2307 }
2308 }
2309
2310 // Order recipes to hoist by dominance so earlier instructions are processed
2311 // first.
2312 sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2313 return VPDT.properlyDominates(A, B);
2314 });
2315
2316 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2317 HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
2318 I: HoistPoint->getIterator());
2319 }
2320
2321 return true;
2322}
2323
2324bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
2325 VPBuilder &LoopBuilder) {
2326 VPDominatorTree VPDT(Plan);
2327
2328 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
2329 for (VPRecipeBase &R :
2330 Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
2331 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
2332 RecurrencePhis.push_back(Elt: FOR);
2333
2334 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2335 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
2336 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2337 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2338 // to terminate.
2339 while (auto *PrevPhi =
2340 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
2341 assert(PrevPhi->getParent() == FOR->getParent());
2342 assert(SeenPhis.insert(PrevPhi).second);
2343 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2344 }
2345
2346 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2347 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2348 return false;
2349
2350 // Introduce a recipe to combine the incoming and previous values of a
2351 // fixed-order recurrence.
2352 VPBasicBlock *InsertBlock = Previous->getParent();
2353 if (isa<VPHeaderPHIRecipe>(Val: Previous))
2354 LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi());
2355 else
2356 LoopBuilder.setInsertPoint(TheBB: InsertBlock,
2357 IP: std::next(x: Previous->getIterator()));
2358
2359 auto *RecurSplice =
2360 LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
2361 Operands: {FOR, FOR->getBackedgeValue()});
2362
2363 FOR->replaceAllUsesWith(New: RecurSplice);
2364 // Set the first operand of RecurSplice to FOR again, after replacing
2365 // all users.
2366 RecurSplice->setOperand(I: 0, New: FOR);
2367
2368 // Check for users extracting at the penultimate active lane of the FOR.
2369 // If only a single lane is active in the current iteration, we need to
2370 // select the last element from the previous iteration (from the FOR phi
2371 // directly).
2372 for (VPUser *U : RecurSplice->users()) {
2373 if (!match(U, P: m_ExtractLane(Op0: m_LastActiveLane(Op0: m_VPValue()),
2374 Op1: m_Specific(VPV: RecurSplice))))
2375 continue;
2376
2377 VPBuilder B(cast<VPInstruction>(Val: U));
2378 VPValue *LastActiveLane = cast<VPInstruction>(Val: U)->getOperand(N: 0);
2379 Type *I64Ty = Type::getInt64Ty(C&: Plan.getContext());
2380 VPValue *Zero = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I64Ty, V: 0));
2381 VPValue *One = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I64Ty, V: 1));
2382 VPValue *PenultimateIndex = B.createSub(LHS: LastActiveLane, RHS: One);
2383 VPValue *PenultimateLastIter =
2384 B.createNaryOp(Opcode: VPInstruction::ExtractLane,
2385 Operands: {PenultimateIndex, FOR->getBackedgeValue()});
2386 VPValue *LastPrevIter =
2387 B.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: FOR);
2388
2389 VPValue *Cmp = B.createICmp(Pred: CmpInst::ICMP_EQ, A: LastActiveLane, B: Zero);
2390 VPValue *Sel = B.createSelect(Cond: Cmp, TrueVal: LastPrevIter, FalseVal: PenultimateLastIter);
2391 cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: Sel);
2392 }
2393 }
2394 return true;
2395}
2396
2397void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
2398 for (VPRecipeBase &R :
2399 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2400 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
2401 if (!PhiR)
2402 continue;
2403 RecurKind RK = PhiR->getRecurrenceKind();
2404 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2405 RK != RecurKind::AddChainWithSubs)
2406 continue;
2407
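// Vectorizing the reduction may reorder its operations, so conservatively
// drop poison-generating (wrap) flags from all recipes in the reduction
// chain, i.e. the recursive users of the phi.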
2408 for (VPUser *U : collectUsersRecursively(V: PhiR))
2409 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
2410 RecWithFlags->dropPoisonGeneratingFlags();
2411 }
2412 }
2413}
2414
2415namespace {
2416struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2417 static bool isSentinel(const VPSingleDefRecipe *Def) {
2418 return Def == getEmptyKey() || Def == getTombstoneKey();
2419 }
2420
2421 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2422 /// return that source element type.
2423 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2424 // All VPInstructions that lower to GEPs must have the i8 source element
2425 // type (as they are PtrAdds), so we omit it.
2426 return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
2427 .Case(caseFn: [](const VPReplicateRecipe *I) -> Type * {
2428 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: I->getUnderlyingValue()))
2429 return GEP->getSourceElementType();
2430 return nullptr;
2431 })
2432 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2433 caseFn: [](auto *I) { return I->getSourceElementType(); })
2434 .Default(defaultFn: [](auto *) { return nullptr; });
2435 }
2436
2437 /// Returns true if recipe \p Def can be safely handled for CSE.
2438 static bool canHandle(const VPSingleDefRecipe *Def) {
2439 // We can extend the list of handled recipes in the future,
2440 // provided we account for the data embedded in them while checking for
2441 // equality or hashing.
2442 auto C = getOpcodeOrIntrinsicID(R: Def);
2443
2444 // The issue with (Insert|Extract)Value is that the index of the
2445 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2446 // VPlan.
2447 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2448 C->second == Instruction::ExtractValue)))
2449 return false;
2450
2451 // During CSE, we can only handle recipes that don't read from memory: if
2452 // they read from memory, there could be an intervening write to memory
2453 // before the next instance is CSE'd, leading to an incorrect result.
2454 return !Def->mayReadFromMemory();
2455 }
2456
2457 /// Hash the underlying data of \p Def.
2458 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2459 const VPlan *Plan = Def->getParent()->getPlan();
2460 VPTypeAnalysis TypeInfo(*Plan);
2461 hash_code Result = hash_combine(
2462 args: Def->getVPRecipeID(), args: getOpcodeOrIntrinsicID(R: Def),
2463 args: getGEPSourceElementType(R: Def), args: TypeInfo.inferScalarType(V: Def),
2464 args: vputils::isSingleScalar(VPV: Def), args: hash_combine_range(R: Def->operands()));
2465 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: Def))
2466 if (RFlags->hasPredicate())
2467 return hash_combine(args: Result, args: RFlags->getPredicate());
2468 return Result;
2469 }
2470
2471 /// Check equality of underlying data of \p L and \p R.
2472 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2473 if (isSentinel(Def: L) || isSentinel(Def: R))
2474 return L == R;
2475 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2476 getOpcodeOrIntrinsicID(R: L) != getOpcodeOrIntrinsicID(R) ||
2477 getGEPSourceElementType(R: L) != getGEPSourceElementType(R) ||
2478 vputils::isSingleScalar(VPV: L) != vputils::isSingleScalar(VPV: R) ||
2479 !equal(LRange: L->operands(), RRange: R->operands()))
2480 return false;
2481 assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
2482 "must have valid opcode info for both recipes");
2483 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(Val: L))
2484 if (LFlags->hasPredicate() &&
2485 LFlags->getPredicate() !=
2486 cast<VPRecipeWithIRFlags>(Val: R)->getPredicate())
2487 return false;
2488 // Recipes in replicate regions implicitly depend on the region's predicate.
2489 // If either recipe is in a replicate region, only consider them equal if
2490 // both have the same parent.
2491 const VPRegionBlock *RegionL = L->getRegion();
2492 const VPRegionBlock *RegionR = R->getRegion();
2493 if (((RegionL && RegionL->isReplicator()) ||
2494 (RegionR && RegionR->isReplicator())) &&
2495 L->getParent() != R->getParent())
2496 return false;
2497 const VPlan *Plan = L->getParent()->getPlan();
2498 VPTypeAnalysis TypeInfo(*Plan);
2499 return TypeInfo.inferScalarType(V: L) == TypeInfo.inferScalarType(V: R);
2500 }
2501};
2502} // end anonymous namespace
2503
2504/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2505/// Plan.
2506void VPlanTransforms::cse(VPlan &Plan) {
2507 VPDominatorTree VPDT(Plan);
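// Maps each handled recipe to the first equivalent recipe encountered during
// the traversal; later equivalent recipes are replaced by the mapped one if
// it dominates them.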
2508 DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2509
2510 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2511 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2512 for (VPRecipeBase &R : *VPBB) {
2513 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R);
2514 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2515 continue;
2516 if (VPSingleDefRecipe *V = CSEMap.lookup(Val: Def)) {
2517 // V must dominate Def for a valid replacement.
2518 if (!VPDT.dominates(A: V->getParent(), B: VPBB))
2519 continue;
2520 // Only keep flags present on both V and Def.
2521 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: V))
2522 RFlags->intersectFlags(Other: *cast<VPRecipeWithIRFlags>(Val: Def));
2523 Def->replaceAllUsesWith(New: V);
2524 continue;
2525 }
2526 CSEMap[Def] = Def;
2527 }
2528 }
2529}
2530
2531/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2532static void licm(VPlan &Plan) {
2533 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2534
2535 // Hoist any loop invariant recipes from the vector loop region to the
2536 // preheader. Perform a shallow traversal of the vector loop region, to
2537 // exclude recipes in replicate regions. Since the top-level blocks in the
2538 // vector loop region are guaranteed to execute if the vector pre-header is,
2539 // we don't need to check speculation safety.
2540 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2541 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2542 "Expected vector preheader's successor to be the vector loop region");
2543 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2544 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
2545 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2546 if (cannotHoistOrSinkRecipe(R))
2547 continue;
2548 if (any_of(Range: R.operands(), P: [](VPValue *Op) {
2549 return !Op->isDefinedOutsideLoopRegions();
2550 }))
2551 continue;
2552 R.moveBefore(BB&: *Preheader, I: Preheader->end());
2553 }
2554 }
2555
2556#ifndef NDEBUG
2557 VPDominatorTree VPDT(Plan);
2558#endif
2559 // Sink recipes that have no users inside the vector loop region, if all
2560 // their users are in the same exit block of the region.
2561 // TODO: Extend to sink recipes from inner loops.
2562 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2563 Range: vp_post_order_shallow(G: LoopRegion->getEntry()))) {
2564 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
2565 if (cannotHoistOrSinkRecipe(R))
2566 continue;
2567
2568 // TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
2569 // handles sunk recipes correctly.
2570 if (isa<VPReplicateRecipe>(Val: &R))
2571 continue;
2572
2573 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2574 // support recipes with multiple defined values (e.g., interleaved loads).
2575 auto *Def = cast<VPSingleDefRecipe>(Val: &R);
2576 // Skip recipes without users as we cannot determine a sink block.
2577 // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2578 // their execution frequency.
2579 if (Def->getNumUsers() == 0)
2580 continue;
2581
2582 VPBasicBlock *SinkBB = nullptr;
2583 // Cannot sink the recipe if any user
2584 // * is defined in any loop region, or
2585 // * is a phi, or
2586 // * is in a different block than another user.
2587 if (any_of(Range: Def->users(), P: [&SinkBB](VPUser *U) {
2588 auto *UserR = cast<VPRecipeBase>(Val: U);
2589 VPBasicBlock *Parent = UserR->getParent();
2590 // TODO: If the user is a PHI node, we should check the block of
2591 // incoming value. Support PHI node users if needed.
2592 if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2593 return true;
2594 // TODO: Support sinking when users are in multiple blocks.
2595 if (SinkBB && SinkBB != Parent)
2596 return true;
2597 SinkBB = Parent;
2598 return false;
2599 }))
2600 continue;
2601
2602 // Only sink to dedicated exit blocks of the loop region.
2603 if (SinkBB->getSinglePredecessor() != LoopRegion)
2604 continue;
2605
2606 // TODO: This will need to be a check instead of an assert after
2607 // conditional branches in vectorized loops are supported.
2608 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2609 "Defining block must dominate sink block");
2610 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2611 // just moving.
2612 Def->moveBefore(BB&: *SinkBB, I: SinkBB->getFirstNonPhi());
2613 }
2614 }
2615}
2616
2617void VPlanTransforms::truncateToMinimalBitwidths(
2618 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2619 if (Plan.hasScalarVFOnly())
2620 return;
2621 // Keep track of created truncates, so they can be re-used. Note that we
2622 // cannot use RAUW after creating a new truncate, as this could make
2623 // other uses have different types for their operands, making them invalidly
2624 // typed.
2625 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
2626 VPTypeAnalysis TypeInfo(Plan);
2627 VPBasicBlock *PH = Plan.getVectorPreheader();
2628 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2629 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
2630 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2631 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
2632 VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
2633 continue;
2634
2635 VPValue *ResultVPV = R.getVPSingleValue();
2636 auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
2637 unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
2638 if (!NewResSizeInBits)
2639 continue;
2640
2641 // If the value wasn't vectorized, we must maintain the original scalar
2642 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2643 // skip casts which do not need to be handled explicitly here, as
2644 // redundant casts will be removed during recipe simplification.
2645 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
2646 continue;
2647
2648 Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV);
2649 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2650 assert(OldResTy->isIntegerTy() && "only integer types supported");
2651 (void)OldResSizeInBits;
2652
2653 auto *NewResTy = IntegerType::get(C&: Plan.getContext(), NumBits: NewResSizeInBits);
2654
2655 // Any wrapping introduced by shrinking this operation shouldn't be
2656 // considered undefined behavior. So, we can't unconditionally copy
2657 // arithmetic wrapping flags to VPW.
2658 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
2659 VPW->dropPoisonGeneratingFlags();
2660
2661 if (OldResSizeInBits != NewResSizeInBits &&
2662 !match(V: &R, P: m_ICmp(Op0: m_VPValue(), Op1: m_VPValue()))) {
2663 // Extend result to original width.
2664 auto *Ext = new VPWidenCastRecipe(
2665 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2666 VPIRFlags::getDefaultFlags(Opcode: Instruction::ZExt));
2667 Ext->insertAfter(InsertPos: &R);
2668 ResultVPV->replaceAllUsesWith(New: Ext);
2669 Ext->setOperand(I: 0, New: ResultVPV);
2670 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2671 } else {
2672 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2673 "Only ICmps should not need extending the result.");
2674 }
2675
2676 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2677 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
2678 continue;
2679
2680 // Shrink operands by introducing truncates as needed.
2681 unsigned StartIdx =
2682 match(V: &R, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())) ? 1 : 0;
2683 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2684 auto *Op = R.getOperand(N: Idx);
2685 unsigned OpSizeInBits =
2686 TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits();
2687 if (OpSizeInBits == NewResSizeInBits)
2688 continue;
2689 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2690 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op);
2691 if (!IterIsEmpty) {
2692 R.setOperand(I: Idx, New: ProcessedIter->second);
2693 continue;
2694 }
2695
2696 VPBuilder Builder;
2697 if (isa<VPIRValue>(Val: Op))
2698 Builder.setInsertPoint(PH);
2699 else
2700 Builder.setInsertPoint(&R);
2701 VPWidenCastRecipe *NewOp =
2702 Builder.createWidenCast(Opcode: Instruction::Trunc, Op, ResultTy: NewResTy);
2703 ProcessedIter->second = NewOp;
2704 R.setOperand(I: Idx, New: NewOp);
2705 }
2707 }
2708 }
2709}
2710
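/// Remove BranchOnCond terminators with a constant condition, together with
/// the edge to the never-taken successor (whose phis drop the corresponding
/// incoming value). Schematically:
///
///   vpbb:
///     EMIT branch-on-cond true
///   Successor(s): succ.0, succ.1
///
/// becomes
///
///   vpbb:
///   Successor(s): succ.0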
2711void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2712 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2713 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
2714 VPValue *Cond;
2715 // Skip blocks that are not terminated by BranchOnCond.
2716 if (VPBB->empty() || !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
2717 continue;
2718
2719 assert(VPBB->getNumSuccessors() == 2 &&
2720 "Two successors expected for BranchOnCond");
2721 unsigned RemovedIdx;
2722 if (match(V: Cond, P: m_True()))
2723 RemovedIdx = 1;
2724 else if (match(V: Cond, P: m_False()))
2725 RemovedIdx = 0;
2726 else
2727 continue;
2728
2729 VPBasicBlock *RemovedSucc =
2730 cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
2731 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2732 "There must be a single edge between VPBB and its successor");
2733 // Values coming from VPBB into phi recipes of RemovedSucc are removed
2734 // from these recipes.
2735 for (VPRecipeBase &R : RemovedSucc->phis())
2736 cast<VPPhiAccessors>(Val: &R)->removeIncomingValueFor(IncomingBlock: VPBB);
2737
2738 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2739 // automatically on VPlan destruction if it becomes unreachable.
2740 VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
2741 VPBB->back().eraseFromParent();
2742 }
2743}
2744
2745void VPlanTransforms::optimize(VPlan &Plan) {
2746 RUN_VPLAN_PASS(removeRedundantCanonicalIVs, Plan);
2747 RUN_VPLAN_PASS(removeRedundantInductionCasts, Plan);
2748
2749 RUN_VPLAN_PASS(simplifyRecipes, Plan);
2750 RUN_VPLAN_PASS(removeDeadRecipes, Plan);
2751 RUN_VPLAN_PASS(simplifyBlends, Plan);
2752 RUN_VPLAN_PASS(legalizeAndOptimizeInductions, Plan);
2753 RUN_VPLAN_PASS(narrowToSingleScalarRecipes, Plan);
2754 RUN_VPLAN_PASS(removeRedundantExpandSCEVRecipes, Plan);
2755 RUN_VPLAN_PASS(simplifyRecipes, Plan);
2756 RUN_VPLAN_PASS(removeBranchOnConst, Plan);
2757 RUN_VPLAN_PASS(removeDeadRecipes, Plan);
2758
2759 RUN_VPLAN_PASS(createAndOptimizeReplicateRegions, Plan);
2760 RUN_VPLAN_PASS(hoistInvariantLoads, Plan);
2761 RUN_VPLAN_PASS(mergeBlocksIntoPredecessors, Plan);
2762 RUN_VPLAN_PASS(licm, Plan);
2763}
2764
2765// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2766// the loop terminator with a branch-on-cond recipe with the negated
2767// active-lane-mask as operand. Note that this turns the loop into an
2768// uncountable one. Only the existing terminator is replaced, all other existing
2769// recipes/users remain unchanged, except for poison-generating flags being
2770// dropped from the canonical IV increment. Return the created
2771// VPActiveLaneMaskPHIRecipe.
2772//
2773// The function uses the following definitions:
2774//
2775// %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
2776// calculate-trip-count-minus-VF (original TC) : original TC
2777// %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
2778// CanonicalIVPhi : CanonicalIVIncrement
2779// %StartV is the canonical induction start value.
2780//
2781// The function adds the following recipes:
2782//
2783// vector.ph:
2784// %TripCount = calculate-trip-count-minus-VF (original TC)
2785// [if DataAndControlFlowWithoutRuntimeCheck]
2786// %EntryInc = canonical-iv-increment-for-part %StartV
2787// %EntryALM = active-lane-mask %EntryInc, %TripCount
2788//
2789// vector.body:
2790// ...
2791// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2792// ...
2793// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2794// %ALM = active-lane-mask %InLoopInc, %TripCount
2795// %Negated = Not %ALM
2796// branch-on-cond %Negated
2797//
2798static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2799 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
2800 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2801 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2802 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2803 VPValue *StartV = CanonicalIVPHI->getStartValue();
2804
2805 auto *CanonicalIVIncrement =
2806 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
2807 // TODO: Check if dropping the flags is needed if
2808 // !DataAndControlFlowWithoutRuntimeCheck.
2809 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2810 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2811 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2812 // we have to take unrolling into account. Each part needs to start at
2813 // Part * VF.
2814 auto *VecPreheader = Plan.getVectorPreheader();
2815 VPBuilder Builder(VecPreheader);
2816
2817 // Create the ActiveLaneMask instruction using the correct start values.
2818 VPValue *TC = Plan.getTripCount();
2819
2820 VPValue *TripCount, *IncrementValue;
2821 if (!DataAndControlFlowWithoutRuntimeCheck) {
2822 // When the loop is guarded by a runtime overflow check for the loop
2823 // induction variable increment by VF, we can increment the value before
2824 // the get.active.lane.mask intrinsic and use the unmodified trip count.
2825 IncrementValue = CanonicalIVIncrement;
2826 TripCount = TC;
2827 } else {
2828 // When avoiding a runtime check, the active.lane.mask inside the loop
2829 // uses a modified trip count and the induction variable increment is
2830 // done after the active.lane.mask intrinsic is called.
2831 IncrementValue = CanonicalIVPHI;
2832 TripCount = Builder.createNaryOp(Opcode: VPInstruction::CalculateTripCountMinusVF,
2833 Operands: {TC}, DL);
2834 }
2835 auto *EntryIncrement = Builder.createOverflowingOp(
2836 Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV}, WrapFlags: {false, false}, DL,
2837 Name: "index.part.next");
2838
2839 // Create the active lane mask instruction in the VPlan preheader.
2840 VPValue *ALMMultiplier =
2841 Plan.getConstantInt(Ty: TopRegion->getCanonicalIVType(), Val: 1);
2842 auto *EntryALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2843 Operands: {EntryIncrement, TC, ALMMultiplier}, DL,
2844 Name: "active.lane.mask.entry");
2845
2846 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2847 // preheader ActiveLaneMask instruction.
2848 auto *LaneMaskPhi =
2849 new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
2850 LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI);
2851
2852 // Create the active lane mask for the next iteration of the loop before the
2853 // original terminator.
2854 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2855 Builder.setInsertPoint(OriginalTerminator);
2856 auto *InLoopIncrement =
2857 Builder.createOverflowingOp(Opcode: VPInstruction::CanonicalIVIncrementForPart,
2858 Operands: {IncrementValue}, WrapFlags: {false, false}, DL);
2859 auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2860 Operands: {InLoopIncrement, TripCount, ALMMultiplier},
2861 DL, Name: "active.lane.mask.next");
2862 LaneMaskPhi->addOperand(Operand: ALM);
2863
2864 // Replace the original terminator with BranchOnCond. We have to invert the
2865 // mask here because a true condition means jumping to the exit block.
2866 auto *NotMask = Builder.createNot(Operand: ALM, DL);
2867 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
2868 OriginalTerminator->eraseFromParent();
2869 return LaneMaskPhi;
2870}
2871
2872/// Find the header mask with the pattern:
2873/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2874/// TODO: Introduce explicit recipe for header-mask instead of searching
2875/// for the header-mask pattern manually.
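/// As a sketch (names are illustrative), the mask matched here looks like:
///
///   EMIT vp<%wide.iv> = WIDEN-CANONICAL-INDUCTION vp<%canonical.iv>
///   EMIT vp<%header.mask> = icmp ule vp<%wide.iv>, vp<%backedge.taken.count>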
2876static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2877 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2878 SmallVector<VPValue *> WideCanonicalIVs;
2879 auto *FoundWidenCanonicalIVUser = find_if(
2880 Range: LoopRegion->getCanonicalIV()->users(), P: IsaPred<VPWidenCanonicalIVRecipe>);
2881 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2882 IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
2883 "Must have at most one VPWideCanonicalIVRecipe");
2884 if (FoundWidenCanonicalIVUser !=
2885 LoopRegion->getCanonicalIV()->users().end()) {
2886 auto *WideCanonicalIV =
2887 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2888 WideCanonicalIVs.push_back(Elt: WideCanonicalIV);
2889 }
2890
2891 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2892 // version of the canonical induction.
2893 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2894 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2895 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
2896 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2897 WideCanonicalIVs.push_back(Elt: WidenOriginalIV);
2898 }
2899
2900 // Walk users of wide canonical IVs and find the single compare of the form
2901 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2902 VPSingleDefRecipe *HeaderMask = nullptr;
2903 for (auto *Wide : WideCanonicalIVs) {
2904 for (VPUser *U : Wide->users()) {
2905 auto *VPI = dyn_cast<VPInstruction>(Val: U);
2906 if (!VPI || !vputils::isHeaderMask(V: VPI, Plan))
2907 continue;
2908
2909 assert(VPI->getOperand(0) == Wide &&
2910 "WidenCanonicalIV must be the first operand of the compare");
2911 assert(!HeaderMask && "Multiple header masks found?");
2912 HeaderMask = VPI;
2913 }
2914 }
2915 return HeaderMask;
2916}
2917
2918void VPlanTransforms::addActiveLaneMask(
2919 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2920 bool DataAndControlFlowWithoutRuntimeCheck) {
2921 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2922 UseActiveLaneMaskForControlFlow) &&
2923 "DataAndControlFlowWithoutRuntimeCheck implies "
2924 "UseActiveLaneMaskForControlFlow");
2925
2926 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2927 auto *FoundWidenCanonicalIVUser = find_if(
2928 Range: LoopRegion->getCanonicalIV()->users(), P: IsaPred<VPWidenCanonicalIVRecipe>);
2929 assert(FoundWidenCanonicalIVUser != LoopRegion->getCanonicalIV()->users().end() &&
2930 "Must have widened canonical IV when tail folding!");
2931 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2932 auto *WideCanonicalIV =
2933 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2934 VPSingleDefRecipe *LaneMask;
2935 if (UseActiveLaneMaskForControlFlow) {
2936 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2937 Plan, DataAndControlFlowWithoutRuntimeCheck);
2938 } else {
2939 VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
2940 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2941 V: ConstantInt::get(Ty: LoopRegion->getCanonicalIVType(), V: 1));
2942 LaneMask =
2943 B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2944 Operands: {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2945 DL: nullptr, Name: "active.lane.mask");
2946 }
2947
2948 // Walk users of WideCanonicalIV and replace the header mask of the form
2949 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2950 // removing the old one to ensure there is always only a single header mask.
2951 HeaderMask->replaceAllUsesWith(New: LaneMask);
2952 HeaderMask->eraseFromParent();
2953}
2954
2955template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2956 Op0_t In;
2957 Op1_t &Out;
2958
2959 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2960
2961 template <typename OpTy> bool match(OpTy *V) const {
2962 if (m_Specific(In).match(V)) {
2963 Out = nullptr;
2964 return true;
2965 }
2966 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2967 }
2968};
2969
2970/// Match the specific mask \p In, or a logical-and of \p In with another mask.
2971/// On a match, \p Out is set to the remaining mask, or to nullptr if the matched value is exactly \p In.
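/// For example, the masked-load case in optimizeMaskToEVL below uses
///
///   m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))
///
/// which matches a load masked solely by HeaderMask (setting Mask to nullptr)
/// as well as a load masked by (logical-and HeaderMask, Mask).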
2972template <typename Op0_t, typename Op1_t>
2973static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2974 Op1_t &Out) {
2975 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2976}
2977
2978/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2979/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2980/// recipe could be created.
2981/// \p HeaderMask Header Mask.
2982/// \p CurRecipe Recipe to be transformed.
2983/// \p TypeInfo VPlan-based type analysis.
2984/// \p EVL The explicit vector length parameter of vector-predication
2985/// intrinsics.
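/// A rough sketch of the main case (recipe spelling is illustrative): a
/// widened load masked only by the header mask,
///
///   WIDEN ir<%l> = load vp<%addr>, vp<%header.mask>
///
/// becomes an EVL-predicated load with no mask,
///
///   WIDEN ir<%l> = vp.load vp<%addr>, vp<%evl>
///
/// while selects on the header mask become vp.merge calls on the EVL.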
2986static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2987 VPRecipeBase &CurRecipe,
2988 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2989 VPlan *Plan = CurRecipe.getParent()->getPlan();
2990 DebugLoc DL = CurRecipe.getDebugLoc();
2991 VPValue *Addr, *Mask, *EndPtr;
2992
2993 // Adjust any end pointers so that they point to the end of the EVL lanes rather than VF lanes.
2994 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2995 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(Val: EndPtr)->clone();
2996 EVLEndPtr->insertBefore(InsertPos: &CurRecipe);
2997 EVLEndPtr->setOperand(I: 1, New: &EVL);
2998 return EVLEndPtr;
2999 };
3000
3001 if (match(V: &CurRecipe,
3002 P: m_MaskedLoad(Addr: m_VPValue(V&: Addr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3003 !cast<VPWidenLoadRecipe>(Val&: CurRecipe).isReverse())
3004 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(Val&: CurRecipe), Addr,
3005 EVL, Mask);
3006
3007 VPValue *ReversedVal;
3008 if (match(V: &CurRecipe, P: m_Reverse(Op0: m_VPValue(V&: ReversedVal))) &&
3009 match(V: ReversedVal,
3010 P: m_MaskedLoad(Addr: m_VPValue(V&: EndPtr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3011 match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
3012 cast<VPWidenLoadRecipe>(Val: ReversedVal)->isReverse()) {
3013 auto *LoadR = new VPWidenLoadEVLRecipe(
3014 *cast<VPWidenLoadRecipe>(Val: ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3015 LoadR->insertBefore(InsertPos: &CurRecipe);
3016 return new VPWidenIntrinsicRecipe(
3017 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3018 TypeInfo.inferScalarType(V: LoadR), {}, {}, DL);
3019 }
3020
3021 VPValue *StoredVal;
3022 if (match(V: &CurRecipe, P: m_MaskedStore(Addr: m_VPValue(V&: Addr), Val: m_VPValue(V&: StoredVal),
3023 Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3024 !cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse())
3025 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe), Addr,
3026 StoredVal, EVL, Mask);
3027
3028 if (match(V: &CurRecipe,
3029 P: m_MaskedStore(Addr: m_VPValue(V&: EndPtr), Val: m_Reverse(Op0: m_VPValue(V&: ReversedVal)),
3030 Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3031 match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
3032 cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse()) {
3033 auto *NewReverse = new VPWidenIntrinsicRecipe(
3034 Intrinsic::experimental_vp_reverse,
3035 {ReversedVal, Plan->getTrue(), &EVL},
3036 TypeInfo.inferScalarType(V: ReversedVal), {}, {}, DL);
3037 NewReverse->insertBefore(InsertPos: &CurRecipe);
3038 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe),
3039 AdjustEndPtr(EndPtr), NewReverse, EVL,
3040 Mask);
3041 }
3042
3043 if (auto *Rdx = dyn_cast<VPReductionRecipe>(Val: &CurRecipe))
3044 if (Rdx->isConditional() &&
3045 match(V: Rdx->getCondOp(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
3046 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3047
3048 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(Val: &CurRecipe))
3049 if (Interleave->getMask() &&
3050 match(V: Interleave->getMask(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
3051 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3052
3053 VPValue *LHS, *RHS;
3054 if (match(V: &CurRecipe,
3055 P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS), Op2: m_VPValue(V&: RHS))))
3056 return new VPWidenIntrinsicRecipe(
3057 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3058 TypeInfo.inferScalarType(V: LHS), {}, {}, DL);
3059
3060 if (match(V: &CurRecipe, P: m_Select(Op0: m_RemoveMask(In: HeaderMask, Out&: Mask), Op1: m_VPValue(V&: LHS),
3061 Op2: m_VPValue(V&: RHS))))
3062 return new VPWidenIntrinsicRecipe(
3063 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3064 TypeInfo.inferScalarType(V: LHS), {}, {}, DL);
3065
3066 if (match(V: &CurRecipe, P: m_LastActiveLane(Op0: m_Specific(VPV: HeaderMask)))) {
3067 Type *Ty = TypeInfo.inferScalarType(V: CurRecipe.getVPSingleValue());
3068 VPValue *ZExt =
3069 VPBuilder(&CurRecipe).createScalarCast(Opcode: Instruction::ZExt, Op: &EVL, ResultTy: Ty, DL);
3070 return new VPInstruction(
3071 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, Val: 1)},
3072 VPIRFlags::getDefaultFlags(Opcode: Instruction::Sub), {}, DL);
3073 }
3074
3075 return nullptr;
3076}
3077
3078/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3079/// The transforms here need to preserve the original semantics.
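/// The EVL-based header mask searched for below has roughly the shape:
///
///   EMIT vp<%evl> = EXPLICIT-VECTOR-LENGTH vp<%avl>
///   ...
///   EMIT vp<%header.mask> = icmp ult step-vector, vp<%evl>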
3080void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
3081 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3082 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3083 for (VPRecipeBase &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
3084 if (match(V: &R, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_ULT, Op0: m_StepVector(),
3085 Op1: m_VPValue(V&: EVL))) &&
3086 match(V: EVL, P: m_EVL(Op0: m_VPValue()))) {
3087 HeaderMask = R.getVPSingleValue();
3088 break;
3089 }
3090 }
3091 if (!HeaderMask)
3092 return;
3093
3094 VPTypeAnalysis TypeInfo(Plan);
3095 SmallVector<VPRecipeBase *> OldRecipes;
3096 for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
3097 VPRecipeBase *R = cast<VPRecipeBase>(Val: U);
3098 if (auto *NewR = optimizeMaskToEVL(HeaderMask, CurRecipe&: *R, TypeInfo, EVL&: *EVL)) {
3099 NewR->insertBefore(InsertPos: R);
3100 for (auto [Old, New] :
3101 zip_equal(t: R->definedValues(), u: NewR->definedValues()))
3102 Old->replaceAllUsesWith(New);
3103 OldRecipes.push_back(Elt: R);
3104 }
3105 }
3106 // Erase old recipes at the end so we don't invalidate TypeInfo.
3107 for (VPRecipeBase *R : reverse(C&: OldRecipes)) {
3108 SmallVector<VPValue *> PossiblyDead(R->operands());
3109 R->eraseFromParent();
3110 for (VPValue *Op : PossiblyDead)
3111 recursivelyDeleteDeadRecipes(V: Op);
3112 }
3113}
3114
3115/// After replacing the canonical IV with an EVL-based IV, fix up recipes that
3116/// use VF to use the EVL instead, to avoid incorrect updates on the
3117/// penultimate iteration.
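/// For plans containing a fixed-order recurrence, the splice of the recurrence
/// is additionally rewritten using the previous and current EVL. A sketch
/// (names are illustrative):
///
///   EMIT vp<%s> = first-order splice ir<%prev>, ir<%cur>
///
/// becomes
///
///   EMIT-SCALAR vp<%prev.evl> = phi [ vp<%max.evl>, ph ], [ vp<%evl>, body ]
///   WIDEN-INTRINSIC vp<%s> = call llvm.experimental.vp.splice(ir<%prev>,
///                                ir<%cur>, -1, true, vp<%prev.evl>, vp<%evl>)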
3118static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3119 VPTypeAnalysis TypeInfo(Plan);
3120 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3121 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3122
3123 assert(all_of(Plan.getVF().users(),
3124 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3125 VPWidenIntOrFpInductionRecipe>) &&
3126 "User of VF that we can't transform to EVL.");
3127 Plan.getVF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
3128 return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(Val: U);
3129 });
3130
3131 assert(all_of(Plan.getVFxUF().users(),
3132 [&LoopRegion, &Plan](VPUser *U) {
3133 return match(U,
3134 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3135 m_Specific(&Plan.getVFxUF()))) ||
3136 isa<VPWidenPointerInductionRecipe>(U);
3137 }) &&
3138 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3139 "increment of the canonical induction.");
3140 Plan.getVFxUF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
3141 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
3142 // canonical induction must not be updated.
3143 return isa<VPWidenPointerInductionRecipe>(Val: U);
3144 });
3145
3146 // Create a scalar phi to track the previous EVL if the plan contains any
3147 // fixed-order recurrences.
3148 bool ContainsFORs =
3149 any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
3150 if (ContainsFORs) {
3151 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3152 VPValue *MaxEVL = &Plan.getVF();
3153 // Emit a scalar cast in the preheader if VF is not a 32-bit integer.
3154 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3155 MaxEVL = Builder.createScalarZExtOrTrunc(
3156 Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()),
3157 SrcTy: TypeInfo.inferScalarType(V: MaxEVL), DL: DebugLoc::getUnknown());
3158
3159 Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
3160 VPValue *PrevEVL = Builder.createScalarPhi(
3161 IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc::getUnknown(), Name: "prev.evl");
3162
3163 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3164 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
3165 for (VPRecipeBase &R : *VPBB) {
3166 VPValue *V1, *V2;
3167 if (!match(V: &R,
3168 P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
3169 Ops: m_VPValue(V&: V1), Ops: m_VPValue(V&: V2))))
3170 continue;
3171 VPValue *Imm = Plan.getOrAddLiveIn(
3172 V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Plan.getContext()), V: -1));
3173 VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
3174 Intrinsic::experimental_vp_splice,
3175 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3176 TypeInfo.inferScalarType(V: R.getVPSingleValue()), {}, {},
3177 R.getDebugLoc());
3178 VPSplice->insertBefore(InsertPos: &R);
3179 R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice);
3180 }
3181 }
3182 }
3183
3184 VPValue *HeaderMask = findHeaderMask(Plan);
3185 if (!HeaderMask)
3186 return;
3187
3188 // Replace header masks with a mask equivalent to predicating by EVL:
3189 //
3190 // icmp ule widen-canonical-iv backedge-taken-count
3191 // ->
3192 // icmp ult step-vector, EVL
3193 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3194 VPBuilder Builder(EVLR->getParent(), std::next(x: EVLR->getIterator()));
3195 Type *EVLType = TypeInfo.inferScalarType(V: &EVL);
3196 VPValue *EVLMask = Builder.createICmp(
3197 Pred: CmpInst::ICMP_ULT,
3198 A: Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: EVLType), B: &EVL);
3199 HeaderMask->replaceAllUsesWith(New: EVLMask);
3200}
3201
3202/// Convert a tail-folded vector loop region to step by
3203/// VPInstruction::ExplicitVectorLength elements per iteration instead of by
3204/// VF elements.
3205///
3206/// - Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3207/// replace all uses of the VPCanonicalIVPHIRecipe, except its canonical IV
3208/// increment, with the VPEVLBasedIVPHIRecipe. After this transformation,
3209/// the VPCanonicalIVPHIRecipe is used only for counting loop iterations.
3211///
3212/// - The header mask is replaced with a header mask based on the EVL.
3213///
3214/// - Plans with FORs have a new phi added to keep track of the EVL of the
3215/// previous iteration, and their FirstOrderRecurrenceSplice instructions are
3216/// replaced with calls to @llvm.experimental.vp.splice.
3217///
3218/// The function uses the following definitions:
3219/// %StartV is the canonical induction start value.
3220///
3221/// The function adds the following recipes:
3222///
3223/// vector.ph:
3224/// ...
3225///
3226/// vector.body:
3227/// ...
3228/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3229/// [ %NextEVLIV, %vector.body ]
3230/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3231/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3232/// ...
3233/// %OpEVL = cast i32 %VPEVL to IVSize
3234/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3235/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3236/// ...
3237///
3238/// If MaxSafeElements is provided, the function adds the following recipes:
3239/// vector.ph:
3240/// ...
3241///
3242/// vector.body:
3243/// ...
3244/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3245/// [ %NextEVLIV, %vector.body ]
3246/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3247/// %cmp = cmp ult %AVL, MaxSafeElements
3248/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3249/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3250/// ...
3251/// %OpEVL = cast i32 %VPEVL to IVSize
3252/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3253/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3254/// ...
3255///
3256void VPlanTransforms::addExplicitVectorLength(
3257 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3258 if (Plan.hasScalarVFOnly())
3259 return;
3260 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3261 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3262
3263 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3264 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3265 VPValue *StartV = CanonicalIVPHI->getStartValue();
3266
3267 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3268 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3269 EVLPhi->insertAfter(InsertPos: CanonicalIVPHI);
3270 VPBuilder Builder(Header, Header->getFirstNonPhi());
3271 // Create the AVL (application vector length), starting from TC -> 0 in steps
3272 // of EVL.
3273 VPPhi *AVLPhi = Builder.createScalarPhi(
3274 IncomingValues: {Plan.getTripCount()}, DL: DebugLoc::getCompilerGenerated(), Name: "avl");
3275 VPValue *AVL = AVLPhi;
3276
3277 if (MaxSafeElements) {
3278 // Support for MaxSafeDist for correct loop emission.
3279 VPValue *AVLSafe = Plan.getConstantInt(Ty: CanIVTy, Val: *MaxSafeElements);
3280 VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
3281 AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc::getUnknown(),
3282 Name: "safe_avl");
3283 }
3284 auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
3285 DL: DebugLoc::getUnknown(), Name: "evl");
3286
3287 auto *CanonicalIVIncrement =
3288 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
3289 Builder.setInsertPoint(CanonicalIVIncrement);
3290 VPValue *OpVPEVL = VPEVL;
3291
3292 auto *I32Ty = Type::getInt32Ty(C&: Plan.getContext());
3293 OpVPEVL = Builder.createScalarZExtOrTrunc(
3294 Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc());
3295
3296 auto *NextEVLIV = Builder.createAdd(
3297 LHS: OpVPEVL, RHS: EVLPhi, DL: CanonicalIVIncrement->getDebugLoc(), Name: "index.evl.next",
3298 WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(),
3299 CanonicalIVIncrement->hasNoSignedWrap()});
3300 EVLPhi->addOperand(Operand: NextEVLIV);
3301
3302 VPValue *NextAVL =
3303 Builder.createSub(LHS: AVLPhi, RHS: OpVPEVL, DL: DebugLoc::getCompilerGenerated(),
3304 Name: "avl.next", WrapFlags: {/*NUW=*/true, /*NSW=*/false});
3305 AVLPhi->addOperand(Operand: NextAVL);
3306
3307 fixupVFUsersForEVL(Plan, EVL&: *VPEVL);
3308 removeDeadRecipes(Plan);
3309
3310 // Replace all uses of VPCanonicalIVPHIRecipe by
3311 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3312 CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi);
3313 CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI);
3314 // TODO: support unroll factor > 1.
3315 Plan.setUF(1);
3316}
3317
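/// Canonicalize an EVL tail-folded loop late in the pipeline: lower the
/// VPEVLBasedIVPHIRecipe to a plain scalar phi and remove the now-redundant
/// canonical IV and its increment, rewiring their users to the EVL-based
/// increment. A sketch of the loop header afterwards (names are illustrative):
///
///   EMIT-SCALAR vp<%evl.based.iv> = phi [ ir<0>, vector.ph ],
///                                       [ vp<%index.evl.next>, vector.body ]
///   ; the canonical IV phi and its increment have been erased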
3318void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
3319 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3320 // There should be only one EVL PHI in the entire plan.
3321 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3322
3323 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3324 Range: vp_depth_first_shallow(G: Plan.getEntry())))
3325 for (VPRecipeBase &R : VPBB->phis())
3326 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(Val: &R)) {
3327 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3328 EVLPhi = PhiR;
3329 }
3330
3331 // Early return if no EVL PHI is found.
3332 if (!EVLPhi)
3333 return;
3334
3335 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3336 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3337
3338 // Convert EVLPhi to concrete recipe.
3339 auto *ScalarR =
3340 VPBuilder(EVLPhi).createScalarPhi(IncomingValues: {EVLPhi->getStartValue(), EVLIncrement},
3341 DL: EVLPhi->getDebugLoc(), Name: "evl.based.iv");
3342 EVLPhi->replaceAllUsesWith(New: ScalarR);
3343 EVLPhi->eraseFromParent();
3344
3345 // Replace CanonicalIVInc with EVL-PHI increment.
3346 auto *CanonicalIV = cast<VPPhi>(Val: &*HeaderVPBB->begin());
3347 VPValue *Backedge = CanonicalIV->getIncomingValue(Idx: 1);
3348 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3349 m_Specific(&Plan.getVFxUF()))) &&
3350 "Unexpected canonical iv");
3351 Backedge->replaceAllUsesWith(New: EVLIncrement);
3352
3353 // Remove unused phi and increment.
3354 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3355 CanonicalIVIncrement->eraseFromParent();
3356 CanonicalIV->eraseFromParent();
3357}
3358
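/// Rewrite the latch exit condition of an EVL tail-folded loop to test the
/// remaining AVL instead of the canonical IV increment, roughly:
///
///   EMIT branch-on-cond (icmp eq vp<%index.next>, vp<%vector.trip.count>)
///
/// becomes
///
///   EMIT branch-on-cond (icmp eq vp<%avl.next>, ir<0>)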
3359void VPlanTransforms::convertEVLExitCond(VPlan &Plan) {
3360 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3361 // The canonical IV may not exist at this stage.
3362 if (!LoopRegion ||
3363 !isa<VPCanonicalIVPHIRecipe>(Val: LoopRegion->getEntryBasicBlock()->front()))
3364 return;
3365 VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3366 if (std::next(x: CanIV->getIterator()) == CanIV->getParent()->end())
3367 return;
3368 // The EVL IV is always immediately after the canonical IV.
3369 auto *EVLPhi =
3370 dyn_cast_or_null<VPEVLBasedIVPHIRecipe>(Val: std::next(x: CanIV->getIterator()));
3371 if (!EVLPhi)
3372 return;
3373
3374 // Bail if not an EVL tail folded loop.
3375 VPValue *AVL;
3376 if (!match(V: EVLPhi->getBackedgeValue(),
3377 P: m_c_Add(Op0: m_ZExtOrSelf(Op0: m_EVL(Op0: m_VPValue(V&: AVL))), Op1: m_Specific(VPV: EVLPhi))))
3378 return;
3379
3380 // The AVL may be capped to a safe distance.
3381 VPValue *SafeAVL;
3382 if (match(V: AVL, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: SafeAVL), Op2: m_VPValue())))
3383 AVL = SafeAVL;
3384
3385 VPValue *AVLNext;
3386 [[maybe_unused]] bool FoundAVLNext =
3387 match(V: AVL, P: m_VPInstruction<Instruction::PHI>(
3388 Ops: m_Specific(VPV: Plan.getTripCount()), Ops: m_VPValue(V&: AVLNext)));
3389 assert(FoundAVLNext && "Didn't find AVL backedge?");
3390
3391 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3392 auto *LatchBr = cast<VPInstruction>(Val: Latch->getTerminator());
3393 if (match(V: LatchBr, P: m_BranchOnCond(Op0: m_True())))
3394 return;
3395
3396 assert(
3397 match(LatchBr,
3398 m_BranchOnCond(m_SpecificCmp(
3399 CmpInst::ICMP_EQ, m_Specific(CanIV->getIncomingValue(1)),
3400 m_Specific(&Plan.getVectorTripCount())))) &&
3401 "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3402 "trip count");
3403
3404 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(V: AVLNext);
3405 VPBuilder Builder(LatchBr);
3406 LatchBr->setOperand(I: 0, New: Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: AVLNext,
3407 B: Plan.getConstantInt(Ty: AVLTy, Val: 0)));
3408}
3409
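/// As a sketch, if predicated SCEV guarantees that a symbolic stride
/// ir<%stride> is a constant, e.g. 1, then inside the vector loop region (and
/// its preheader):
///
///   live-in ir<%stride>                  -> constant 1
///   live-in ir<%stride.ext> (sext/zext)  -> constant 1 of the extended type
///
/// SCEV expansions in the entry block are also rewritten with the versioned
/// stride, so such accesses can later be treated as consecutive.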
3410void VPlanTransforms::replaceSymbolicStrides(
3411 VPlan &Plan, PredicatedScalarEvolution &PSE,
3412 const DenseMap<Value *, const SCEV *> &StridesMap) {
3413 // Replace VPValues for known constant strides guaranteed by predicated
3414 // scalar evolution.
3415 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3416 auto *R = cast<VPRecipeBase>(Val: &U);
3417 return R->getRegion() ||
3418 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3419 };
3420 ValueToSCEVMapTy RewriteMap;
3421 for (const SCEV *Stride : StridesMap.values()) {
3422 using namespace SCEVPatternMatch;
3423 auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
3424 const APInt *StrideConst;
3425 if (!match(S: PSE.getSCEV(V: StrideV), P: m_scev_APInt(C&: StrideConst)))
3426 // Only handle constant strides for now.
3427 continue;
3428
3429 auto *CI = Plan.getConstantInt(Val: *StrideConst);
3430 if (VPValue *StrideVPV = Plan.getLiveIn(V: StrideV))
3431 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
3432
3433 // The versioned value may not be used in the loop directly but through a
3434 // sext/zext. Add new live-ins in those cases.
3435 for (Value *U : StrideV->users()) {
3436 if (!isa<SExtInst, ZExtInst>(Val: U))
3437 continue;
3438 VPValue *StrideVPV = Plan.getLiveIn(V: U);
3439 if (!StrideVPV)
3440 continue;
3441 unsigned BW = U->getType()->getScalarSizeInBits();
3442 APInt C =
3443 isa<SExtInst>(Val: U) ? StrideConst->sext(width: BW) : StrideConst->zext(width: BW);
3444 VPValue *CI = Plan.getConstantInt(Val: C);
3445 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
3446 }
3447 RewriteMap[StrideV] = PSE.getSCEV(V: StrideV);
3448 }
3449
3450 for (VPRecipeBase &R : *Plan.getEntry()) {
3451 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
3452 if (!ExpSCEV)
3453 continue;
3454 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3455 auto *NewSCEV =
3456 SCEVParameterRewriter::rewrite(Scev: ScevExpr, SE&: *PSE.getSE(), Map&: RewriteMap);
3457 if (NewSCEV != ScevExpr) {
3458 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: NewSCEV);
3459 ExpSCEV->replaceAllUsesWith(New: NewExp);
3460 if (Plan.getTripCount() == ExpSCEV)
3461 Plan.resetTripCount(NewTripCount: NewExp);
3462 }
3463 }
3464}
3465
3466void VPlanTransforms::dropPoisonGeneratingRecipes(
3467 VPlan &Plan,
3468 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3469 // Collect recipes in the backward slice of `Root` that may generate a poison
3470 // value that is used after vectorization.
3471 SmallPtrSet<VPRecipeBase *, 16> Visited;
3472 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3473 SmallVector<VPRecipeBase *, 16> Worklist;
3474 Worklist.push_back(Elt: Root);
3475
3476 // Traverse the backward slice of Root through its use-def chain.
3477 while (!Worklist.empty()) {
3478 VPRecipeBase *CurRec = Worklist.pop_back_val();
3479
3480 if (!Visited.insert(Ptr: CurRec).second)
3481 continue;
3482
3483 // Prune search if we find another recipe generating a widen memory
3484 // instruction. Widen memory instructions involved in address computation
3485 // will lead to gather/scatter instructions, which don't need to be
3486 // handled.
3487 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3488 VPHeaderPHIRecipe>(Val: CurRec))
3489 continue;
3490
3491 // This recipe contributes to the address computation of a widen
3492 // load/store. If the underlying instruction has poison-generating flags,
3493 // drop them directly.
3494 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
3495 VPValue *A, *B;
3496 // Dropping disjoint from an OR may yield incorrect results, as some
3497 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3498 // for dependence analysis). Instead, replace it with an equivalent Add.
3499 // This is possible as all users of the disjoint OR only access lanes
3500 // where the operands are disjoint or poison otherwise.
3501 if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
3502 RecWithFlags->isDisjoint()) {
3503 VPBuilder Builder(RecWithFlags);
3504 VPInstruction *New =
3505 Builder.createAdd(LHS: A, RHS: B, DL: RecWithFlags->getDebugLoc());
3506 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3507 RecWithFlags->replaceAllUsesWith(New);
3508 RecWithFlags->eraseFromParent();
3509 CurRec = New;
3510 } else
3511 RecWithFlags->dropPoisonGeneratingFlags();
3512 } else {
3513 Instruction *Instr = dyn_cast_or_null<Instruction>(
3514 Val: CurRec->getVPSingleValue()->getUnderlyingValue());
3515 (void)Instr;
3516 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3517 "found instruction with poison generating flags not covered by "
3518 "VPRecipeWithIRFlags");
3519 }
3520
3521 // Add new definitions to the worklist.
3522 for (VPValue *Operand : CurRec->operands())
3523 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3524 Worklist.push_back(Elt: OpDef);
3525 }
3526 });
3527
3528 // Traverse all the recipes in the VPlan and collect the poison-generating
3529 // recipes in the backward slice starting at the address of a
3530 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3531 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
3532 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
3533 for (VPRecipeBase &Recipe : *VPBB) {
3534 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
3535 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3536 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3537 if (AddrDef && WidenRec->isConsecutive() &&
3538 BlockNeedsPredication(UnderlyingInstr.getParent()))
3539 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3540 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
3541 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3542 if (AddrDef) {
3543 // Check if any member of the interleave group needs predication.
3544 const InterleaveGroup<Instruction> *InterGroup =
3545 InterleaveRec->getInterleaveGroup();
3546 bool NeedPredication = false;
3547 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3548 I < NumMembers; ++I) {
3549 Instruction *Member = InterGroup->getMember(Index: I);
3550 if (Member)
3551 NeedPredication |= BlockNeedsPredication(Member->getParent());
3552 }
3553
3554 if (NeedPredication)
3555 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3556 }
3557 }
3558 }
3559 }
3560}
3561
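/// As a rough sketch, an interleave group formed from two adjacent loads
///
///   WIDEN ir<%a> = load vp<%addr.a>
///   WIDEN ir<%b> = load vp<%addr.b>
///
/// is replaced by a single recipe at the group's insert position, whose
/// results take over the uses of the original members:
///
///   INTERLEAVE-GROUP with factor 2 at vp<%addr.a>
///     ir<%a> = load from index 0
///     ir<%b> = load from index 1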
3562void VPlanTransforms::createInterleaveGroups(
3563 VPlan &Plan,
3564 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3565 &InterleaveGroups,
3566 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3567 if (InterleaveGroups.empty())
3568 return;
3569
3570 // Interleave memory: for each Interleave Group we marked earlier as relevant
3571 // for this VPlan, replace the Recipes widening its memory instructions with a
3572 // single VPInterleaveRecipe at its insertion point.
3573 VPDominatorTree VPDT(Plan);
3574 for (const auto *IG : InterleaveGroups) {
3575 auto *Start =
3576 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0)));
3577 VPIRMetadata InterleaveMD(*Start);
3578 SmallVector<VPValue *, 4> StoredValues;
3579 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: Start))
3580 StoredValues.push_back(Elt: StoreR->getStoredValue());
3581 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3582 Instruction *MemberI = IG->getMember(Index: I);
3583 if (!MemberI)
3584 continue;
3585 VPWidenMemoryRecipe *MemoryR =
3586 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: MemberI));
3587 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: MemoryR))
3588 StoredValues.push_back(Elt: StoreR->getStoredValue());
3589 InterleaveMD.intersect(MD: *MemoryR);
3590 }
3591
3592 bool NeedsMaskForGaps =
3593 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3594 (!StoredValues.empty() && !IG->isFull());
3595
3596 Instruction *IRInsertPos = IG->getInsertPos();
3597 auto *InsertPos =
3598 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos));
3599
3600 GEPNoWrapFlags NW = GEPNoWrapFlags::none();
3601 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3602 Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
3603 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3604
3605 // Get or create the start address for the interleave group.
3606 VPValue *Addr = Start->getAddr();
3607 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3608 if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) {
3609 // We cannot re-use the address of member zero because it does not
3610 // dominate the insert position. Instead, use the address of the insert
3611 // position and create a PtrAdd adjusting it to the address of member
3612 // zero.
3613 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3614 // InsertPos or sink loads above zero members to join it.
3615 assert(IG->getIndex(IRInsertPos) != 0 &&
3616 "index of insert position shouldn't be zero");
3617 auto &DL = IRInsertPos->getDataLayout();
3618 APInt Offset(32,
3619 DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
3620 IG->getIndex(Instr: IRInsertPos),
3621 /*IsSigned=*/true);
3622 VPValue *OffsetVPV = Plan.getConstantInt(Val: -Offset);
3623 VPBuilder B(InsertPos);
3624 Addr = B.createNoWrapPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV, GEPFlags: NW);
3625 }
3626 // If the group is reverse, adjust the index to refer to the last vector
3627 // lane instead of the first. We adjust the index from the first vector
3628 // lane, rather than directly getting the pointer for lane VF - 1, because
3629 // the pointer operand of the interleaved access is supposed to be uniform.
3630 if (IG->isReverse()) {
3631 auto *ReversePtr = new VPVectorEndPointerRecipe(
3632 Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos),
3633 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3634 ReversePtr->insertBefore(InsertPos);
3635 Addr = ReversePtr;
3636 }
3637 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3638 InsertPos->getMask(), NeedsMaskForGaps,
3639 InterleaveMD, InsertPos->getDebugLoc());
3640 VPIG->insertBefore(InsertPos);
3641
3642 unsigned J = 0;
3643 for (unsigned i = 0; i < IG->getFactor(); ++i)
3644 if (Instruction *Member = IG->getMember(Index: i)) {
3645 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
3646 if (!Member->getType()->isVoidTy()) {
3647 VPValue *OriginalV = MemberR->getVPSingleValue();
3648 OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
3649 J++;
3650 }
3651 MemberR->eraseFromParent();
3652 }
3653 }
3654}
3655
3656/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3657/// value, phi and backedge value. In the following example:
3658///
3659/// vector.ph:
3660/// Successor(s): vector loop
3661///
3662/// <x1> vector loop: {
3663/// vector.body:
3664/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3665/// ...
3666/// EMIT branch-on-count ...
3667/// No successors
3668/// }
3669///
3670/// WIDEN-INDUCTION will get expanded to:
3671///
3672/// vector.ph:
3673/// ...
3674/// vp<%induction.start> = ...
3675/// vp<%induction.increment> = ...
3676///
3677/// Successor(s): vector loop
3678///
3679/// <x1> vector loop: {
3680/// vector.body:
3681/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3682/// ...
3683/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3684/// EMIT branch-on-count ...
3685/// No successors
3686/// }
3687static void
3688expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3689 VPTypeAnalysis &TypeInfo) {
3690 VPlan *Plan = WidenIVR->getParent()->getPlan();
3691 VPValue *Start = WidenIVR->getStartValue();
3692 VPValue *Step = WidenIVR->getStepValue();
3693 VPValue *VF = WidenIVR->getVFValue();
3694 DebugLoc DL = WidenIVR->getDebugLoc();
3695
3696 // The value from the original loop to which we are mapping the new induction
3697 // variable.
3698 Type *Ty = TypeInfo.inferScalarType(V: WidenIVR);
3699
3700 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3701 Instruction::BinaryOps AddOp;
3702 Instruction::BinaryOps MulOp;
3703 VPIRFlags Flags = *WidenIVR;
3704 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3705 AddOp = Instruction::Add;
3706 MulOp = Instruction::Mul;
3707 } else {
3708 AddOp = ID.getInductionOpcode();
3709 MulOp = Instruction::FMul;
3710 }
3711
3712 // If the phi is truncated, truncate the start and step values.
3713 VPBuilder Builder(Plan->getVectorPreheader());
3714 Type *StepTy = TypeInfo.inferScalarType(V: Step);
3715 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3716 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3717 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL);
3718 Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL);
3719 // Truncation doesn't preserve WrapFlags.
3720 Flags.dropPoisonGeneratingFlags();
3721 StepTy = Ty;
3722 }
3723
3724 // Construct the initial value of the vector IV in the vector loop preheader.
3725 Type *IVIntTy =
3726 IntegerType::get(C&: Plan->getContext(), NumBits: StepTy->getScalarSizeInBits());
3727 VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy);
3728 if (StepTy->isFloatingPointTy())
3729 Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy);
3730
3731 VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start);
3732 VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);
3733
3734 Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags);
3735 Init = Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags,
3736 DL: DebugLoc::getUnknown(), Name: "induction");
3737
3738 // Create the widened phi of the vector IV.
3739 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3740 WidenIVR->getDebugLoc(), "vec.ind");
3741 WidePHI->insertBefore(InsertPos: WidenIVR);
3742
3743 // Create the backedge value for the vector IV.
3744 VPValue *Inc;
3745 VPValue *Prev;
3746 // If unrolled, use the increment and prev value from the operands.
3747 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3748 Inc = SplatVF;
3749 Prev = WidenIVR->getLastUnrolledPartOperand();
3750 } else {
3751 if (VPRecipeBase *R = VF->getDefiningRecipe())
3752 Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
3753 // Multiply the vectorization factor by the step using integer or
3754 // floating-point arithmetic as appropriate.
3755 if (StepTy->isFloatingPointTy())
3756 VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy,
3757 DL);
3758 else
3759 VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy,
3760 SrcTy: TypeInfo.inferScalarType(V: VF), DL);
3761
3762 Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags);
3763 Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc);
3764 Prev = WidePHI;
3765 }
3766
3767 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3768 Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
3769 auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags,
3770 DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next");
3771
3772 WidePHI->addOperand(Operand: Next);
3773
3774 WidenIVR->replaceAllUsesWith(New: WidePHI);
3775}
3776
3777/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3778/// initial value, phi and backedge value. In the following example:
3779///
3780/// <x1> vector loop: {
3781/// vector.body:
3782/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3783/// ...
3784/// EMIT branch-on-count ...
3785/// }
3786///
3787/// WIDEN-POINTER-INDUCTION will get expanded to:
3788///
3789/// <x1> vector loop: {
3790/// vector.body:
3791/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3792/// EMIT %mul = mul %stepvector, %step
3793/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3794/// ...
3795/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3796/// EMIT branch-on-count ...
3797/// }
3798static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3799 VPTypeAnalysis &TypeInfo) {
3800 VPlan *Plan = R->getParent()->getPlan();
3801 VPValue *Start = R->getStartValue();
3802 VPValue *Step = R->getStepValue();
3803 VPValue *VF = R->getVFValue();
3804
3805 assert(R->getInductionDescriptor().getKind() ==
3806 InductionDescriptor::IK_PtrInduction &&
3807 "Not a pointer induction according to InductionDescriptor!");
3808 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3809 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3810 "Recipe should have been replaced");
3811
3812 VPBuilder Builder(R);
3813 DebugLoc DL = R->getDebugLoc();
3814
3815 // Build a scalar pointer phi.
3816 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(IncomingValues: Start, DL, Name: "pointer.phi");
3817
3818 // Create actual address geps that use the pointer phi as base and a
3819 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3820 Builder.setInsertPoint(TheBB: R->getParent(), IP: R->getParent()->getFirstNonPhi());
3821 Type *StepTy = TypeInfo.inferScalarType(V: Step);
3822 VPValue *Offset = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: StepTy);
3823 Offset = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Offset, Step});
3824 VPValue *PtrAdd =
3825 Builder.createWidePtrAdd(Ptr: ScalarPtrPhi, Offset, DL, Name: "vector.gep");
3826 R->replaceAllUsesWith(New: PtrAdd);
3827
3828 // Create the backedge value for the scalar pointer phi.
3829 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3830 Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
3831 VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy, SrcTy: TypeInfo.inferScalarType(V: VF),
3832 DL);
3833 VPValue *Inc = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Step, VF});
3834
3835 VPValue *InductionGEP =
3836 Builder.createPtrAdd(Ptr: ScalarPtrPhi, Offset: Inc, DL, Name: "ptr.ind");
3837 ScalarPtrPhi->addOperand(Operand: InductionGEP);
3838}
3839
3840void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3841 // Replace loop regions with an explicit CFG.
3842 SmallVector<VPRegionBlock *> LoopRegions;
3843 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3844 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
3845 if (!R->isReplicator())
3846 LoopRegions.push_back(Elt: R);
3847 }
3848 for (VPRegionBlock *R : LoopRegions)
3849 R->dissolveToCFGLoop();
3850}
3851
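/// Schematically, a block terminated by branch-on-two-conds with successors
/// succ.0, succ.1 and succ.2 is expanded below into:
///
///   bb:
///     EMIT branch-on-cond vp<%cond.0>
///   Successor(s): succ.0, bb.interim
///
///   bb.interim:
///     EMIT branch-on-cond vp<%cond.1>
///   Successor(s): succ.1, succ.2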
3852void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
3853 SmallVector<VPInstruction *> WorkList;
3854 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3855 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3856 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3857 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
3858 if (!VPBB->empty() && match(V: &VPBB->back(), P: m_BranchOnTwoConds()))
3859 WorkList.push_back(Elt: cast<VPInstruction>(Val: &VPBB->back()));
3860 }
3861
3862 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3863 // single-condition branches:
3864 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3865 // the first condition is true, and otherwise jumps to a new interim block.
3866 // 2. A branch that ends the interim block, jumps to the second successor if
3867 // the second condition is true, and otherwise jumps to the third
3868 // successor.
3869 for (VPInstruction *Br : WorkList) {
3870 assert(Br->getNumOperands() == 2 &&
3871 "BranchOnTwoConds must have exactly 2 conditions");
3872 DebugLoc DL = Br->getDebugLoc();
3873 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3874 const auto Successors = to_vector(Range&: BrOnTwoCondsBB->getSuccessors());
3875 assert(Successors.size() == 3 &&
3876 "BranchOnTwoConds must have exactly 3 successors");
3877
3878 for (VPBlockBase *Succ : Successors)
3879 VPBlockUtils::disconnectBlocks(From: BrOnTwoCondsBB, To: Succ);
3880
3881 VPValue *Cond0 = Br->getOperand(N: 0);
3882 VPValue *Cond1 = Br->getOperand(N: 1);
3883 VPBlockBase *Succ0 = Successors[0];
3884 VPBlockBase *Succ1 = Successors[1];
3885 VPBlockBase *Succ2 = Successors[2];
3886 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3887 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3888
3889 VPBasicBlock *InterimBB =
3890 Plan.createVPBasicBlock(Name: BrOnTwoCondsBB->getName() + ".interim");
3891
3892 VPBuilder(BrOnTwoCondsBB)
3893 .createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond0}, DL);
3894 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: Succ0);
3895 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: InterimBB);
3896
3897 VPBuilder(InterimBB).createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond1}, DL);
3898 VPBlockUtils::connectBlocks(From: InterimBB, To: Succ1);
3899 VPBlockUtils::connectBlocks(From: InterimBB, To: Succ2);
3900 Br->eraseFromParent();
3901 }
3902}
3903
3904void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3905 VPTypeAnalysis TypeInfo(Plan);
3906 SmallVector<VPRecipeBase *> ToRemove;
3907 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3908 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
3909 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
3910 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) {
3911 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3912 ToRemove.push_back(Elt: WidenIVR);
3913 continue;
3914 }
3915
3916 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) {
3917 // If the recipe only generates scalars, scalarize it instead of
3918 // expanding it.
3919 if (WidenIVR->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF())) {
3920 VPBuilder Builder(WidenIVR);
3921 VPValue *PtrAdd =
3922 scalarizeVPWidenPointerInduction(PtrIV: WidenIVR, Plan, Builder);
3923 WidenIVR->replaceAllUsesWith(New: PtrAdd);
3924 ToRemove.push_back(Elt: WidenIVR);
3925 continue;
3926 }
3927 expandVPWidenPointerInduction(R: WidenIVR, TypeInfo);
3928 ToRemove.push_back(Elt: WidenIVR);
3929 continue;
3930 }
3931
3932 // Expand VPBlendRecipe into a chain of VPInstruction::Select.
3933 VPBuilder Builder(&R);
3934 if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R)) {
3935 VPValue *Select = Blend->getIncomingValue(Idx: 0);
3936 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3937 Select = Builder.createSelect(Cond: Blend->getMask(Idx: I),
3938 TrueVal: Blend->getIncomingValue(Idx: I), FalseVal: Select,
3939 DL: R.getDebugLoc(), Name: "predphi");
3940 Blend->replaceAllUsesWith(New: Select);
3941 ToRemove.push_back(Elt: Blend);
3942 }
3943
3944 if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) {
3945 Expr->decompose();
3946 ToRemove.push_back(Elt: Expr);
3947 }
3948
3949 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3950 auto *LastActiveL = dyn_cast<VPInstruction>(Val: &R);
3951 if (LastActiveL &&
3952 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3953 // Create Not(Mask) for all operands.
3954 SmallVector<VPValue *, 2> NotMasks;
3955 for (VPValue *Op : LastActiveL->operands()) {
3956 VPValue *NotMask = Builder.createNot(Operand: Op, DL: LastActiveL->getDebugLoc());
3957 NotMasks.push_back(Elt: NotMask);
3958 }
3959
3960 // Create FirstActiveLane on the inverted masks.
3961 VPValue *FirstInactiveLane = Builder.createNaryOp(
3962 Opcode: VPInstruction::FirstActiveLane, Operands: NotMasks,
3963 DL: LastActiveL->getDebugLoc(), Name: "first.inactive.lane");
3964
3965 // Subtract 1 to get the last active lane.
3966 VPValue *One = Plan.getOrAddLiveIn(
3967 V: ConstantInt::get(Ty: Type::getInt64Ty(C&: Plan.getContext()), V: 1));
3968 VPValue *LastLane =
3969 Builder.createSub(LHS: FirstInactiveLane, RHS: One,
3970 DL: LastActiveL->getDebugLoc(), Name: "last.active.lane");
3971
3972 LastActiveL->replaceAllUsesWith(New: LastLane);
3973 ToRemove.push_back(Elt: LastActiveL);
3974 continue;
3975 }
3976
3977 // Lower BranchOnCount to ICmp + BranchOnCond.
3978 VPValue *IV, *TC;
3979 if (match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(V&: IV), Op1: m_VPValue(V&: TC)))) {
3980 auto *BranchOnCountInst = cast<VPInstruction>(Val: &R);
3981 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3982 VPValue *Cond = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: IV, B: TC, DL);
3983 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: Cond, DL);
3984 ToRemove.push_back(Elt: BranchOnCountInst);
3985 continue;
3986 }
3987
3988 VPValue *VectorStep;
3989 VPValue *ScalarStep;
3990 if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
3991 Ops: m_VPValue(V&: VectorStep), Ops: m_VPValue(V&: ScalarStep))))
3992 continue;
3993
3994 // Expand WideIVStep.
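      // Conceptually, wide-iv-step(%VectorStep, %ScalarStep) becomes
      //   mul (cast %VectorStep), (cast %ScalarStep)
      // using fmul with the recipe's fast-math flags for floating-point IVs
      // and an integer mul otherwise; the casts are only emitted when the
      // operand types differ from the IV type.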
3995 auto *VPI = cast<VPInstruction>(Val: &R);
3996 Type *IVTy = TypeInfo.inferScalarType(V: VPI);
3997 if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) {
3998 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3999 ? Instruction::UIToFP
4000 : Instruction::Trunc;
4001 VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
4002 }
4003
4004 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4005 if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) {
4006 ScalarStep =
4007 Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
4008 }
4009
4010 VPIRFlags Flags;
4011 unsigned MulOpc;
4012 if (IVTy->isFloatingPointTy()) {
4013 MulOpc = Instruction::FMul;
4014 Flags = VPI->getFastMathFlags();
4015 } else {
4016 MulOpc = Instruction::Mul;
4017 Flags = VPIRFlags::getDefaultFlags(Opcode: MulOpc);
4018 }
4019
4020 VPInstruction *Mul = Builder.createNaryOp(
4021 Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
4022 VectorStep = Mul;
4023 VPI->replaceAllUsesWith(New: VectorStep);
4024 ToRemove.push_back(Elt: VPI);
4025 }
4026 }
4027
4028 for (VPRecipeBase *R : ToRemove)
4029 R->eraseFromParent();
4030}
4031
4032void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
4033 VPBasicBlock *EarlyExitVPBB,
4034 VPlan &Plan,
4035 VPBasicBlock *HeaderVPBB,
4036 VPBasicBlock *LatchVPBB) {
4037 auto *MiddleVPBB = cast<VPBasicBlock>(Val: LatchVPBB->getSuccessors()[0]);
4038 if (!EarlyExitVPBB->getSinglePredecessor() &&
4039 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
4040 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
4041 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
4042 "unsupported early exit VPBB");
    // The early exit operand should always be the last phi operand. If
    // EarlyExitVPBB has two predecessors and EarlyExitingVPBB is the first,
    // swap the operands of the phis.
4046 for (VPRecipeBase &R : EarlyExitVPBB->phis())
4047 cast<VPIRPhi>(Val: &R)->swapOperands();
4048 }
4049
4050 VPBuilder Builder(LatchVPBB->getTerminator());
4051 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4052 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
         "Terminator must be BranchOnCond");
4054 VPValue *CondOfEarlyExitingVPBB =
4055 EarlyExitingVPBB->getTerminator()->getOperand(N: 0);
4056 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
4057 ? CondOfEarlyExitingVPBB
4058 : Builder.createNot(Operand: CondOfEarlyExitingVPBB);
4059
4060 // Create a BranchOnTwoConds in the latch that branches to:
4061 // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
4062 VPValue *IsEarlyExitTaken =
4063 Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {CondToEarlyExit});
4064 VPBasicBlock *VectorEarlyExitVPBB =
4065 Plan.createVPBasicBlock(Name: "vector.early.exit");
4066 VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
4067
4068 VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
4069
4070 // Update the exit phis in the early exit block.
4071 VPBuilder MiddleBuilder(MiddleVPBB);
4072 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
4073 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4074 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
4075 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
4076 // a single predecessor and 1 if it has two.
4077 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
4078 if (ExitIRI->getNumOperands() != 1) {
4079 // The first of two operands corresponds to the latch exit, via MiddleVPBB
4080 // predecessor. Extract its final lane.
4081 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(Builder&: MiddleBuilder);
4082 }
4083
4084 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(N: EarlyExitIdx);
4085 if (!isa<VPIRValue>(Val: IncomingFromEarlyExit)) {
4086 // Update the incoming value from the early exit.
4087 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
4088 Opcode: VPInstruction::FirstActiveLane, Operands: {CondToEarlyExit},
4089 DL: DebugLoc::getUnknown(), Name: "first.active.lane");
4090 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
4091 Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, IncomingFromEarlyExit},
4092 DL: DebugLoc::getUnknown(), Name: "early.exit.value");
4093 ExitIRI->setOperand(I: EarlyExitIdx, New: IncomingFromEarlyExit);
4094 }
4095 }
4096
4097 // Replace the conditional branch controlling the latch exit from the vector
4098 // loop with a multi-conditional branch exiting to vector early exit if the
4099 // early exit has been taken, exiting to middle block if the original
4100 // condition of the vector latch is true, otherwise continuing back to header.
4101 auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
4102 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4103 "Unexpected terminator");
4104 auto *IsLatchExitTaken =
4105 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
4106 B: LatchExitingBranch->getOperand(N: 1));
4107
4108 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4109 LatchExitingBranch->eraseFromParent();
4110
4111 Builder.setInsertPoint(LatchVPBB);
4112 Builder.createNaryOp(Opcode: VPInstruction::BranchOnTwoConds,
4113 Operands: {IsEarlyExitTaken, IsLatchExitTaken}, DL: LatchDL);
4114 LatchVPBB->clearSuccessors();
4115 LatchVPBB->setSuccessors({VectorEarlyExitVPBB, MiddleVPBB, HeaderVPBB});
4116 VectorEarlyExitVPBB->setPredecessors({LatchVPBB});
4117}
4118
/// This function tries to convert extended in-loop reductions to a
/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial and
/// valid. The created recipe must be decomposed into its constituent
/// recipes before execution.
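/// Patterns handled include, e.g., reduce(zext(A)), reduce(sext(A)) and
/// reduce.fadd(fpext(A)).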
4123static VPExpressionRecipe *
4124tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
4125 VFRange &Range) {
4126 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
4127 VPValue *VecOp = Red->getVecOp();
4128
4129 // Clamp the range if using extended-reduction is profitable.
4130 auto IsExtendedRedValidAndClampRange =
4131 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4132 return LoopVectorizationPlanner::getDecisionAndClampRange(
4133 Predicate: [&](ElementCount VF) {
4134 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
4135 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4136
4137 InstructionCost ExtRedCost = InstructionCost::getInvalid();
4138 InstructionCost ExtCost =
4139 cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
4140 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4141
4142 if (Red->isPartialReduction()) {
4143 TargetTransformInfo::PartialReductionExtendKind ExtKind =
4144 TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtOpc);
4145 // FIXME: Move partial reduction creation, costing and clamping
4146 // here from LoopVectorize.cpp.
4147 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4148 Opcode, InputTypeA: SrcTy, InputTypeB: nullptr, AccumType: RedTy, VF, OpAExtend: ExtKind,
4149 OpBExtend: llvm::TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
4150 FMF: RedTy->isFloatingPointTy()
4151 ? std::optional{Red->getFastMathFlags()}
4152 : std::nullopt);
4153 } else if (!RedTy->isFloatingPointTy()) {
4154 // TTI::getExtendedReductionCost only supports integer types.
4155 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4156 Opcode, IsUnsigned: ExtOpc == Instruction::CastOps::ZExt, ResTy: RedTy, Ty: SrcVecTy,
4157 FMF: Red->getFastMathFlags(), CostKind);
4158 }
4159 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4160 },
4161 Range);
4162 };
4163
4164 VPValue *A;
  // Match reduce(ext).
4166 if (isa<VPWidenCastRecipe>(Val: VecOp) &&
4167 (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) ||
4168 match(V: VecOp, P: m_FPExt(Op0: m_VPValue(V&: A)))) &&
4169 IsExtendedRedValidAndClampRange(
4170 RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
4171 cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode(),
4172 Ctx.Types.inferScalarType(V: A)))
4173 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);
4174
4175 return nullptr;
4176}
4177
/// This function tries to convert multiply-accumulate in-loop reductions to a
/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial
/// and valid. The created VPExpressionRecipe must be decomposed into its
/// constituent recipes before execution. Patterns handled by the
/// VPExpressionRecipe:
///   reduce.add(mul(...)),
///   reduce.add(mul(ext(A), ext(B))),
///   reduce.add(ext(mul(ext(A), ext(B)))),
///   reduce.fadd(fmul(ext(A), ext(B))).
4187static VPExpressionRecipe *
4188tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
4189 VPCostContext &Ctx, VFRange &Range) {
4190 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
4191 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4192 Opcode != Instruction::FAdd)
4193 return nullptr;
4194
4195 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
4196
4197 // Clamp the range if using multiply-accumulate-reduction is profitable.
4198 auto IsMulAccValidAndClampRange =
4199 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
4200 VPWidenCastRecipe *OuterExt) -> bool {
4201 return LoopVectorizationPlanner::getDecisionAndClampRange(
4202 Predicate: [&](ElementCount VF) {
4203 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4204 Type *SrcTy =
4205 Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy;
4206 InstructionCost MulAccCost;
4207
4208 if (Red->isPartialReduction()) {
4209 Type *SrcTy2 =
4210 Ext1 ? Ctx.Types.inferScalarType(V: Ext1->getOperand(N: 0)) : nullptr;
4211 // FIXME: Move partial reduction creation, costing and clamping
4212 // here from LoopVectorize.cpp.
4213 MulAccCost = Ctx.TTI.getPartialReductionCost(
4214 Opcode, InputTypeA: SrcTy, InputTypeB: SrcTy2, AccumType: RedTy, VF,
4215 OpAExtend: Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
4216 CastOpc: Ext0->getOpcode())
4217 : TargetTransformInfo::PR_None,
4218 OpBExtend: Ext1 ? TargetTransformInfo::getPartialReductionExtendKind(
4219 CastOpc: Ext1->getOpcode())
4220 : TargetTransformInfo::PR_None,
4221 BinOp: Mul->getOpcode(), CostKind,
4222 FMF: RedTy->isFloatingPointTy()
4223 ? std::optional{Red->getFastMathFlags()}
4224 : std::nullopt);
4225 } else {
4226 // Only partial reductions support mixed or floating-point extends
4227 // at the moment.
4228 if (Ext0 && Ext1 &&
4229 (Ext0->getOpcode() != Ext1->getOpcode() ||
4230 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4231 return false;
4232
4233 bool IsZExt =
4234 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4235 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
4236 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsUnsigned: IsZExt, RedOpcode: Opcode, ResTy: RedTy,
4237 Ty: SrcVecTy, CostKind);
4238 }
4239
4240 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4241 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4242 InstructionCost ExtCost = 0;
4243 if (Ext0)
4244 ExtCost += Ext0->computeCost(VF, Ctx);
4245 if (Ext1)
4246 ExtCost += Ext1->computeCost(VF, Ctx);
4247 if (OuterExt)
4248 ExtCost += OuterExt->computeCost(VF, Ctx);
4249
4250 return MulAccCost.isValid() &&
4251 MulAccCost < ExtCost + MulCost + RedCost;
4252 },
4253 Range);
4254 };
4255
4256 VPValue *VecOp = Red->getVecOp();
4257 VPRecipeBase *Sub = nullptr;
4258 VPValue *A, *B;
4259 VPValue *Tmp = nullptr;
4260
4261 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4262 if (match(V: VecOp, P: m_FMul(Op0: m_FPExt(Op0: m_VPValue()), Op1: m_FPExt(Op0: m_VPValue())))) {
4263 assert(Opcode == Instruction::FAdd &&
4264 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4265 "instruction");
4266 auto *FMul = dyn_cast<VPWidenRecipe>(Val: VecOp);
4267 if (!FMul)
4268 return nullptr;
4269
4270 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 0));
4271 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 1));
4272
4273 if (RecipeA && RecipeB &&
4274 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4275 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4276 }
4277 }
4278 if (RedTy->isFloatingPointTy())
4279 return nullptr;
4280
4281 // Sub reductions could have a sub between the add reduction and vec op.
4282 if (match(V: VecOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: Tmp)))) {
4283 Sub = VecOp->getDefiningRecipe();
4284 VecOp = Tmp;
4285 }
4286
4287 // If ValB is a constant and can be safely extended, truncate it to the same
4288 // type as ExtA's operand, then extend it to the same type as ExtA. This
4289 // creates two uniform extends that can more easily be matched by the rest of
4290 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4291 // replaced with the new extend of the constant.
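  // For example (illustrative types):
  //   mul(sext i8 %a to i32, i32 42)
  // becomes
  //   mul(sext i8 %a to i32, sext (trunc i32 42 to i8) to i32)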
4292 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4293 VPWidenCastRecipe *&ExtB,
4294 VPValue *&ValB, VPWidenRecipe *Mul) {
4295 if (!ExtA || ExtB || !isa<VPIRValue>(Val: ValB))
4296 return;
4297 Type *NarrowTy = Ctx.Types.inferScalarType(V: ExtA->getOperand(N: 0));
4298 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4299 const APInt *Const;
4300 if (!match(V: ValB, P: m_APInt(C&: Const)) ||
4301 !llvm::canConstantBeExtended(
4302 C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
4303 return;
4304 // The truncate ensures that the type of each extended operand is the
4305 // same, and it's been proven that the constant can be extended from
4306 // NarrowTy safely. Necessary since ExtA's extended operand would be
4307 // e.g. an i8, while the const will likely be an i32. This will be
4308 // elided by later optimisations.
4309 VPBuilder Builder(Mul);
4310 auto *Trunc =
4311 Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc, Op: ValB, ResultTy: NarrowTy);
4312 Type *WideTy = Ctx.Types.inferScalarType(V: ExtA);
4313 ValB = ExtB = Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy);
4314 Mul->setOperand(I: 1, New: ExtB);
4315 };
4316
4317 // Try to match reduce.add(mul(...)).
4318 if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
4319 auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
4320 auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);
4321 auto *Mul = cast<VPWidenRecipe>(Val: VecOp);
4322
4323 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4324 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4325
4326 // Match reduce.add/sub(mul(ext, ext)).
4327 if (RecipeA && RecipeB && match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4328 match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4329 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4330 if (Sub)
4331 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4332 cast<VPWidenRecipe>(Val: Sub), Red);
4333 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4334 }
4335 // TODO: Add an expression type for this variant with a negated mul
4336 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4337 return new VPExpressionRecipe(Mul, Red);
4338 }
4339 // TODO: Add an expression type for negated versions of other expression
4340 // variants.
4341 if (Sub)
4342 return nullptr;
4343
4344 // Match reduce.add(ext(mul(A, B))).
4345 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))))) {
4346 auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp);
4347 auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0));
4348 auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
4349 auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);
4350
4351 // reduce.add(ext(mul(ext, const)))
4352 // -> reduce.add(ext(mul(ext, ext(const))))
4353 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4354
4355 // reduce.add(ext(mul(ext(A), ext(B))))
4356 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4357 // The inner extends must either have the same opcode as the outer extend or
4358 // be the same, in which case the multiply can never result in a negative
4359 // value and the outer extend can be folded away by doing wider
4360 // extends for the operands of the mul.
4361 if (Ext0 && Ext1 &&
4362 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4363 Ext0->getOpcode() == Ext1->getOpcode() &&
4364 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4365 auto *NewExt0 = new VPWidenCastRecipe(
4366 Ext0->getOpcode(), Ext0->getOperand(N: 0), Ext->getResultType(), nullptr,
4367 *Ext0, *Ext0, Ext0->getDebugLoc());
4368 NewExt0->insertBefore(InsertPos: Ext0);
4369
4370 VPWidenCastRecipe *NewExt1 = NewExt0;
4371 if (Ext0 != Ext1) {
4372 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(N: 0),
4373 Ext->getResultType(), nullptr, *Ext1,
4374 *Ext1, Ext1->getDebugLoc());
4375 NewExt1->insertBefore(InsertPos: Ext1);
4376 }
4377 Mul->setOperand(I: 0, New: NewExt0);
4378 Mul->setOperand(I: 1, New: NewExt1);
4379 Red->setOperand(I: 1, New: Mul);
4380 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4381 }
4382 }
4383 return nullptr;
4384}
4385
/// This function tries to create abstract recipes from the reduction recipe,
/// for use by subsequent optimizations and cost estimation.
4388static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
4389 VPCostContext &Ctx,
4390 VFRange &Range) {
4391 VPExpressionRecipe *AbstractR = nullptr;
4392 auto IP = std::next(x: Red->getIterator());
4393 auto *VPBB = Red->getParent();
4394 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4395 AbstractR = MulAcc;
4396 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4397 AbstractR = ExtRed;
4398 // Cannot create abstract inloop reduction recipes.
4399 if (!AbstractR)
4400 return;
4401
4402 AbstractR->insertBefore(BB&: *VPBB, IP);
4403 Red->replaceAllUsesWith(New: AbstractR);
4404}
4405
4406void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
4407 VFRange &Range) {
4408 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4409 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
4410 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4411 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
4412 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
4413 }
4414 }
4415}
4416
4417void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
4418 if (Plan.hasScalarVFOnly())
4419 return;
4420
4421#ifndef NDEBUG
4422 VPDominatorTree VPDT(Plan);
4423#endif
4424
4425 SmallVector<VPValue *> VPValues;
4426 if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)
4427 VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount());
4428 append_range(C&: VPValues, R: Plan.getLiveIns());
4429 for (VPRecipeBase &R : *Plan.getEntry())
4430 append_range(C&: VPValues, R: R.definedValues());
4431
4432 auto *VectorPreheader = Plan.getVectorPreheader();
4433 for (VPValue *VPV : VPValues) {
4434 if (vputils::onlyScalarValuesUsed(Def: VPV) ||
4435 (isa<VPIRValue>(Val: VPV) && isa<Constant>(Val: VPV->getLiveInIRValue())))
4436 continue;
4437
4438 // Add explicit broadcast at the insert point that dominates all users.
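    // E.g., a live-in %x with vector users gets a single
    //   %x.bc = broadcast %x
    // in the vector preheader, and those users are rewired to %x.bc.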
4439 VPBasicBlock *HoistBlock = VectorPreheader;
4440 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4441 for (VPUser *User : VPV->users()) {
4442 if (User->usesScalars(Op: VPV))
4443 continue;
4444 if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
4445 HoistPoint = HoistBlock->begin();
4446 else
4447 assert(VPDT.dominates(VectorPreheader,
4448 cast<VPRecipeBase>(User)->getParent()) &&
4449 "All users must be in the vector preheader or dominated by it");
4450 }
4451
4452 VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
4453 auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
4454 VPV->replaceUsesWithIf(New: Broadcast,
4455 ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
4456 return Broadcast != &U && !U.usesScalars(Op: VPV);
4457 });
4458 }
4459}
4460
4461void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
4462 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4463
  // Collect candidate loads with invariant addresses and noalias-scope
  // metadata, as well as memory-writing recipes with noalias metadata.
4466 SmallVector<std::pair<VPRecipeBase *, MemoryLocation>> CandidateLoads;
4467 SmallVector<MemoryLocation> Stores;
4468 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4469 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
4470 for (VPRecipeBase &R : *VPBB) {
4471 // Only handle single-scalar replicated loads with invariant addresses.
4472 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
4473 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4474 RepR->getOpcode() != Instruction::Load)
4475 continue;
4476
4477 VPValue *Addr = RepR->getOperand(N: 0);
4478 if (Addr->isDefinedOutsideLoopRegions()) {
4479 MemoryLocation Loc = *vputils::getMemoryLocation(R: *RepR);
4480 if (!Loc.AATags.Scope)
4481 continue;
4482 CandidateLoads.push_back(Elt: {RepR, Loc});
4483 }
4484 }
4485 if (R.mayWriteToMemory()) {
4486 auto Loc = vputils::getMemoryLocation(R);
4487 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4488 return;
4489 Stores.push_back(Elt: *Loc);
4490 }
4491 }
4492 }
4493
4494 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4495 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4496 // Hoist the load to the preheader if it doesn't alias with any stores
4497 // according to the noalias metadata. Other loads should have been hoisted
    // by other passes.
4499 const AAMDNodes &LoadAA = LoadLoc.AATags;
4500 if (all_of(Range&: Stores, P: [&](const MemoryLocation &StoreLoc) {
4501 return !ScopedNoAliasAAResult::mayAliasInScopes(
4502 Scopes: LoadAA.Scope, NoAlias: StoreLoc.AATags.NoAlias);
4503 })) {
4504 LoadRecipe->moveBefore(BB&: *Preheader, I: Preheader->getFirstNonPhi());
4505 }
4506 }
4507}
4508
4509// Collect common metadata from a group of replicate recipes by intersecting
4510// metadata from all recipes in the group.
4511static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4512 VPIRMetadata CommonMetadata = *Recipes.front();
4513 for (VPReplicateRecipe *Recipe : drop_begin(RangeOrContainer&: Recipes))
4514 CommonMetadata.intersect(MD: *Recipe);
4515 return CommonMetadata;
4516}
4517
4518template <unsigned Opcode>
4519static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
4520collectComplementaryPredicatedMemOps(VPlan &Plan,
4521 PredicatedScalarEvolution &PSE,
4522 const Loop *L) {
4523 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4524 "Only Load and Store opcodes supported");
4525 constexpr bool IsLoad = (Opcode == Instruction::Load);
4526 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4527 VPTypeAnalysis TypeInfo(Plan);
4528
4529 // Group predicated operations by their address SCEV.
4530 DenseMap<const SCEV *, SmallVector<VPReplicateRecipe *>> RecipesByAddress;
4531 for (VPBlockBase *Block : vp_depth_first_shallow(G: LoopRegion->getEntry())) {
4532 auto *VPBB = cast<VPBasicBlock>(Val: Block);
4533 for (VPRecipeBase &R : *VPBB) {
4534 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
4535 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4536 continue;
4537
4538 // For loads, operand 0 is address; for stores, operand 1 is address.
4539 VPValue *Addr = RepR->getOperand(N: IsLoad ? 0 : 1);
4540 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(V: Addr, PSE, L);
4541 if (!isa<SCEVCouldNotCompute>(Val: AddrSCEV))
4542 RecipesByAddress[AddrSCEV].push_back(Elt: RepR);
4543 }
4544 }
4545
4546 // For each address, collect operations with the same or complementary masks.
4547 SmallVector<SmallVector<VPReplicateRecipe *, 4>> AllGroups;
4548 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4549 return TypeInfo.inferScalarType(V: IsLoad ? Recipe : Recipe->getOperand(N: 0));
4550 };
4551 for (auto &[Addr, Recipes] : RecipesByAddress) {
4552 if (Recipes.size() < 2)
4553 continue;
4554
4555 // Collect groups with the same or complementary masks.
4556 for (VPReplicateRecipe *&RecipeI : Recipes) {
4557 if (!RecipeI)
4558 continue;
4559
4560 VPValue *MaskI = RecipeI->getMask();
4561 Type *TypeI = GetLoadStoreValueType(RecipeI);
4562 SmallVector<VPReplicateRecipe *, 4> Group;
4563 Group.push_back(Elt: RecipeI);
4564 RecipeI = nullptr;
4565
4566 // Find all operations with the same or complementary masks.
4567 bool HasComplementaryMask = false;
4568 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4569 if (!RecipeJ)
4570 continue;
4571
4572 VPValue *MaskJ = RecipeJ->getMask();
4573 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4574 if (TypeI == TypeJ) {
4575 // Check if any operation in the group has a complementary mask with
4576 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4577 HasComplementaryMask |= match(V: MaskI, P: m_Not(Op0: m_Specific(VPV: MaskJ))) ||
4578 match(V: MaskJ, P: m_Not(Op0: m_Specific(VPV: MaskI)));
4579 Group.push_back(Elt: RecipeJ);
4580 RecipeJ = nullptr;
4581 }
4582 }
4583
4584 if (HasComplementaryMask) {
4585 assert(Group.size() >= 2 && "must have at least 2 entries");
4586 AllGroups.push_back(Elt: std::move(Group));
4587 }
4588 }
4589 }
4590
4591 return AllGroups;
4592}
4593
4594// Find the recipe with minimum alignment in the group.
4595template <typename InstType>
4596static VPReplicateRecipe *
4597findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
4598 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4599 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4600 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4601 });
4602}
4603
4604void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan,
4605 PredicatedScalarEvolution &PSE,
4606 const Loop *L) {
4607 auto Groups =
4608 collectComplementaryPredicatedMemOps<Instruction::Load>(Plan, PSE, L);
4609 if (Groups.empty())
4610 return;
4611
4612 VPDominatorTree VPDT(Plan);
4613
4614 // Process each group of loads.
4615 for (auto &Group : Groups) {
4616 // Sort loads by dominance order, with earliest (most dominating) first.
4617 sort(C&: Group, Comp: [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4618 return VPDT.properlyDominates(A, B);
4619 });
4620
4621 // Try to use the earliest (most dominating) load to replace all others.
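    // Conceptually (illustrative, two complementary loads of the same
    // address):
    //   if (%m)     %a = load %p
    //   if (not %m) %b = load %p
    // are replaced by a single unpredicated "%v = load %p", with all users of
    // %a and %b rewired to %v.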
4622 VPReplicateRecipe *EarliestLoad = Group[0];
4623 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4624 VPBasicBlock *LastBB = Group.back()->getParent();
4625
4626 // Check that the load doesn't alias with stores between first and last.
4627 auto LoadLoc = vputils::getMemoryLocation(R: *EarliestLoad);
4628 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(MemLoc: *LoadLoc, FirstBB, LastBB))
4629 continue;
4630
4631 // Collect common metadata from all loads in the group.
4632 VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);
4633
4634 // Find the load with minimum alignment to use.
4635 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4636
4637 // Create an unpredicated version of the earliest load with common
4638 // metadata.
4639 auto *UnpredicatedLoad = new VPReplicateRecipe(
4640 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(N: 0)},
4641 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4642 CommonMetadata);
4643
4644 UnpredicatedLoad->insertBefore(InsertPos: EarliestLoad);
4645
4646 // Replace all loads in the group with the unpredicated load.
4647 for (VPReplicateRecipe *Load : Group) {
4648 Load->replaceAllUsesWith(New: UnpredicatedLoad);
4649 Load->eraseFromParent();
4650 }
4651 }
4652}
4653
4654static bool
4655canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
4656 PredicatedScalarEvolution &PSE, const Loop &L,
4657 VPTypeAnalysis &TypeInfo) {
4658 auto StoreLoc = vputils::getMemoryLocation(R: *StoresToSink.front());
4659 if (!StoreLoc || !StoreLoc->AATags.Scope)
4660 return false;
4661
4662 // When sinking a group of stores, all members of the group alias each other.
4663 // Skip them during the alias checks.
4664 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4665 StoresToSink.end());
4666
4667 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4668 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4669 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4670 return canHoistOrSinkWithNoAliasCheck(MemLoc: *StoreLoc, FirstBB, LastBB, SinkInfo);
4671}
4672
4673void VPlanTransforms::sinkPredicatedStores(VPlan &Plan,
4674 PredicatedScalarEvolution &PSE,
4675 const Loop *L) {
4676 auto Groups =
4677 collectComplementaryPredicatedMemOps<Instruction::Store>(Plan, PSE, L);
4678 if (Groups.empty())
4679 return;
4680
4681 VPDominatorTree VPDT(Plan);
4682 VPTypeAnalysis TypeInfo(Plan);
4683
4684 for (auto &Group : Groups) {
4685 sort(C&: Group, Comp: [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4686 return VPDT.properlyDominates(A, B);
4687 });
4688
4689 if (!canSinkStoreWithNoAliasCheck(StoresToSink: Group, PSE, L: *L, TypeInfo))
4690 continue;
4691
4692 // Use the last (most dominated) store's location for the unconditional
4693 // store.
4694 VPReplicateRecipe *LastStore = Group.back();
4695 VPBasicBlock *InsertBB = LastStore->getParent();
4696
4697 // Collect common alias metadata from all stores in the group.
4698 VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);
4699
4700 // Build select chain for stored values.
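    // Conceptually (illustrative, two complementary stores to the same
    // address):
    //   if (%m)     store %a, %p
    //   if (not %m) store %b, %p
    // become a select of the stored value under the later mask, followed by a
    // single unconditional store at the last store's position.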
4701 VPValue *SelectedValue = Group[0]->getOperand(N: 0);
4702 VPBuilder Builder(InsertBB, LastStore->getIterator());
4703
4704 for (unsigned I = 1; I < Group.size(); ++I) {
4705 VPValue *Mask = Group[I]->getMask();
4706 VPValue *Value = Group[I]->getOperand(N: 0);
4707 SelectedValue = Builder.createSelect(Cond: Mask, TrueVal: Value, FalseVal: SelectedValue,
4708 DL: Group[I]->getDebugLoc());
4709 }
4710
4711 // Find the store with minimum alignment to use.
4712 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4713
4714 // Create unconditional store with selected value and common metadata.
4715 auto *UnpredicatedStore =
4716 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4717 {SelectedValue, LastStore->getOperand(N: 1)},
4718 /*IsSingleScalar=*/false,
4719 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4720 UnpredicatedStore->insertBefore(BB&: *InsertBB, IP: LastStore->getIterator());
4721
4722 // Remove all predicated stores from the group.
4723 for (VPReplicateRecipe *Store : Group)
4724 Store->eraseFromParent();
4725 }
4726}
4727
4728void VPlanTransforms::materializeConstantVectorTripCount(
4729 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4730 PredicatedScalarEvolution &PSE) {
4731 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4732 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4733
4734 VPValue *TC = Plan.getTripCount();
4735 // Skip cases for which the trip count may be non-trivial to materialize.
  // That is, when a scalar tail is absent due to tail folding, or when a
  // scalar tail is required.
4738 if (!Plan.hasScalarTail() ||
4739 Plan.getMiddleBlock()->getSingleSuccessor() ==
4740 Plan.getScalarPreheader() ||
4741 !isa<VPIRValue>(Val: TC))
4742 return;
4743
  // Materialize vector trip counts for constants early if they can simply
  // be computed as (Original TC / (VF * UF)) * (VF * UF).
4746 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4747 // tail-folded loops.
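  // For example (illustrative): with an original trip count of 17, VF = 4 and
  // UF = 2, the vector trip count becomes (17 / 8) * 8 = 16.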
4748 ScalarEvolution &SE = *PSE.getSE();
4749 auto *TCScev = SE.getSCEV(V: TC->getLiveInIRValue());
4750 if (!isa<SCEVConstant>(Val: TCScev))
4751 return;
4752 const SCEV *VFxUF = SE.getElementCount(Ty: TCScev->getType(), EC: BestVF * BestUF);
4753 auto VecTCScev = SE.getMulExpr(LHS: SE.getUDivExpr(LHS: TCScev, RHS: VFxUF), RHS: VFxUF);
4754 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(Val: VecTCScev))
4755 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4756}
4757
4758void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4759 VPBasicBlock *VectorPH) {
4760 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
4761 if (BTC->getNumUsers() == 0)
4762 return;
4763
4764 VPBuilder Builder(VectorPH, VectorPH->begin());
4765 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
4766 auto *TCMO =
4767 Builder.createSub(LHS: Plan.getTripCount(), RHS: Plan.getConstantInt(Ty: TCTy, Val: 1),
4768 DL: DebugLoc::getCompilerGenerated(), Name: "trip.count.minus.1");
4769 BTC->replaceAllUsesWith(New: TCMO);
4770}
4771
4772void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
4773 if (Plan.hasScalarVFOnly())
4774 return;
4775
4776 VPTypeAnalysis TypeInfo(Plan);
4777 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4778 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4779 Range: vp_depth_first_shallow(G: Plan.getEntry()));
4780 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4781 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()));
4782 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
  // VPInstructions, excluding ones in replicate regions. Those are not
  // materialized explicitly yet; their vector users are still handled in
  // VPReplicateRecipe::execute(), via shouldPack().
4786 // TODO: materialize build vectors for replicating recipes in replicating
4787 // regions.
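  // For example (illustrative): a replicating recipe %r whose result is also
  // used as a vector is followed by
  //   %r.v = buildvector %r
  // (or buildstructvector for struct results), and its vector users are
  // rewired to %r.v.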
4788 for (VPBasicBlock *VPBB :
4789 concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion)) {
4790 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4791 if (!isa<VPReplicateRecipe, VPInstruction>(Val: &R))
4792 continue;
4793 auto *DefR = cast<VPRecipeWithIRFlags>(Val: &R);
4794 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4795 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
4796 return !U->usesScalars(Op: DefR) || ParentRegion != LoopRegion;
4797 };
4798 if ((isa<VPReplicateRecipe>(Val: DefR) &&
4799 cast<VPReplicateRecipe>(Val: DefR)->isSingleScalar()) ||
4800 (isa<VPInstruction>(Val: DefR) &&
4801 (vputils::onlyFirstLaneUsed(Def: DefR) ||
4802 !cast<VPInstruction>(Val: DefR)->doesGeneratePerAllLanes())) ||
4803 none_of(Range: DefR->users(), P: UsesVectorOrInsideReplicateRegion))
4804 continue;
4805
4806 Type *ScalarTy = TypeInfo.inferScalarType(V: DefR);
4807 unsigned Opcode = ScalarTy->isStructTy()
4808 ? VPInstruction::BuildStructVector
4809 : VPInstruction::BuildVector;
4810 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4811 BuildVector->insertAfter(InsertPos: DefR);
4812
4813 DefR->replaceUsesWithIf(
4814 New: BuildVector, ShouldReplace: [BuildVector, &UsesVectorOrInsideReplicateRegion](
4815 VPUser &U, unsigned) {
4816 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4817 });
4818 }
4819 }
4820
  // Create explicit VPInstructions to convert vectors to scalars. The current
  // implementation is conservative: it may miss some cases that may or may not
  // be vector values. TODO: introduce Unpacks speculatively and remove them
  // later if they are known to operate on scalar values.
4825 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4826 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4827 if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
4828 VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(Val: &R))
4829 continue;
4830 for (VPValue *Def : R.definedValues()) {
4831 // Skip recipes that are single-scalar or only have their first lane
4832 // used.
4833 // TODO: The Defs skipped here may or may not be vector values.
4834 // Introduce Unpacks, and remove them later, if they are guaranteed to
4835 // produce scalar values.
4836 if (vputils::isSingleScalar(VPV: Def) || vputils::onlyFirstLaneUsed(Def))
4837 continue;
4838
4839 // At the moment, we create unpacks only for scalar users outside
4840 // replicate regions. Recipes inside replicate regions still extract the
4841 // required lanes implicitly.
4842 // TODO: Remove once replicate regions are unrolled completely.
4843 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4844 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
4845 return U->usesScalars(Op: Def) &&
4846 (!ParentRegion || !ParentRegion->isReplicator());
4847 };
4848 if (none_of(Range: Def->users(), P: IsCandidateUnpackUser))
4849 continue;
4850
4851 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4852 if (R.isPhi())
4853 Unpack->insertBefore(BB&: *VPBB, IP: VPBB->getFirstNonPhi());
4854 else
4855 Unpack->insertAfter(InsertPos: &R);
4856 Def->replaceUsesWithIf(New: Unpack,
4857 ShouldReplace: [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4858 return IsCandidateUnpackUser(&U);
4859 });
4860 }
4861 }
4862 }
4863}
4864
4865void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
4866 VPBasicBlock *VectorPHVPBB,
4867 bool TailByMasking,
4868 bool RequiresScalarEpilogue) {
4869 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4870 // There's nothing to do if there are no users of the vector trip count or its
4871 // IR value has already been set.
4872 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4873 return;
4874
4875 VPValue *TC = Plan.getTripCount();
4876 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: TC);
4877 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4878 VPValue *Step = &Plan.getVFxUF();
4879
4880 // If the tail is to be folded by masking, round the number of iterations N
4881 // up to a multiple of Step instead of rounding down. This is done by first
4882 // adding Step-1 and then rounding down. Note that it's ok if this addition
4883 // overflows: the vector induction variable will eventually wrap to zero given
4884 // that it starts at zero and its Step is a power of two; the loop will then
4885 // exit, with the last early-exit vector comparison also producing all-true.
4886 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4887 // is accounted for in emitIterationCountCheck that adds an overflow check.
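  // For example (illustrative): with TC = 10 and Step = 8, the rounded-up
  // count is 10 + 7 = 17, 17 urem 8 = 1 below, and the vector trip count
  // becomes 17 - 1 = 16.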
4888 if (TailByMasking) {
4889 TC = Builder.createAdd(
4890 LHS: TC, RHS: Builder.createSub(LHS: Step, RHS: Plan.getConstantInt(Ty: TCTy, Val: 1)),
4891 DL: DebugLoc::getCompilerGenerated(), Name: "n.rnd.up");
4892 }
4893
4894 // Now we need to generate the expression for the part of the loop that the
4895 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4896 // iterations are not required for correctness, or N - Step, otherwise. Step
4897 // is equal to the vectorization factor (number of SIMD elements) times the
4898 // unroll factor (number of SIMD instructions).
4899 VPValue *R =
4900 Builder.createNaryOp(Opcode: Instruction::URem, Operands: {TC, Step},
4901 DL: DebugLoc::getCompilerGenerated(), Name: "n.mod.vf");
4902
4903 // There are cases where we *must* run at least one iteration in the remainder
4904 // loop. See the cost model for when this can happen. If the step evenly
4905 // divides the trip count, we set the remainder to be equal to the step. If
4906 // the step does not evenly divide the trip count, no adjustment is necessary
4907 // since there will already be scalar iterations. Note that the minimum
4908 // iterations check ensures that N >= Step.
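  // For example (illustrative): with TC = 16 and Step = 8, the remainder is 0
  // and is bumped to Step, so the vector loop covers 16 - 8 = 8 iterations and
  // the scalar epilogue the remaining 8.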
4909 if (RequiresScalarEpilogue) {
4910 assert(!TailByMasking &&
           "requiring scalar epilogue is not supported with tail folding");
4912 VPValue *IsZero =
4913 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: R, B: Plan.getConstantInt(Ty: TCTy, Val: 0));
4914 R = Builder.createSelect(Cond: IsZero, TrueVal: Step, FalseVal: R);
4915 }
4916
4917 VPValue *Res =
4918 Builder.createSub(LHS: TC, RHS: R, DL: DebugLoc::getCompilerGenerated(), Name: "n.vec");
4919 VectorTC.replaceAllUsesWith(New: Res);
4920}
4921
4922void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
4923 ElementCount VFEC) {
4924 VPBuilder Builder(VectorPH, VectorPH->begin());
4925 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
4926 VPValue &VF = Plan.getVF();
4927 VPValue &VFxUF = Plan.getVFxUF();
4928 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4929 // used.
4930 // TODO: Assert that they aren't used.
4931
4932 // If there are no users of the runtime VF, compute VFxUF by constant folding
4933 // the multiplication of VF and UF.
4934 if (VF.getNumUsers() == 0) {
4935 VPValue *RuntimeVFxUF =
4936 Builder.createElementCount(Ty: TCTy, EC: VFEC * Plan.getUF());
4937 VFxUF.replaceAllUsesWith(New: RuntimeVFxUF);
4938 return;
4939 }
4940
4941 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4942 // vscale) * UF.
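  // For example (illustrative): for VF = vscale x 4 and UF = 2, VF becomes
  // 4 * vscale and VFxUF becomes (4 * vscale) * 2.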
4943 VPValue *RuntimeVF = Builder.createElementCount(Ty: TCTy, EC: VFEC);
4944 if (!vputils::onlyScalarValuesUsed(Def: &VF)) {
4945 VPValue *BC = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: RuntimeVF);
4946 VF.replaceUsesWithIf(
4947 New: BC, ShouldReplace: [&VF](VPUser &U, unsigned) { return !U.usesScalars(Op: &VF); });
4948 }
4949 VF.replaceAllUsesWith(New: RuntimeVF);
4950
4951 VPValue *UF = Plan.getConstantInt(Ty: TCTy, Val: Plan.getUF());
4952 VPValue *MulByUF = Builder.createOverflowingOp(
4953 Opcode: Instruction::Mul, Operands: {RuntimeVF, UF}, WrapFlags: {true, false});
4954 VFxUF.replaceAllUsesWith(New: MulByUF);
4955}
4956
4957DenseMap<const SCEV *, Value *>
4958VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
4959 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4960
4961 auto *Entry = cast<VPIRBasicBlock>(Val: Plan.getEntry());
4962 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4963 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4964 for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
4965 if (isa<VPIRInstruction, VPIRPhi>(Val: &R))
4966 continue;
4967 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
4968 if (!ExpSCEV)
4969 break;
4970 const SCEV *Expr = ExpSCEV->getSCEV();
4971 Value *Res =
4972 Expander.expandCodeFor(SH: Expr, Ty: Expr->getType(), I: EntryBB->getTerminator());
4973 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4974 VPValue *Exp = Plan.getOrAddLiveIn(V: Res);
4975 ExpSCEV->replaceAllUsesWith(New: Exp);
4976 if (Plan.getTripCount() == ExpSCEV)
4977 Plan.resetTripCount(NewTripCount: Exp);
4978 ExpSCEV->eraseFromParent();
4979 }
4980 assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
4981 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4982 "before any VPIRInstructions");
4983 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4984 // to the VPIRBasicBlock.
4985 auto EI = Entry->begin();
4986 for (Instruction &I : drop_end(RangeOrContainer&: *EntryBB)) {
4987 if (EI != Entry->end() && isa<VPIRInstruction>(Val: *EI) &&
4988 &cast<VPIRInstruction>(Val: &*EI)->getInstruction() == &I) {
4989 EI++;
4990 continue;
4991 }
4992 VPIRInstruction::create(I)->insertBefore(BB&: *Entry, IP: EI);
4993 }
4994
4995 return ExpandedSCEVs;
4996}
4997
/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
/// be converted to a narrower recipe. \p OpV is the operand at index \p OpIdx
/// of a wide recipe that feeds a store interleave group at index \p Idx;
/// \p WideMember0 is the recipe feeding the same interleave group at index 0.
/// A VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
/// all wide ops at all indices (i.e., \p OpV must also be the operand at index
/// \p OpIdx of \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
/// load if \p OpV is defined at \p Idx of a load interleave group.
5006static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
5007 VPValue *OpV, unsigned Idx) {
5008 VPValue *Member0Op = WideMember0->getOperand(N: OpIdx);
5009 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5010 if (!Member0OpR)
5011 return Member0Op == OpV;
5012 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: Member0OpR))
5013 return !W->getMask() && Member0Op == OpV;
5014 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: Member0OpR))
5015 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(I: Idx) == OpV;
5016 return false;
5017}
5018
/// Returns true if \p InterleaveR is a full interleave group with factor and
/// number of members both equal to \p VF. The interleave group must also
/// access the full vector width \p VectorRegWidth.
5022static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
5023 ElementCount VF,
5024 VPTypeAnalysis &TypeInfo,
5025 TypeSize VectorRegWidth) {
5026 if (!InterleaveR || InterleaveR->getMask())
5027 return false;
5028
5029 Type *GroupElementTy = nullptr;
5030 if (InterleaveR->getStoredValues().empty()) {
5031 GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0));
5032 if (!all_of(Range: InterleaveR->definedValues(),
5033 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5034 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5035 }))
5036 return false;
5037 } else {
5038 GroupElementTy =
5039 TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]);
5040 if (!all_of(Range: InterleaveR->getStoredValues(),
5041 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5042 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5043 }))
5044 return false;
5045 }
5046
5047 unsigned VFMin = VF.getKnownMinValue();
5048 TypeSize GroupSize = TypeSize::get(
5049 Quantity: GroupElementTy->getScalarSizeInBits() * VFMin, Scalable: VF.isScalable());
5050 const auto *IG = InterleaveR->getInterleaveGroup();
5051 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
5052 GroupSize == VectorRegWidth;
5053}
5054
/// Returns true if \p VPV is a narrow VPValue.
5056static bool isAlreadyNarrow(VPValue *VPV) {
5057 if (isa<VPIRValue>(Val: VPV))
5058 return true;
5059 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: VPV);
5060 return RepR && RepR->isSingleScalar();
5061}
5062
5063// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5064// a narrow variant.
5065static VPValue *
5066narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
5067 auto *R = V->getDefiningRecipe();
5068 if (!R || NarrowedOps.contains(Ptr: V))
5069 return V;
5070
5071 if (isAlreadyNarrow(VPV: V))
5072 return V;
5073
5074 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(Val: R)) {
5075 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5076 WideMember0->setOperand(
5077 I: Idx,
5078 New: narrowInterleaveGroupOp(V: WideMember0->getOperand(N: Idx), NarrowedOps));
5079 return V;
5080 }
5081
5082 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
    // Narrow the interleave group to a wide load, as the transformed VPlan
    // will only process one original iteration.
5085 auto *LI = cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos());
5086 auto *L = new VPWidenLoadRecipe(
5087 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5088 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5089 L->insertBefore(InsertPos: LoadGroup);
5090 NarrowedOps.insert(Ptr: L);
5091 return L;
5092 }
5093
5094 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: R)) {
5095 assert(RepR->isSingleScalar() &&
5096 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5097 "must be a single scalar load");
5098 NarrowedOps.insert(Ptr: RepR);
5099 return RepR;
5100 }
5101
5102 auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
5103 VPValue *PtrOp = WideLoad->getAddr();
5104 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Val: PtrOp))
5105 PtrOp = VecPtr->getOperand(N: 0);
  // Narrow the wide load to a uniform scalar load, as the transformed VPlan
  // will only process one original iteration.
5108 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5109 /*IsUniform*/ true,
5110 /*Mask*/ nullptr, {}, *WideLoad);
5111 N->insertBefore(InsertPos: WideLoad);
5112 NarrowedOps.insert(Ptr: N);
5113 return N;
5114}
5115
5116void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
5117 TypeSize VectorRegWidth) {
5118 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5119 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
5120 return;
5121
5122 VPTypeAnalysis TypeInfo(Plan);
5123
5124 SmallVector<VPInterleaveRecipe *> StoreGroups;
5125 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5126 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
5127 continue;
5128
5129 if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(Val: &R) &&
5130 vputils::onlyFirstLaneUsed(Def: cast<VPSingleDefRecipe>(Val: &R)))
5131 continue;
5132
5133 // Bail out on recipes not supported at the moment:
5134 // * phi recipes other than the canonical induction
5135 // * recipes writing to memory except interleave groups
5136 // Only support plans with a canonical induction phi.
5137 if (R.isPhi())
5138 return;
5139
5140 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
5141 if (R.mayWriteToMemory() && !InterleaveR)
5142 return;
5143
5144 // Do not narrow interleave groups if there are VectorPointer recipes and
5145 // the plan was unrolled. The recipe implicitly uses VF from
5146 // VPTransformState.
5147 // TODO: Remove restriction once the VF for the VectorPointer offset is
5148 // modeled explicitly as operand.
5149 if (isa<VPVectorPointerRecipe>(Val: &R) && Plan.getUF() > 1)
5150 return;
5151
5152 // All other ops are allowed, but we reject uses that cannot be converted
5153 // when checking all allowed consumers (store interleave groups) below.
5154 if (!InterleaveR)
5155 continue;
5156
5157 // Bail out on non-consecutive interleave groups.
5158 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
5159 VectorRegWidth))
5160 return;
5161
5162 // Skip read interleave groups.
5163 if (InterleaveR->getStoredValues().empty())
5164 continue;
5165
5166 // Narrow interleave groups, if all operands are already matching narrow
5167 // ops.
5168 auto *Member0 = InterleaveR->getStoredValues()[0];
5169 if (isAlreadyNarrow(VPV: Member0) &&
5170 all_of(Range: InterleaveR->getStoredValues(), P: equal_to(Arg&: Member0))) {
5171 StoreGroups.push_back(Elt: InterleaveR);
5172 continue;
5173 }
5174
5175 // For now, we only support full interleave groups storing load interleave
5176 // groups.
5177 if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
5178 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5179 if (!DefR)
5180 return false;
5181 auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
5182 return IR && IR->getInterleaveGroup()->isFull() &&
5183 IR->getVPValue(Op.index()) == Op.value();
5184 })) {
5185 StoreGroups.push_back(Elt: InterleaveR);
5186 continue;
5187 }
5188
    // Check if all values feeding InterleaveR are matching wide recipes whose
    // operands can be narrowed.
5191 auto *WideMember0 =
5192 dyn_cast_or_null<VPWidenRecipe>(Val: InterleaveR->getStoredValues()[0]);
5193 if (!WideMember0)
5194 return;
5195 for (const auto &[I, V] : enumerate(First: InterleaveR->getStoredValues())) {
5196 auto *R = dyn_cast_or_null<VPWidenRecipe>(Val: V);
5197 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
5198 R->getNumOperands() > 2)
5199 return;
5200 if (any_of(Range: enumerate(First: R->operands()),
5201 P: [WideMember0, Idx = I](const auto &P) {
5202 const auto &[OpIdx, OpV] = P;
5203 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
5204 }))
5205 return;
5206 }
5207 StoreGroups.push_back(Elt: InterleaveR);
5208 }
5209
5210 if (StoreGroups.empty())
5211 return;
5212
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  // Narrow the operation trees rooted at the store groups: each store
  // interleave group is converted to a single VPWidenStoreRecipe and the
  // recipes feeding it are narrowed.
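  // For example, with interleave factor and VF both equal to 4 (illustrative):
  // a store group fed directly by a matching load group copies 16 consecutive
  // elements (4 original iterations) per vector iteration; it is replaced by a
  // wide load and wide store of 4 consecutive elements, and the canonical IV
  // step is reduced below so each vector iteration covers a single original
  // iteration per part.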
5216 for (auto *StoreGroup : StoreGroups) {
5217 VPValue *Res =
5218 narrowInterleaveGroupOp(V: StoreGroup->getStoredValues()[0], NarrowedOps);
5219 auto *SI =
5220 cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos());
5221 auto *S = new VPWidenStoreRecipe(
5222 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5223 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5224 S->insertBefore(InsertPos: StoreGroup);
5225 StoreGroup->eraseFromParent();
5226 }
5227
5228 // Adjust induction to reflect that the transformed plan only processes one
5229 // original iteration.
5230 auto *CanIV = VectorLoop->getCanonicalIV();
5231 auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue());
5232 VPBuilder PHBuilder(Plan.getVectorPreheader());
5233
5234 VPValue *UF = Plan.getOrAddLiveIn(
5235 V: ConstantInt::get(Ty: VectorLoop->getCanonicalIVType(), V: 1 * Plan.getUF()));
5236 if (VF.isScalable()) {
5237 VPValue *VScale = PHBuilder.createElementCount(
5238 Ty: VectorLoop->getCanonicalIVType(), EC: ElementCount::getScalable(MinVal: 1));
5239 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5240 Opcode: Instruction::Mul, Operands: {VScale, UF}, WrapFlags: {true, false});
5241 Inc->setOperand(I: 1, New: VScaleUF);
5242 Plan.getVF().replaceAllUsesWith(New: VScale);
5243 } else {
5244 Inc->setOperand(I: 1, New: UF);
5245 Plan.getVF().replaceAllUsesWith(
5246 New: Plan.getConstantInt(Ty: CanIV->getScalarType(), Val: 1));
5247 }
5248 removeDeadRecipes(Plan);
5249}
5250
5251/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5252/// BranchOnCond recipe.
5253void VPlanTransforms::addBranchWeightToMiddleTerminator(
5254 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5255 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5256 auto *MiddleTerm =
5257 dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator());
5258 // Only add branch metadata if there is a (conditional) terminator.
5259 if (!MiddleTerm)
5260 return;
5261
5262 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5263 "must have a BranchOnCond");
  // Assume that `TripCount % VectorStep` is equally distributed.
5265 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5266 if (VF.isScalable() && VScaleForTuning.has_value())
5267 VectorStep *= *VScaleForTuning;
5268 assert(VectorStep > 0 && "trip count should not be zero");
5269 MDBuilder MDB(Plan.getContext());
5270 MDNode *BranchWeights =
5271 MDB.createBranchWeights(Weights: {1, VectorStep - 1}, /*IsExpected=*/false);
5272 MiddleTerm->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
5273}
5274
5275/// Compute and return the end value for \p WideIV, unless it is truncated. If
5276/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5277/// compute the end value of the induction.
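/// For a non-canonical induction with start S and step D, the end value is
/// conceptually S + VectorTC * D (or the corresponding FP/pointer operation),
/// computed via a VPDerivedIVRecipe.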
5278static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
5279 VPBuilder &VectorPHBuilder,
5280 VPTypeAnalysis &TypeInfo,
5281 VPValue *VectorTC) {
5282 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
5283 // Truncated wide inductions resume from the last lane of their vector value
5284 // in the last vector iteration which is handled elsewhere.
5285 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5286 return nullptr;
5287
5288 VPIRValue *Start = WideIV->getStartValue();
5289 VPValue *Step = WideIV->getStepValue();
5290 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
5291 VPValue *EndValue = VectorTC;
5292 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5293 EndValue = VectorPHBuilder.createDerivedIV(
5294 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
5295 Start, Current: VectorTC, Step);
5296 }
5297
5298 // EndValue is derived from the vector trip count (which has the same type as
5299 // the widest induction) and thus may be wider than the induction here.
5300 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV);
5301 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) {
5302 EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
5303 ResultTy: ScalarTypeOfWideIV,
5304 DL: WideIV->getDebugLoc());
5305 }
5306
5307 return EndValue;
5308}
5309
5310void VPlanTransforms::updateScalarResumePhis(
5311 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5312 VPTypeAnalysis TypeInfo(Plan);
5313 auto *ScalarPH = Plan.getScalarPreheader();
5314 auto *MiddleVPBB = cast<VPBasicBlock>(Val: ScalarPH->getPredecessors()[0]);
5315 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5316 VPBuilder VectorPHBuilder(
5317 cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor()));
5318 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5319 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5320 auto *ResumePhiR = cast<VPPhi>(Val: &PhiR);
5321
    // TODO: Initially extract the final value from the induction recipe, and
    // optimize it to the pre-computed end value together with other exit
    // values in optimizeInductionExitUsers.
5324 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Val: ResumePhiR->getOperand(N: 0));
5325 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(Val: VectorPhiR)) {
5326 if (VPValue *EndValue = tryToComputeEndValueForInduction(
5327 WideIV: WideIVR, VectorPHBuilder, TypeInfo, VectorTC: &Plan.getVectorTripCount())) {
5328 IVEndValues[WideIVR] = EndValue;
5329 ResumePhiR->setOperand(I: 0, New: EndValue);
5330 ResumePhiR->setName("bc.resume.val");
5331 continue;
5332 }
5333 // TODO: Also handle truncated inductions here. Computing end-values
5334 // separately should be done as VPlan-to-VPlan optimization, after
5335 // legalizing all resume values to use the last lane from the loop.
5336 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5337 "should only skip truncated wide inductions");
5338 continue;
5339 }
5340
    // The backedge value provides the value to resume with when coming out of
    // the loop; for FORs this is a vector whose last element needs to be
    // extracted. The start value provides the value if the loop is bypassed.
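    // For illustration, for a FOR with UF = 2 the backedge value has two
    // parts; the resume value is the last lane of the second (last) part,
    // extracted below via ExtractLastPart followed by ExtractLastLane.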
5344 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(Val: VectorPhiR);
5345 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5346 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5347 "Cannot handle loops with uncountable early exits");
5348 if (IsFOR) {
5349 auto *ExtractPart = MiddleBuilder.createNaryOp(
5350 Opcode: VPInstruction::ExtractLastPart, Operands: ResumeFromVectorLoop);
5351 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5352 Opcode: VPInstruction::ExtractLastLane, Operands: ExtractPart, DL: DebugLoc::getUnknown(),
5353 Name: "vector.recur.extract");
5354 }
5355 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5356 ResumePhiR->setOperand(I: 0, New: ResumeFromVectorLoop);
5357 }
5358}
5359
5360void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
5361 VFRange &Range) {
5362 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5363 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5364 auto *MiddleVPBB = Plan.getMiddleBlock();
5365 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5366 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5367
5368 auto IsScalableOne = [](ElementCount VF) -> bool {
5369 return VF == ElementCount::getScalable(MinVal: 1);
5370 };
5371
5372 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5373 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
5374 if (!FOR)
5375 continue;
5376
5377 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5378 "Cannot handle loops with uncountable early exits");
5379
    // This is the second phase of vectorizing first-order recurrences,
    // creating extracts for users outside the loop. An overview of the
    // transformation is described below. Suppose we have the following loop,
    // with a use of the last a[i-1] after the loop:
5384 //
5385 // for (int i = 0; i < n; ++i) {
5386 // t = a[i - 1];
5387 // b[i] = a[i] - t;
5388 // }
5389 // use t;
5390 //
5391 // There is a first-order recurrence on "a". For this loop, the shorthand
5392 // scalar IR looks like:
5393 //
5394 // scalar.ph:
5395 // s.init = a[-1]
5396 // br scalar.body
5397 //
5398 // scalar.body:
5399 // i = phi [0, scalar.ph], [i+1, scalar.body]
5400 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5401 // s2 = a[i]
5402 // b[i] = s2 - s1
5403 // br cond, scalar.body, exit.block
5404 //
5405 // exit.block:
5406 // use = lcssa.phi [s1, scalar.body]
5407 //
    // In this example, s1 is a recurrence because its value depends on the
5409 // previous iteration. In the first phase of vectorization, we created a
5410 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5411 // for users in the scalar preheader and exit block.
5412 //
5413 // vector.ph:
5414 // v_init = vector(..., ..., ..., a[-1])
5415 // br vector.body
5416 //
5417 // vector.body
5418 // i = phi [0, vector.ph], [i+4, vector.body]
5419 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5420 // v2 = a[i, i+1, i+2, i+3]
    //   // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //   b[i, i+1, i+2, i+3] = v2 - v1
5424 // br cond, vector.body, middle.block
5425 //
5426 // middle.block:
5427 // vector.recur.extract.for.phi = v2(2)
5428 // vector.recur.extract = v2(3)
5429 // br cond, scalar.ph, exit.block
5430 //
5431 // scalar.ph:
5432 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5433 // [s.init, otherwise]
5434 // br scalar.body
5435 //
5436 // scalar.body:
5437 // i = phi [0, scalar.ph], [i+1, scalar.body]
5438 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5439 // s2 = a[i]
5440 // b[i] = s2 - s1
5441 // br cond, scalar.body, exit.block
5442 //
5443 // exit.block:
    //   use = lcssa.phi [s1, scalar.body],
    //                   [vector.recur.extract.for.phi, middle.block]
5446 //
5447 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as the
    // operand for the VPIRInstruction modeling the phi.
5450 for (VPRecipeBase &R : make_early_inc_range(
5451 Range: make_range(x: MiddleVPBB->getFirstNonPhi(), y: MiddleVPBB->end()))) {
5452 if (!match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_Specific(VPV: FOR))))
5453 continue;
5454
5455 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5456 // penultimate value of the recurrence. Instead we rely on the existing
5457 // extract of the last element from the result of
5458 // VPInstruction::FirstOrderRecurrenceSplice.
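      // For illustration, with VF = vscale x 1 and vscale = 1 each part
      // holds a single lane, so there is no penultimate lane to extract
      // within the last part.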
5459 // TODO: Consider vscale_range info and UF.
5460 if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
5461 Range))
5462 return;
5463 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5464 Opcode: VPInstruction::ExtractPenultimateElement, Operands: FOR->getBackedgeValue(), DL: {},
5465 Name: "vector.recur.extract.for.phi");
5466 cast<VPInstruction>(Val: &R)->replaceAllUsesWith(New: PenultimateElement);
5467 }
5468 }
5469}
5470
5471namespace {
5472
/// A chain of recipes that forms a partial reduction. It matches either
/// reduction_bin_op (extend (A), accumulator), or
/// reduction_bin_op (bin_op (extend (A), extend (B)), accumulator).
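/// For illustration, a dot-product style update acc += sext(a[i]) *
/// sext(b[i]) on i8 inputs maps to ReductionBinOp = the widened add, BinOp =
/// the widened mul, and ExtendA/ExtendB = the two sext recipes.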
5476struct VPPartialReductionChain {
5477 /// The top-level binary operation that forms the reduction to a scalar
5478 /// after the loop body.
5479 VPWidenRecipe *ReductionBinOp;
  /// The extends of the inner binary operation's operands. ExtendB may be
  /// nullptr if the second operand is a constant or there is no separate
  /// inner binary operation.
  VPWidenCastRecipe *ExtendA;
  VPWidenCastRecipe *ExtendB;
  /// The user of the extends that is then reduced.
  VPWidenRecipe *BinOp;
  /// The ratio between the sizes of the accumulator type and the narrow input
  /// type, i.e. the number of input lanes reduced into each accumulator lane.
  unsigned ScaleFactor;
5486};
5487
/// Helper to transform a partial reduction chain into a partial reduction
/// recipe. Checks profitability and clamps the VF range. Returns true if the
/// transformation succeeded.
5491static bool transformToPartialReduction(const VPPartialReductionChain &Chain,
5492 VFRange &Range, VPCostContext &CostCtx,
5493 VPlan &Plan) {
5494 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5495 unsigned ScaleFactor = Chain.ScaleFactor;
5496 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5497
5498 VPValue *BinOp = WidenRecipe->getOperand(N: 0);
5499 VPValue *Accumulator = WidenRecipe->getOperand(N: 1);
5500
5501 // Swap if needed to ensure Accumulator is the PHI or partial reduction.
5502 if (isa_and_present<VPReductionPHIRecipe, VPReductionRecipe>(Val: BinOp))
5503 std::swap(a&: BinOp, b&: Accumulator);
5504
  // For chained reductions, only transform if the accumulator is already a PHI
  // or a partial reduction. Otherwise, it needs to be transformed first.
5507 auto *AccumRecipe = Accumulator->getDefiningRecipe();
5508 if (!isa_and_present<VPReductionPHIRecipe, VPReductionRecipe>(Val: AccumRecipe))
5509 return false;
5510
  // Gather the types and extend kinds needed to check whether the partial
  // reduction is profitable for the VF range.
5512 Type *PhiType = CostCtx.Types.inferScalarType(V: Accumulator);
5513
5514 // Derive extend info from the stored extends.
5515 auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
5516 -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
5517 if (!Ext)
5518 return {nullptr, TargetTransformInfo::PR_None};
5519 Type *ExtOpType = CostCtx.Types.inferScalarType(V: Ext->getOperand(N: 0));
5520 auto ExtKind = TargetTransformInfo::getPartialReductionExtendKind(
5521 CastOpc: static_cast<Instruction::CastOps>(Ext->getOpcode()));
5522 return {ExtOpType, ExtKind};
5523 };
5524 auto ExtInfoA = GetExtInfo(Chain.ExtendA);
5525 auto ExtInfoB = GetExtInfo(Chain.ExtendB);
5526 Type *ExtOpTypeA = ExtInfoA.first;
5527 Type *ExtOpTypeB = ExtInfoB.first;
5528 auto ExtKindA = ExtInfoA.second;
5529 auto ExtKindB = ExtInfoB.second;
  // If ExtendB is nullptr but there is a separate BinOp, the second operand
  // was a constant; check that it can reuse the extend kind of the first.
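  // For example, with an i8 -> i32 sign-extend on the first operand, a
  // constant like 42 can be treated as a sign-extended i8 and reuse ExtKindA,
  // whereas 300 cannot.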
5532 if (!Chain.ExtendB && Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) {
    // Validate that the constant fits in the narrow type under the chosen
    // extend kind.
5534 const APInt *Const = nullptr;
5535 for (VPValue *Op : Chain.BinOp->operands()) {
5536 if (match(V: Op, P: m_APInt(C&: Const)))
5537 break;
5538 }
5539 if (!Const || !canConstantBeExtended(C: Const, NarrowType: ExtOpTypeA, ExtKind: ExtKindA))
5540 return false;
5541 ExtOpTypeB = ExtOpTypeA;
5542 ExtKindB = ExtKindA;
5543 }
5544
5545 // BinOpc is only set when there's a separate binary op (not when BinOp is
5546 // the reduction itself).
5547 std::optional<unsigned> BinOpc =
5548 (Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp)
5549 ? std::make_optional(t: Chain.BinOp->getOpcode())
5550 : std::nullopt;
5551
5552 if (!LoopVectorizationPlanner::getDecisionAndClampRange(
5553 Predicate: [&](ElementCount VF) {
5554 return CostCtx.TTI
5555 .getPartialReductionCost(
5556 Opcode: WidenRecipe->getOpcode(), InputTypeA: ExtOpTypeA, InputTypeB: ExtOpTypeB, AccumType: PhiType,
5557 VF, OpAExtend: ExtKindA, OpBExtend: ExtKindB, BinOp: BinOpc, CostKind: CostCtx.CostKind,
5558 FMF: PhiType->isFloatingPointTy()
5559 ? std::optional{WidenRecipe->getFastMathFlags()}
5560 : std::nullopt)
5561 .isValid();
5562 },
5563 Range))
5564 return false;
5565
5566 VPValue *Cond = nullptr;
5567 VPValue *ExitValue = nullptr;
5568 if (auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(Val: AccumRecipe)) {
5569 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
5570 RdxPhi->setVFScaleFactor(ScaleFactor);
5571
5572 // Update ReductionStartVector instruction scale factor.
5573 VPValue *StartValue = RdxPhi->getOperand(N: 0);
5574 auto *StartInst = cast<VPInstruction>(Val: StartValue);
5575 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
5576 auto *NewScaleFactor = Plan.getConstantInt(BitWidth: 32, Val: ScaleFactor);
5577 StartInst->setOperand(I: 2, New: NewScaleFactor);
5578
5579 // Find the ComputeReductionResult that uses the WidenRecipe (the exit
5580 // value). Look through selects for predicated reductions.
5581 if (auto *RdxResult = vputils::findComputeReductionResult(PhiR: RdxPhi)) {
5582 ExitValue = RdxResult->getOperand(N: 0);
5583 match(V: ExitValue, P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_VPValue(), Op2: m_VPValue()));
5584 }
5585 }
5586
5587 // Handle SUB by negating the operand and using ADD for the partial reduction.
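  // For illustration, acc -= mul(ext(a), ext(b)) is rewritten as
  // acc += (0 - mul(ext(a), ext(b))), so the partial reduction created below
  // always accumulates with an ADD.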
5588 if (WidenRecipe->getOpcode() == Instruction::Sub) {
5589 VPBuilder Builder(WidenRecipe);
5590 Type *ElemTy = CostCtx.Types.inferScalarType(V: BinOp);
5591 auto *Zero = Plan.getConstantInt(Ty: ElemTy, Val: 0);
5592 VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5593 ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
5594 : VPIRFlags();
5595 auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
5596 VPIRMetadata(), DebugLoc::getUnknown());
5597 Builder.insert(R: NegRecipe);
5598 BinOp = NegRecipe;
5599 }
5600
5601 RecurKind RdxKind =
5602 PhiType->isFloatingPointTy() ? RecurKind::FAdd : RecurKind::Add;
5603 auto *PartialRed = new VPReductionRecipe(
5604 RdxKind,
5605 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
5606 : FastMathFlags(),
5607 WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
5608 RdxUnordered{/*VFScaleFactor=*/ScaleFactor});
5609 PartialRed->insertBefore(InsertPos: WidenRecipe);
5610
5611 if (Cond)
5612 ExitValue->replaceAllUsesWith(New: PartialRed);
5613 WidenRecipe->replaceAllUsesWith(New: PartialRed);
5614 return true;
5615}
5616
5617/// Examines reduction operations to see if the target can use a cheaper
5618/// operation with a wider per-iteration input VF and narrower PHI VF.
5619/// Recursively calls itself to identify chained scaled reductions.
/// Returns true if this invocation added an entry to \p Chains, otherwise
/// false.
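/// For illustration, an update chain such as
/// add(add(phi, mul(ext, ext)), mul(ext, ext)) is handled by first recursing
/// into the inner add and then using its chain entry as the accumulator for
/// the outer add.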
5621static bool
5622getScaledReductions(VPSingleDefRecipe *RedPhiR, VPValue *PrevValue,
5623 SmallVectorImpl<VPPartialReductionChain> &Chains,
5624 VPTypeAnalysis &TypeInfo) {
5625 auto *UpdateR = dyn_cast<VPWidenRecipe>(Val: PrevValue);
5626 if (!UpdateR || !Instruction::isBinaryOp(Opcode: UpdateR->getOpcode()))
5627 return false;
5628
5629 VPValue *Op = UpdateR->getOperand(N: 0);
5630 VPValue *PhiOp = UpdateR->getOperand(N: 1);
5631 if (Op == RedPhiR)
5632 std::swap(a&: Op, b&: PhiOp);
5633
  // If Op is an extend, the chain can still form a valid partial reduction if
  // the extended mul fulfills the other requirements.
  // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
  // reduction since the inner extends will be widened. We already have one-use
  // checks on the inner extends, so widening them is safe.
5639 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
5640 if (match(V: Op, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(), Op1: m_VPValue()))) ||
5641 match(V: Op, P: m_FPExt(Op0: m_FMul(Op0: m_VPValue(), Op1: m_VPValue())))) {
5642 auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Val: Op);
5643 if (!CastRecipe)
5644 return false;
5645 auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
5646 OuterExtKind = TTI::getPartialReductionExtendKind(CastOpc: CastOp);
5647 Op = CastRecipe->getOperand(N: 0);
5648 }
5649
  // Try to get a scaled reduction from the first non-phi operand.
  // If one is found, we use the discovered reduction instruction in
  // place of the accumulator for costing.
5653 if (getScaledReductions(RedPhiR, PrevValue: Op, Chains, TypeInfo)) {
5654 RedPhiR = Chains.rbegin()->ReductionBinOp;
5655 Op = UpdateR->getOperand(N: 0);
5656 PhiOp = UpdateR->getOperand(N: 1);
5657 if (Op == RedPhiR)
5658 std::swap(a&: Op, b&: PhiOp);
5659 }
5660 if (RedPhiR != PhiOp)
5661 return false;
5662
  // The extend recipes feeding the reduction, populated by the MatchExtends
  // lambda below.
  VPWidenCastRecipe *CastRecipes[2] = {nullptr};
5667
5668 // Match extends and populate CastRecipes. Returns false if matching fails.
5669 auto MatchExtends = [OuterExtKind,
5670 &CastRecipes](ArrayRef<VPValue *> Operands) {
5671 assert(Operands.size() <= 2 && "expected at most 2 operands");
5672
5673 for (const auto &[I, OpVal] : enumerate(First&: Operands)) {
      // Allow a constant as the second operand; it is validated in the
      // transform.
5675 const APInt *Unused;
5676 if (I > 0 && CastRecipes[0] && match(V: OpVal, P: m_APInt(C&: Unused)))
5677 continue;
5678
5679 VPValue *ExtInput;
5680 if (!match(V: OpVal, P: m_ZExtOrSExt(Op0: m_VPValue(V&: ExtInput))) &&
5681 !match(V: OpVal, P: m_FPExt(Op0: m_VPValue(V&: ExtInput))))
5682 return false;
5683
5684 CastRecipes[I] = dyn_cast<VPWidenCastRecipe>(Val: OpVal);
5685 if (!CastRecipes[I])
5686 return false;
5687
5688 // The outer extend kind must match the inner extends for folding.
5689 if (OuterExtKind) {
5690 auto CastOp =
5691 static_cast<Instruction::CastOps>(CastRecipes[I]->getOpcode());
5692 if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOpc: CastOp))
5693 return false;
5694 }
5695 }
5696 return CastRecipes[0] != nullptr;
5697 };
5698
5699 // If Op is a binary operator, check both of its operands to see if they are
5700 // extends. Otherwise, see if the update comes directly from an extend.
5701 auto *BinOp = dyn_cast<VPWidenRecipe>(Val: Op);
5702 if (BinOp && Instruction::isBinaryOp(Opcode: BinOp->getOpcode())) {
5703 if (!BinOp->hasOneUse())
5704 return false;
5705
5706 // Handle neg(binop(ext, ext)) pattern.
5707 VPValue *OtherOp = nullptr;
5708 if (match(V: BinOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: OtherOp))))
5709 BinOp = dyn_cast<VPWidenRecipe>(Val: OtherOp);
5710
5711 if (!BinOp || !Instruction::isBinaryOp(Opcode: BinOp->getOpcode()) ||
5712 !MatchExtends(BinOp->operands()))
5713 return false;
5714 } else if (match(V: UpdateR, P: m_Add(Op0: m_VPValue(), Op1: m_VPValue())) ||
5715 match(V: UpdateR, P: m_FAdd(Op0: m_VPValue(), Op1: m_VPValue()))) {
    // We already know the operands of UpdateR are Op and PhiOp.
5717 if (!MatchExtends({Op}))
5718 return false;
5719 BinOp = UpdateR;
5720 } else {
5721 return false;
5722 }
5723
5724 Type *PhiType = TypeInfo.inferScalarType(V: RedPhiR);
5725 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
5726 Type *ExtOpType = TypeInfo.inferScalarType(V: CastRecipes[0]->getOperand(N: 0));
5727 TypeSize ASize = ExtOpType->getPrimitiveSizeInBits();
5728 if (!PHISize.hasKnownScalarFactor(RHS: ASize))
5729 return false;
5730
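  // For example, an i32 accumulator fed by extends of i8 operands gives a
  // scale factor of 32 / 8 = 4, i.e. four input lanes are reduced into each
  // accumulator lane.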
5731 Chains.push_back(
5732 Elt: {.ReductionBinOp: UpdateR, .ExtendA: CastRecipes[0], .ExtendB: CastRecipes[1], .BinOp: BinOp,
5733 .ScaleFactor: static_cast<unsigned>(PHISize.getKnownScalarFactor(RHS: ASize))});
5734 return true;
5735}
5736} // namespace
5737
5738void VPlanTransforms::createPartialReductions(VPlan &Plan,
5739 VPCostContext &CostCtx,
5740 VFRange &Range) {
  // Find all possible partial reductions, grouping chains by their PHI. This
  // grouping allows invalidating the whole chain if any link is not a valid
  // partial reduction.
5744 MapVector<VPReductionPHIRecipe *, SmallVector<VPPartialReductionChain>>
5745 ChainsByPhi;
5746 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
5747 for (VPRecipeBase &R : HeaderVPBB->phis()) {
5748 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
5749 if (!RedPhiR)
5750 continue;
5751
5752 // Get the backedge value from the reduction PHI and find the
5753 // ComputeReductionResult that uses it (directly or through a select for
5754 // predicated reductions).
5755 if (auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR)) {
5756 VPValue *ExitValue = RdxResult->getOperand(N: 0);
5757 match(V: ExitValue,
5758 P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: ExitValue), Op2: m_VPValue()));
5759 getScaledReductions(RedPhiR, PrevValue: ExitValue, Chains&: ChainsByPhi[RedPhiR],
5760 TypeInfo&: CostCtx.Types);
5761 }
5762 }
5763
5764 if (ChainsByPhi.empty())
5765 return;
5766
5767 // Build set of partial reduction operations for extend user validation and
5768 // a map of reduction bin ops to their scale factors for scale validation.
5769 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
5770 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
5771 for (const auto &[_, Chains] : ChainsByPhi)
5772 for (const VPPartialReductionChain &Chain : Chains) {
5773 PartialReductionOps.insert(Ptr: Chain.BinOp);
5774 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
5775 }
5776
5777 // A partial reduction is invalid if any of its extends are used by
5778 // something that isn't another partial reduction. This is because the
5779 // extends are intended to be lowered along with the reduction itself.
5780 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
5781 return !Ext || all_of(Range: Ext->users(), P: [&](VPUser *U) {
5782 return PartialReductionOps.contains(Ptr: cast<VPRecipeBase>(Val: U));
5783 });
5784 };
5785
  // Validate chains: check that extends are only used by partial reductions,
  // and that reduction bin ops are only used by other partial reductions with
  // matching scale factors, by recipes outside the loop region (the
  // compute-reduction-result), or by the select introduced by tail-folding.
  // Otherwise we would create users of scaled reductions where the types of
  // the other operands don't match.
5791 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
5792 for (const VPPartialReductionChain &Chain : Chains) {
5793 if (!ExtendUsersValid(Chain.ExtendA) ||
5794 !ExtendUsersValid(Chain.ExtendB)) {
5795 Chains.clear();
5796 break;
5797 }
5798 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
5799 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: U))
5800 return PhiR == RedPhiR;
5801 auto *R = cast<VPSingleDefRecipe>(Val: U);
5802 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(Val: R, Default: 0) ||
5803 match(R, P: m_ComputeReductionResult(
5804 Op0: m_Specific(VPV: Chain.ReductionBinOp))) ||
5805 match(R, P: m_Select(Op0: m_VPValue(), Op1: m_Specific(VPV: Chain.ReductionBinOp),
5806 Op2: m_Specific(VPV: RedPhiR)));
5807 };
5808 if (!all_of(Range: Chain.ReductionBinOp->users(), P: UseIsValid)) {
5809 Chains.clear();
5810 break;
5811 }
5812
5813 // Check if the compute-reduction-result is used by a sunk store.
5814 // TODO: Also form partial reductions in those cases.
5815 if (auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR)) {
5816 if (any_of(Range: RdxResult->users(), P: [](VPUser *U) {
5817 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
5818 return RepR && isa<StoreInst>(Val: RepR->getUnderlyingInstr());
5819 })) {
5820 Chains.clear();
5821 break;
5822 }
5823 }
5824 }
5825 }
5826
5827 for (const auto &[_, Chains] : ChainsByPhi)
5828 for (const VPPartialReductionChain &Chain : Chains)
5829 transformToPartialReduction(Chain, Range, CostCtx, Plan);
5830}
5831