1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/PostOrderIterator.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallPtrSet.h"
30#include "llvm/ADT/TypeSwitch.h"
31#include "llvm/Analysis/IVDescriptors.h"
32#include "llvm/Analysis/InstSimplifyFolder.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/Analysis/MemoryLocation.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/ScopedNoAliasAA.h"
37#include "llvm/Analysis/VectorUtils.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/TypeSize.h"
43#include "llvm/Transforms/Utils/LoopUtils.h"
44#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
/// Try to convert plain VPInstructions that wrap ingredient IR instructions
/// into widened recipes (loads, stores, GEPs, intrinsic calls, casts and
/// generic widen recipes). Returns false if a call has no vector-intrinsic
/// equivalent; recipes converted before that point are not rolled back.
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
    VPlan &Plan, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside region
    // NOTE(review): this is a 'break', not a 'continue' — it stops at the
    // first block without a parent region; confirm RPOT visits all in-region
    // blocks before any such block.
    if (!VPBB->getParent())
      break;
    // Only process recipes up to (excluding) the block terminator.
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      // Recipes without an underlying IR value are already abstract and are
      // left alone.
      VPValue *VPV = Ingredient.getVPSingleValue();
      if (!VPV->getUnderlyingValue())
        continue;

      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
        // Phis become widened phis, carrying over all incoming operands.
        auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
        NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
        for (VPValue *Op : PhiR->operands())
          NewRecipe->addOperand(Op);
      } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/, *VPI,
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          // For stores, operand 1 is the address and operand 0 the value.
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
              Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
                                           Ingredient.getDebugLoc());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          // Calls are only convertible if they map to a vector intrinsic.
          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
          if (VectorID == Intrinsic::not_intrinsic)
            return false;
          // drop_end removes the callee operand of the call.
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
              *VPI, CI->getDebugLoc());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
              VPIRFlags(*CI), VPIRMetadata(*CI));
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
                                        *VPI, Ingredient.getDebugLoc());
        }
      } else {
        assert(isa<VPWidenIntOrFpInductionRecipe>(&Ingredient) &&
               "inductions must be created earlier");
        continue;
      }

      // Splice in the replacement and redirect all users of the old value.
      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recpies with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
  return true;
}
126
/// Helper for extra no-alias checks via known-safe recipes and SCEV, used when
/// deciding whether stores can be sunk past other memory operations.
class SinkStoreInfo {
  /// Recipes that are known safe and must not be alias-checked (e.g. the
  /// members of the store group being sunk).
  const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
  /// The store that other replicated stores are compared against.
  VPReplicateRecipe &GroupLeader;
  PredicatedScalarEvolution &PSE;
  const Loop &L;
  VPTypeAnalysis &TypeInfo;

  // Return true if \p A and \p B are known to not alias for all VFs in the
  // plan, checked via the distance between the accesses
  bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
    // Only replicated stores are handled.
    if (A->getOpcode() != Instruction::Store ||
        B->getOpcode() != Instruction::Store)
      return false;

    // For stores, operand 1 is the address; both addresses must have a
    // computable SCEV.
    VPValue *AddrA = A->getOperand(1);
    const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
    VPValue *AddrB = B->getOperand(1);
    const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
    if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
      return false;

    // The distance between the two addresses must be a compile-time constant.
    const APInt *Distance;
    ScalarEvolution &SE = *PSE.getSE();
    if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
      return false;

    // Operand 0 is the stored value; its type determines the store footprint.
    const DataLayout &DL = SE.getDataLayout();
    Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
    uint64_t SizeA = DL.getTypeStoreSize(TyA);
    Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
    uint64_t SizeB = DL.getTypeStoreSize(TyB);

    // Use the maximum store size to ensure no overlap from either direction.
    // Currently only handles fixed sizes, as it is only used for
    // replicating VPReplicateRecipes.
    uint64_t MaxStoreSize = std::max(SizeA, SizeB);

    // No-alias holds if the constant distance covers the footprint of the
    // widest access across the largest (fixed) vectorization factor.
    auto VFs = B->getParent()->getPlan()->vectorFactors();
    ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT);
    if (MaxVF.isScalable())
      return false;
    return Distance->abs().uge(
        MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
  }

public:
  SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
                VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
                const Loop &L, VPTypeAnalysis &TypeInfo)
      : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
        L(L), TypeInfo(TypeInfo) {}

  /// Return true if \p R should be skipped during alias checking, either
  /// because it's in the exclude set or because no-alias can be proven via
  /// SCEV.
  bool shouldSkip(VPRecipeBase &R) const {
    auto *Store = dyn_cast<VPReplicateRecipe>(&R);
    return ExcludeRecipes.contains(&R) ||
           (Store && isNoAliasViaDistance(Store, &GroupLeader));
  }
};
189
/// Check if a memory operation doesn't alias with memory operations in blocks
/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
/// checked (for load hoisting). Otherwise recipes that both read and write
/// memory are checked, and SCEV is used to prove no-alias between the group
/// leader and other replicate recipes (for store sinking).
static bool
canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
                               VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
                               std::optional<SinkStoreInfo> SinkInfo = {}) {
  bool CheckReads = SinkInfo.has_value();
  // Without a noalias scope there is nothing to compare against; be
  // conservative.
  if (!MemLoc.AATags.Scope)
    return false;

  // Walk the single-successor chain starting at FirstBB, up to LastBB.
  for (VPBlockBase *Block = FirstBB; Block;
       Block = Block->getSingleSuccessor()) {
    assert(Block->getNumSuccessors() <= 1 &&
           "Expected at most one successor in block chain");
    auto *VPBB = cast<VPBasicBlock>(Block);
    for (VPRecipeBase &R : *VPBB) {
      // Skip recipes proven safe (excluded or no-alias via SCEV distance).
      if (SinkInfo && SinkInfo->shouldSkip(R))
        continue;

      // Skip recipes that don't need checking.
      if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
        continue;

      auto Loc = vputils::getMemoryLocation(R);
      if (!Loc)
        // Conservatively assume aliasing for memory operations without
        // location.
        return false;

      // Any result other than a definite NoAlias blocks the transform.
      if (ScopedNoAliasAAResult::alias(*Loc, MemLoc) != AliasResult::NoAlias)
        return false;
    }

    if (Block == LastBB)
      break;
  }
  return true;
}
232
/// Collect either replicated Loads or Stores grouped by their address SCEV.
/// Only recipes accepted by \p FilterFn and whose address has a computable
/// SCEV are collected. Each resulting group is sorted by dominance, with the
/// most-dominating recipe first.
template <unsigned Opcode>
static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
collectGroupedReplicateMemOps(
    VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
    function_ref<bool(VPReplicateRecipe *)> FilterFn) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  constexpr bool IsLoad = (Opcode == Instruction::Load);
  // Map from the address SCEV to all replicate recipes accessing it.
  SmallDenseMap<const SCEV *, SmallVector<VPReplicateRecipe *, 4>>
      RecipesByAddress;
  for (VPBlockBase *Block :
       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) {
    auto *VPBB = cast<VPBasicBlock>(Block);
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
        continue;

      // For loads, operand 0 is address; for stores, operand 1 is address.
      VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
      const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
      if (!isa<SCEVCouldNotCompute>(AddrSCEV))
        RecipesByAddress[AddrSCEV].push_back(RepR);
    }
  }
  auto Groups = to_vector(RecipesByAddress.values());
  VPDominatorTree VPDT(Plan);
  for (auto &Group : Groups) {
    // Sort mem ops by dominance order, with earliest (most dominating) first.
    stable_sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
      return VPDT.properlyDominates(A, B);
    });
  }
  return Groups;
}
269
270/// Return true if we do not know how to (mechanically) hoist or sink \p R out
271/// of a loop region.
272static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
273 // Assumes don't alias anything or throw; as long as they're guaranteed to
274 // execute, they're safe to hoist.
275 if (match(V: &R, P: m_Intrinsic<Intrinsic::assume>()))
276 return false;
277
278 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
279 // memory location is not modified in the vector loop.
280 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
281 return true;
282
283 // Allocas cannot be hoisted.
284 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
285 return RepR && RepR->getOpcode() == Instruction::Alloca;
286}
287
/// Sink scalar-producing recipes (VPReplicateRecipes and
/// VPScalarIVStepsRecipes) into the replicate blocks where their results are
/// used, so the computation only happens on the predicated path. Returns true
/// if any recipe was moved.
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  bool Changed = false;

  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  // Queue Op's defining recipe as a candidate for sinking into SinkTo, if it
  // is a kind of recipe we know how to sink and is not already in SinkTo.
  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
                                        VPBasicBlock *SinkTo, VPValue *Op) {
    auto *Candidate =
        dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
    if (!Candidate)
      return;

    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
    // for now.
    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
      return;

    if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
      return;

    // Single-scalar replicate recipes are only sinkable when the plan has
    // scalar VFs only (see the duplication handling below).
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
      if (!ScalarVFOnly && RepR->isSingleScalar())
        return;

    WorkList.insert({SinkTo, Candidate});
  };

  // First, collect the operands of all recipes in replicate blocks as seeds for
  // sinking.
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
    if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB)
      for (VPValue *Op : Recipe.operands())
        InsertIfValidSinkCandidate(VPBB, Op);
  }

  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  // Note: WorkList may grow while iterating, so it is indexed, not ranged.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];

    // All recipe users of SinkCandidate must be in the same block SinkTo or all
    // users outside of SinkTo must only use the first lane of SinkCandidate. In
    // the latter case, we need to duplicate SinkCandidate.
    auto UsersOutsideSinkTo =
        make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
          return cast<VPRecipeBase>(U)->getParent() != SinkTo;
        });
    if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
          return !U->usesFirstLaneOnly(SinkCandidate);
        }))
      continue;
    bool NeedsDuplicating = !UsersOutsideSinkTo.empty();

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      VPSingleDefRecipe *Clone;
      if (auto *SinkCandidateRepR =
              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *I = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *SinkCandidateRepR,
                                      *SinkCandidateRepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      // The clone stays outside SinkTo and serves the first-lane-only users;
      // the original recipe is sunk below.
      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    // Sinking the candidate may enable sinking its own operands in turn.
    for (VPValue *Op : SinkCandidate->operands())
      InsertIfValidSinkCandidate(SinkTo, Op);
    Changed = true;
  }
  return Changed;
}
378
379/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
380/// the mask.
381static VPValue *getPredicatedMask(VPRegionBlock *R) {
382 auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry());
383 if (!EntryBB || EntryBB->size() != 1 ||
384 !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin()))
385 return nullptr;
386
387 return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0);
388}
389
390/// If \p R is a triangle region, return the 'then' block of the triangle.
391static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
392 auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry());
393 if (EntryBB->getNumSuccessors() != 2)
394 return nullptr;
395
396 auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]);
397 auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]);
398 if (!Succ0 || !Succ1)
399 return nullptr;
400
401 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
402 return nullptr;
403 if (Succ0->getSingleSuccessor() == Succ1)
404 return Succ0;
405 if (Succ1->getSingleSuccessor() == Succ0)
406 return Succ1;
407 return nullptr;
408}
409
// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock. Returns true if any regions were merged.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks to process front. This is to avoid
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    // Both regions must be predicated by the same mask.
    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both region must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    // Region1 may already have been merged into an earlier worklist entry.
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      // Users in Then2 can use the unmerged value directly, since they now
      // execute under the same predicate.
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &R :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      R.eraseFromParent();

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}
501
/// Build a triangular if-then replicate region around predicated replicate
/// recipe \p PredRecipe: an entry block branching on the recipe's mask, a
/// "then" block containing the now-unmasked replicate recipe, and a continue
/// block holding a VPPredInstPHIRecipe that merges the result (only created
/// if the recipe has users). The original recipe is erased; the new region is
/// returned, not yet connected into the CFG.
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(
      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region. drop_end removes the mask operand.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
      PredRecipe->getDebugLoc());
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  // Only create the merge phi when the result is actually used.
  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region =
      Plan.createReplicateRegion(Entry, Exiting, RegionName);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}
544
/// Wrap each predicated VPReplicateRecipe in its own if-then replicate
/// region, splitting the containing block at the recipe.
static void addReplicateRegions(VPlan &Plan) {
  // Collect the recipes first to avoid invalidating iterators while
  // splitting blocks below.
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    // Name the continuation block after the original IR basic block.
    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    // If the split block took over as the parent region's exiting block,
    // update the parent region accordingly.
    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}
574
/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor. Returns true if any blocks were
/// merged.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    // Append all recipes to the predecessor, then rewire the CFG around VPBB.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    // If VPBB was its region's exiting block, the predecessor takes over.
    auto *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}
610
611void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
612 // Convert masked VPReplicateRecipes to if-then region blocks.
613 addReplicateRegions(Plan);
614
615 bool ShouldSimplify = true;
616 while (ShouldSimplify) {
617 ShouldSimplify = sinkScalarOperands(Plan);
618 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
619 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
620 }
621}
622
/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      // NOTE(review): if no recipe matches IRCast, FindMyCast becomes null and
      // the dereference below would crash — presumably the induction
      // descriptor guarantees each cast has a recipe; confirm that invariant.
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}
659
/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  // Find the first VPWidenCanonicalIVRecipe among the canonical IV's users.
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  // Look for a widened induction in the loop header that starts at the
  // canonical IV's start value and steps by one (i.e. is canonical).
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (Plan.hasScalarVFOnly() ||
        !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      // We are replacing a wide canonical iv with a suitable wide induction.
      // This is used to compute header mask, hence all lanes will be used and
      // we need to drop wrap flags only applying to lanes guranteed to execute
      // in the original scalar loop.
      WidenOriginalIV->dropPoisonGeneratingFlags();
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}
700
701/// Returns true if \p R is dead and can be removed.
702static bool isDeadRecipe(VPRecipeBase &R) {
703 // Do remove conditional assume instructions as their conditions may be
704 // flattened.
705 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
706 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
707 match(V: RepR, P: m_Intrinsic<Intrinsic::assume>());
708 if (IsConditionalAssume)
709 return true;
710
711 if (R.mayHaveSideEffects())
712 return false;
713
714 // Recipe is dead if no user keeps the recipe alive.
715 return all_of(Range: R.definedValues(),
716 P: [](VPValue *V) { return V->getNumUsers() == 0; });
717}
718
/// Erase recipes whose results are unused and that have no side effects,
/// including dead VPPhi <-> update cycles.
void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_post_order_deep(Plan.getEntry()))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R)) {
        R.eraseFromParent();
        continue;
      }

      // Check if R is a dead VPPhi <-> update cycle and remove it.
      // Shape: a two-operand phi whose only user is the recipe defining its
      // second (backedge) incoming value, and that recipe's only user is the
      // phi — the pair computes nothing observable.
      auto *PhiR = dyn_cast<VPPhi>(&R);
      if (!PhiR || PhiR->getNumOperands() != 2)
        continue;
      VPUser *PhiUser = PhiR->getSingleUser();
      if (!PhiUser)
        continue;
      VPValue *Incoming = PhiR->getOperand(1);
      if (PhiUser != Incoming->getDefiningRecipe() ||
          Incoming->getNumUsers() != 1)
        continue;
      // Redirect any remaining uses to the start value, then erase both.
      PhiR->replaceAllUsesWith(PhiR->getOperand(0));
      PhiR->eraseFromParent();
      Incoming->getDefiningRecipe()->eraseFromParent();
    }
  }
}
747
/// Create a derived IV (based on the canonical IV) described by \p Kind,
/// \p StartV and \p Step — truncating the IV and/or the step to \p TruncI's
/// type if needed — and expand it into per-lane scalar steps using
/// \p InductionOpcode (with \p FPBinOp set for FP inductions). Returns the
/// scalar-steps recipe; new recipes are created via \p Builder.
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPIRValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  VPTypeAnalysis TypeInfo(Plan);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    // The step is loop-invariant, so its truncation is materialized once in
    // the vector preheader rather than in the loop body.
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}
787
788static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
789 SetVector<VPUser *> Users(llvm::from_range, V->users());
790 for (unsigned I = 0; I != Users.size(); ++I) {
791 VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users[I]);
792 if (isa<VPHeaderPHIRecipe>(Val: Cur))
793 continue;
794 for (VPValue *V : Cur->definedValues())
795 Users.insert_range(R: V->users());
796 }
797 return Users.takeVector();
798}
799
800/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
801/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
802/// generates scalar values.
803static VPValue *
804scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
805 VPlan &Plan, VPBuilder &Builder) {
806 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
807 VPIRValue *StartV = Plan.getZero(Ty: ID.getStep()->getType());
808 VPValue *StepV = PtrIV->getOperand(N: 1);
809 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
810 Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
811 TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder);
812
813 return Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps,
814 DL: PtrIV->getDebugLoc(), Name: "next.gep");
815}
816
/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while other require scalars, the scalar uses need to extract
/// the scalars from the generated vectors (Note that this is different to how
/// int/fp inductions are handled). Legalize extract-from-ends using uniform
/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
/// the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the needs of
/// vector extracts.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  // New recipes are inserted at the top of the header block, right after the
  // phi recipes.
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(V: PhiR);
    // Visit users bottom-up (reverse discovery order), so a user is narrowed
    // before the recipes feeding it are considered.
    for (VPUser *U : reverse(C&: Users)) {
      auto *Def = dyn_cast<VPRecipeWithIRFlags>(Val: U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Only narrow when the recipe produces a single scalar value or when
      // all users only use its first lane; otherwise other lanes may be
      // needed.
      if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      // Replace the recipe with an unpredicated uniform replicate recipe
      // computing the same value once.
      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true,
                                          /*Mask*/ nullptr, /*Flags*/ *Def);
      Clone->insertAfter(InsertPos: Def);
      Def->replaceAllUsesWith(New: Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) {
      if (!Plan.hasScalarVFOnly() &&
          !PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF()))
        continue;

      VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
      PtrIV->replaceAllUsesWith(New: PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
    if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) {
          return U->usesScalars(Op: WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(),
        FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(),
        DL: WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs) {
      assert(!Plan.hasScalableVF() &&
             "plans containing a scalar VF cannot also include scalable VFs");
      // With a scalar VF every use is scalar, so all users can switch over.
      WideIV->replaceAllUsesWith(New: Steps);
    } else {
      // For vector VFs, only redirect users that use scalar values (or, for
      // scalable VFs, only the first lane).
      bool HasScalableVF = Plan.hasScalableVF();
      WideIV->replaceUsesWithIf(New: Steps,
                                ShouldReplace: [WideIV, HasScalableVF](VPUser &U, unsigned) {
                                  if (HasScalableVF)
                                    return U.usesFirstLaneOnly(Op: WideIV);
                                  return U.usesScalars(Op: WideIV);
                                });
    }
  }
}
906
/// Check if \p VPV is an untruncated wide induction, either before or after the
/// increment. If so return the header IV (before the increment), otherwise
/// return null.
static VPWidenInductionRecipe *
getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for exit
    // users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  // The wide IV may appear as either operand of the increment.
  WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 1));
  if (!WideIV)
    return nullptr;

  // Returns true if VPV advances WideIV by exactly its step, using the opcode
  // recorded in the induction descriptor.
  auto IsWideIVInc = [&]() {
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(V: VPV, P: m_c_Add(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
    case Instruction::FAdd:
      return match(V: VPV, P: m_c_FAdd(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
    case Instruction::FSub:
      return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV),
                                                   Op1: m_Specific(VPV: IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if Step == -1
      // * IVStep.
      VPValue *Step;
      if (!match(V: VPV, P: m_Sub(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))))
        return false;
      // Compare via SCEV, as the step VPValues may differ syntactically while
      // still being negations of each other.
      const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(V: IVStep, PSE);
      const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(V: Step, PSE);
      ScalarEvolution &SE = *PSE.getSE();
      return !isa<SCEVCouldNotCompute>(Val: IVStepSCEV) &&
             !isa<SCEVCouldNotCompute>(Val: StepSCEV) &&
             IVStepSCEV == SE.getNegativeSCEV(V: StepSCEV);
    }
    default:
      // Pointer inductions are incremented via a GEP of the IV by its step.
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV),
                                       Op1: m_Specific(VPV: WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}
965
/// Attempts to optimize the induction variable exit values for users in the
/// early exit block. Returns the replacement value (computed from the
/// canonical IV and the first active lane of the exit mask), or nullptr if
/// \p Op is not an optimizable extract of a wide induction.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op,
                                               PredicatedScalarEvolution &PSE) {
  // Only handle extracting the lane identified by the first active lane of a
  // mask from an incoming value.
  VPValue *Incoming, *Mask;
  if (!match(V: Op, P: m_ExtractLane(Op0: m_FirstActiveLane(Op0: m_VPValue(V&: Mask)),
                             Op1: m_VPValue(V&: Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
  if (!WideIV)
    return nullptr;

  // Truncated IVs are handled elsewhere; bail out.
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *CanonicalIV = LoopRegion->getCanonicalIV();
  Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
  VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB));

  DebugLoc DL = cast<VPInstruction>(Val: Op)->getDebugLoc();
  // Final index = canonical IV + first active lane (zero-extended/truncated
  // to the canonical IV's type).
  VPValue *FirstActiveLane =
      B.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: Mask, DL);
  Type *FirstActiveLaneType = TypeInfo.inferScalarType(V: FirstActiveLane);
  FirstActiveLane = B.createScalarZExtOrTrunc(Op: FirstActiveLane, ResultTy: CanonicalIVType,
                                              SrcTy: FirstActiveLaneType, DL);
  VPValue *EndValue = B.createAdd(LHS: CanonicalIV, RHS: FirstActiveLane, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // add the step.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getConstantInt(Ty: CanonicalIVType, Val: 1);
    EndValue = B.createAdd(LHS: EndValue, RHS: One, DL);
  }

  // For non-canonical inductions, map the canonical index onto the original
  // induction's sequence via a derived IV.
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPIRValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        Start, Current: EndValue, Step);
  }

  return EndValue;
}
1019
1020/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1021/// VPDerivedIVRecipe for non-canonical inductions.
1022static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
1023 VPBuilder &VectorPHBuilder,
1024 VPTypeAnalysis &TypeInfo,
1025 VPValue *VectorTC) {
1026 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
1027 // Truncated wide inductions resume from the last lane of their vector value
1028 // in the last vector iteration which is handled elsewhere.
1029 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1030 return nullptr;
1031
1032 VPIRValue *Start = WideIV->getStartValue();
1033 VPValue *Step = WideIV->getStepValue();
1034 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1035 VPValue *EndValue = VectorTC;
1036 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1037 EndValue = VectorPHBuilder.createDerivedIV(
1038 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
1039 Start, Current: VectorTC, Step);
1040 }
1041
1042 // EndValue is derived from the vector trip count (which has the same type as
1043 // the widest induction) and thus may be wider than the induction here.
1044 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV);
1045 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) {
1046 EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
1047 ResultTy: ScalarTypeOfWideIV,
1048 DL: WideIV->getDebugLoc());
1049 }
1050
1051 return EndValue;
1052}
1053
/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop. Returns the
/// replacement value based on the precomputed end value in \p EndValues, or
/// nullptr if \p Op is not an optimizable extract of a wide induction.
static VPValue *optimizeLatchExitInductionUser(
    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
    DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
  // Only handle extracting the last lane of the last part of a wide IV (or
  // its increment).
  VPValue *Incoming;
  VPWidenInductionRecipe *WideIV = nullptr;
  if (match(V: Op, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Incoming))))
    WideIV = getOptimizableIVOf(VPV: Incoming, PSE);

  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(Val: WideIV);
  assert(EndValue && "Must have computed the end value up front");

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we don't
  // need to subtract the step.
  if (Incoming != WideIV)
    return EndValue;

  // Otherwise, subtract the step from the EndValue. How to subtract depends on
  // the induction's scalar type.
  VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)->getTerminator());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(V: WideIV);
  if (ScalarTy->isIntegerTy())
    return B.createSub(LHS: EndValue, RHS: Step, DL: DebugLoc::getUnknown(), Name: "ind.escape");
  if (ScalarTy->isPointerTy()) {
    // Pointers are stepped back via a ptradd with the negated step (0 - Step).
    Type *StepTy = TypeInfo.inferScalarType(V: Step);
    auto *Zero = Plan.getZero(Ty: StepTy);
    return B.createPtrAdd(Ptr: EndValue, Offset: B.createSub(LHS: Zero, RHS: Step),
                          DL: DebugLoc::getUnknown(), Name: "ind.escape");
  }
  if (ScalarTy->isFloatingPointTy()) {
    // Invert the induction's FP operation (FAdd <-> FSub), preserving its
    // fast-math flags.
    const auto &ID = WideIV->getInductionDescriptor();
    return B.createNaryOp(
        Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()});
  }
  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}
1099
// Optimize live-out users of wide inductions: precompute per-induction end
// values in the vector preheader, fold exiting-IV-value recipes in the middle
// block into those end values, and rewrite exit-block phi operands coming from
// the middle block (latch exits) or other predecessors (early exits).
void VPlanTransforms::optimizeInductionLiveOutUsers(
    VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
  // Compute end values for all inductions.
  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *VectorPH = cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor());
  VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
  DenseMap<VPValue *, VPValue *> EndValues;
  // When folding the tail, the vector loop covers the full trip count;
  // otherwise it only covers the vector trip count.
  VPValue *ResumeTC =
      FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
  for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
    if (!WideIV)
      continue;
    if (VPValue *EndValue = tryToComputeEndValueForInduction(
            WideIV, VectorPHBuilder, TypeInfo, VectorTC: ResumeTC))
      EndValues[WideIV] = EndValue;
  }

  // Replace exiting-IV-value recipes in the middle block with the precomputed
  // end values.
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  for (VPRecipeBase &R : make_early_inc_range(Range&: *MiddleVPBB)) {
    VPValue *Op;
    if (!match(V: &R, P: m_ExitingIVValue(Op0: m_VPValue(V&: Op))))
      continue;
    auto *WideIV = cast<VPWidenInductionRecipe>(Val: Op);
    if (VPValue *EndValue = EndValues.lookup(Val: WideIV)) {
      R.getVPSingleValue()->replaceAllUsesWith(New: EndValue);
      R.eraseFromParent();
    }
  }

  // Then, optimize exit block users.
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(Val: &R);

      // Handle each incoming value separately: values coming via the middle
      // block are latch exits; all others are early exits.
      for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) {
        VPValue *Escape = nullptr;
        if (PredVPBB == MiddleVPBB)
          Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  Op: ExitIRI->getOperand(N: Idx),
                                                  EndValues, PSE);
        else
          Escape = optimizeEarlyExitInductionUser(
              Plan, TypeInfo, PredVPBB, Op: ExitIRI->getOperand(N: Idx), PSE);
        if (Escape)
          ExitIRI->setOperand(I: Idx, New: Escape);
      }
    }
  }
}
1151
1152/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1153/// them with already existing recipes expanding the same SCEV expression.
1154static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1155 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1156
1157 for (VPRecipeBase &R :
1158 make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) {
1159 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
1160 if (!ExpR)
1161 continue;
1162
1163 const auto &[V, Inserted] = SCEV2VPV.try_emplace(Key: ExpR->getSCEV(), Args&: ExpR);
1164 if (Inserted)
1165 continue;
1166 ExpR->replaceAllUsesWith(New: V->second);
1167 ExpR->eraseFromParent();
1168 }
1169}
1170
1171static void recursivelyDeleteDeadRecipes(VPValue *V) {
1172 SmallVector<VPValue *> WorkList;
1173 SmallPtrSet<VPValue *, 8> Seen;
1174 WorkList.push_back(Elt: V);
1175
1176 while (!WorkList.empty()) {
1177 VPValue *Cur = WorkList.pop_back_val();
1178 if (!Seen.insert(Ptr: Cur).second)
1179 continue;
1180 VPRecipeBase *R = Cur->getDefiningRecipe();
1181 if (!R)
1182 continue;
1183 if (!isDeadRecipe(R&: *R))
1184 continue;
1185 append_range(C&: WorkList, R: R->operands());
1186 R->eraseFromParent();
1187 }
1188}
1189
1190/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1191/// Returns an optional pair, where the first element indicates whether it is
1192/// an intrinsic ID.
1193static std::optional<std::pair<bool, unsigned>>
1194getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1195 return TypeSwitch<const VPSingleDefRecipe *,
1196 std::optional<std::pair<bool, unsigned>>>(R)
1197 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPWidenGEPRecipe,
1198 VPReplicateRecipe>(
1199 caseFn: [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1200 .Case(caseFn: [](const VPWidenIntrinsicRecipe *I) {
1201 return std::make_pair(x: true, y: I->getVectorIntrinsicID());
1202 })
1203 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>(caseFn: [](auto *I) {
1204 // For recipes that do not directly map to LLVM IR instructions,
1205 // assign opcodes after the last VPInstruction opcode (which is also
1206 // after the last IR Instruction opcode), based on the VPRecipeID.
1207 return std::make_pair(false,
1208 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1209 })
1210 .Default(defaultFn: [](auto *) { return std::nullopt; });
1211}
1212
/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
/// Operands are foldable live-ins.
static VPIRValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
                                   ArrayRef<VPValue *> Operands,
                                   const DataLayout &DL,
                                   VPTypeAnalysis &TypeInfo) {
  auto OpcodeOrIID = getOpcodeOrIntrinsicID(R: &R);
  if (!OpcodeOrIID)
    return nullptr;

  // All operands must be live-in IR values with an underlying Value;
  // otherwise there is nothing concrete to fold.
  SmallVector<Value *, 4> Ops;
  for (VPValue *Op : Operands) {
    if (!match(V: Op, P: m_LiveIn()))
      return nullptr;
    Value *V = Op->getUnderlyingValue();
    if (!V)
      return nullptr;
    Ops.push_back(Elt: V);
  }

  // Attempt the fold on the underlying IR values; returns nullptr when the
  // opcode/intrinsic is unhandled or the folder cannot simplify.
  auto FoldToIRValue = [&]() -> Value * {
    InstSimplifyFolder Folder(DL);
    if (OpcodeOrIID->first) {
      // Intrinsic ID: only binary intrinsics are handled.
      if (R.getNumOperands() != 2)
        return nullptr;
      unsigned ID = OpcodeOrIID->second;
      return Folder.FoldBinaryIntrinsic(ID, LHS: Ops[0], RHS: Ops[1],
                                        Ty: TypeInfo.inferScalarType(V: &R));
    }
    unsigned Opcode = OpcodeOrIID->second;
    if (Instruction::isBinaryOp(Opcode))
      return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode),
                              LHS: Ops[0], RHS: Ops[1]);
    if (Instruction::isCast(Opcode))
      return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops[0],
                             DestTy: TypeInfo.inferScalarType(V: R.getVPSingleValue()));
    switch (Opcode) {
    case VPInstruction::LogicalAnd:
      // LogicalAnd folds as select(a, b, false).
      return Folder.FoldSelect(C: Ops[0], True: Ops[1],
                               False: ConstantInt::getNullValue(Ty: Ops[1]->getType()));
    case VPInstruction::Not:
      // Not folds as xor with all-ones.
      return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops[0],
                              RHS: Constant::getAllOnesValue(Ty: Ops[0]->getType()));
    case Instruction::Select:
      return Folder.FoldSelect(C: Ops[0], True: Ops[1], False: Ops[2]);
    case Instruction::ICmp:
    case Instruction::FCmp:
      return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val&: R).getPredicate(), LHS: Ops[0],
                            RHS: Ops[1]);
    case Instruction::GetElementPtr: {
      // Reuse the source element type and no-wrap flags of the original GEP.
      auto &RFlags = cast<VPRecipeWithIRFlags>(Val&: R);
      auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr());
      return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0],
                            IdxList: drop_begin(RangeOrContainer&: Ops), NW: RFlags.getGEPNoWrapFlags());
    }
    case VPInstruction::PtrAdd:
    case VPInstruction::WidePtrAdd:
      // PtrAdd folds as an i8 GEP.
      return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: TypeInfo.getContext()),
                            Ptr: Ops[0], IdxList: Ops[1],
                            NW: cast<VPRecipeWithIRFlags>(Val&: R).getGEPNoWrapFlags());
    // An extract of a live-in is an extract of a broadcast, so return the
    // broadcasted element.
    case Instruction::ExtractElement:
      assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
      return Ops[0];
    }
    return nullptr;
  };

  // Register the folded constant/value as a live-in of the plan.
  if (Value *V = FoldToIRValue())
    return R.getParent()->getPlan()->getOrAddLiveIn(V);
  return nullptr;
}
1287
1288/// Try to simplify VPSingleDefRecipe \p Def.
1289static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1290 VPlan *Plan = Def->getParent()->getPlan();
1291
1292 // Simplification of live-in IR values for SingleDef recipes using
1293 // InstSimplifyFolder.
1294 const DataLayout &DL = Plan->getDataLayout();
1295 if (VPValue *V = tryToFoldLiveIns(R&: *Def, Operands: Def->operands(), DL, TypeInfo))
1296 return Def->replaceAllUsesWith(New: V);
1297
1298 // Fold PredPHI LiveIn -> LiveIn.
1299 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: Def)) {
1300 VPValue *Op = PredPHI->getOperand(N: 0);
1301 if (isa<VPIRValue>(Val: Op))
1302 PredPHI->replaceAllUsesWith(New: Op);
1303 }
1304
1305 VPBuilder Builder(Def);
1306
1307 // Avoid replacing VPInstructions with underlying values with new
  // VPInstructions, as we would fail to create widen/replicate recipes from the
1309 // new VPInstructions without an underlying value, and miss out on some
1310 // transformations that only apply to widened/replicated recipes later, by
1311 // doing so.
1312 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1313 // VPInstructions without underlying values, as those will get skipped during
1314 // cost computation.
1315 bool CanCreateNewRecipe =
1316 !isa<VPInstruction>(Val: Def) || !Def->getUnderlyingValue();
1317
1318 VPValue *A;
1319 if (match(R: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) {
1320 Type *TruncTy = TypeInfo.inferScalarType(V: Def);
1321 Type *ATy = TypeInfo.inferScalarType(V: A);
1322 if (TruncTy == ATy) {
1323 Def->replaceAllUsesWith(New: A);
1324 } else {
1325 // Don't replace a non-widened cast recipe with a widened cast.
1326 if (!isa<VPWidenCastRecipe>(Val: Def))
1327 return;
1328 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1329
1330 unsigned ExtOpcode = match(V: Def->getOperand(N: 0), P: m_SExt(Op0: m_VPValue()))
1331 ? Instruction::SExt
1332 : Instruction::ZExt;
1333 auto *Ext = Builder.createWidenCast(Opcode: Instruction::CastOps(ExtOpcode), Op: A,
1334 ResultTy: TruncTy);
1335 if (auto *UnderlyingExt = Def->getOperand(N: 0)->getUnderlyingValue()) {
1336 // UnderlyingExt has distinct return type, used to retain legacy cost.
1337 Ext->setUnderlyingValue(UnderlyingExt);
1338 }
1339 Def->replaceAllUsesWith(New: Ext);
1340 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1341 auto *Trunc = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: A, ResultTy: TruncTy);
1342 Def->replaceAllUsesWith(New: Trunc);
1343 }
1344 }
1345#ifndef NDEBUG
  // Verify that the cached type info for both A and its users is still
  // accurate by comparing it to freshly computed types.
1348 VPTypeAnalysis TypeInfo2(*Plan);
1349 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1350 for (VPUser *U : A->users()) {
1351 auto *R = cast<VPRecipeBase>(U);
1352 for (VPValue *VPV : R->definedValues())
1353 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1354 }
1355#endif
1356 }
1357
1358 // Simplify (X && Y) | (X && !Y) -> X.
1359 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1360 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1361 // recipes to be visited during simplification.
1362 VPValue *X, *Y, *Z;
1363 if (match(R: Def,
1364 P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1365 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) {
1366 Def->replaceAllUsesWith(New: X);
1367 Def->eraseFromParent();
1368 return;
1369 }
1370
1371 // x | AllOnes -> AllOnes
1372 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes())))
1373 return Def->replaceAllUsesWith(
1374 New: Plan->getAllOnesValue(Ty: TypeInfo.inferScalarType(V: Def)));
1375
1376 // x | 0 -> x
1377 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1378 return Def->replaceAllUsesWith(New: X);
1379
1380 // x | !x -> AllOnes
1381 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X)))))
1382 return Def->replaceAllUsesWith(
1383 New: Plan->getAllOnesValue(Ty: TypeInfo.inferScalarType(V: Def)));
1384
1385 // x & 0 -> 0
1386 if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1387 return Def->replaceAllUsesWith(
1388 New: Plan->getZero(Ty: TypeInfo.inferScalarType(V: Def)));
1389
1390 // x & AllOnes -> x
1391 if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_AllOnes())))
1392 return Def->replaceAllUsesWith(New: X);
1393
1394 // x && false -> false
1395 if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_False())))
1396 return Def->replaceAllUsesWith(New: Plan->getFalse());
1397
1398 // x && true -> x
1399 if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_True())))
1400 return Def->replaceAllUsesWith(New: X);
1401
1402 // (x && y) | (x && z) -> x && (y | z)
1403 if (CanCreateNewRecipe &&
1404 match(R: Def, P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1405 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue(V&: Z)))) &&
1406 // Simplify only if one of the operands has one use to avoid creating an
1407 // extra recipe.
1408 (!Def->getOperand(N: 0)->hasMoreThanOneUniqueUser() ||
1409 !Def->getOperand(N: 1)->hasMoreThanOneUniqueUser()))
1410 return Def->replaceAllUsesWith(
1411 New: Builder.createLogicalAnd(LHS: X, RHS: Builder.createOr(LHS: Y, RHS: Z)));
1412
1413 // x && (x && y) -> x && y
1414 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
1415 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue()))))
1416 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1417
1418 // x && (y && x) -> x && y
1419 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
1420 Op1: m_LogicalAnd(Op0: m_VPValue(V&: Y), Op1: m_Deferred(V: X)))))
1421 return Def->replaceAllUsesWith(New: Builder.createLogicalAnd(LHS: X, RHS: Y));
1422
1423 // x && !x -> 0
1424 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X)))))
1425 return Def->replaceAllUsesWith(New: Plan->getFalse());
1426
1427 if (match(R: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X))))
1428 return Def->replaceAllUsesWith(New: X);
1429
1430 // select c, false, true -> not c
1431 VPValue *C;
1432 if (CanCreateNewRecipe &&
1433 match(R: Def, P: m_Select(Op0: m_VPValue(V&: C), Op1: m_False(), Op2: m_True())))
1434 return Def->replaceAllUsesWith(New: Builder.createNot(Operand: C));
1435
1436 // select !c, x, y -> select c, y, x
1437 if (match(R: Def, P: m_Select(Op0: m_Not(Op0: m_VPValue(V&: C)), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1438 Def->setOperand(I: 0, New: C);
1439 Def->setOperand(I: 1, New: Y);
1440 Def->setOperand(I: 2, New: X);
1441 return;
1442 }
1443
1444 if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1445 return Def->replaceAllUsesWith(New: A);
1446
1447 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_One())))
1448 return Def->replaceAllUsesWith(New: A);
1449
1450 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1451 return Def->replaceAllUsesWith(
1452 New: Plan->getZero(Ty: TypeInfo.inferScalarType(V: Def)));
1453
1454 const APInt *APC;
1455 if (CanCreateNewRecipe && match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) &&
1456 APC->isPowerOf2())
1457 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1458 Opcode: Instruction::Shl,
1459 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1460 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1461
1462 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1463 // not allowed in them.
1464 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1465 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1466 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1467 match(R: Def, P: m_UDiv(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) && APC->isPowerOf2())
1468 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1469 Opcode: Instruction::LShr,
1470 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1471 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1472
1473 if (match(R: Def, P: m_Not(Op0: m_VPValue(V&: A)))) {
1474 if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A))))
1475 return Def->replaceAllUsesWith(New: A);
1476
1477 // Try to fold Not into compares by adjusting the predicate in-place.
1478 CmpPredicate Pred;
1479 if (match(V: A, P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) {
1480 auto *Cmp = cast<VPRecipeWithIRFlags>(Val: A);
1481 if (all_of(Range: Cmp->users(),
1482 P: match_fn(P: m_CombineOr(
1483 L: m_Not(Op0: m_Specific(VPV: Cmp)),
1484 R: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(), Op2: m_VPValue()))))) {
1485 Cmp->setPredicate(CmpInst::getInversePredicate(pred: Pred));
1486 for (VPUser *U : to_vector(Range: Cmp->users())) {
1487 auto *R = cast<VPSingleDefRecipe>(Val: U);
1488 if (match(R, P: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1489 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1490 R->setOperand(I: 1, New: Y);
1491 R->setOperand(I: 2, New: X);
1492 } else {
1493 // not (cmp pred) -> cmp inv_pred
1494 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1495 R->replaceAllUsesWith(New: Cmp);
1496 }
1497 }
1498 // If Cmp doesn't have a debug location, use the one from the negation,
1499 // to preserve the location.
1500 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1501 Cmp->setDebugLoc(Def->getDebugLoc());
1502 }
1503 }
1504 }
1505
1506 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1507 // any-of (fcmp uno %A, %B), ...
1508 if (match(R: Def, P: m_AnyOf())) {
1509 SmallVector<VPValue *, 4> NewOps;
1510 VPRecipeBase *UnpairedCmp = nullptr;
1511 for (VPValue *Op : Def->operands()) {
1512 VPValue *X;
1513 if (Op->getNumUsers() > 1 ||
1514 !match(V: Op, P: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1515 Op1: m_Deferred(V: X)))) {
1516 NewOps.push_back(Elt: Op);
1517 } else if (!UnpairedCmp) {
1518 UnpairedCmp = Op->getDefiningRecipe();
1519 } else {
1520 NewOps.push_back(Elt: Builder.createFCmp(Pred: CmpInst::FCMP_UNO,
1521 A: UnpairedCmp->getOperand(N: 0), B: X));
1522 UnpairedCmp = nullptr;
1523 }
1524 }
1525
1526 if (UnpairedCmp)
1527 NewOps.push_back(Elt: UnpairedCmp->getVPSingleValue());
1528
1529 if (NewOps.size() < Def->getNumOperands()) {
1530 VPValue *NewAnyOf = Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: NewOps);
1531 return Def->replaceAllUsesWith(New: NewAnyOf);
1532 }
1533 }
1534
1535 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1536 // This is useful for fmax/fmin without fast-math flags, where we need to
1537 // check if any operand is NaN.
1538 if (CanCreateNewRecipe &&
1539 match(R: Def, P: m_BinaryOr(Op0: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1540 Op1: m_Deferred(V: X)),
1541 Op1: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: Y),
1542 Op1: m_Deferred(V: Y))))) {
1543 VPValue *NewCmp = Builder.createFCmp(Pred: CmpInst::FCMP_UNO, A: X, B: Y);
1544 return Def->replaceAllUsesWith(New: NewCmp);
1545 }
1546
  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1548 if ((match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_VPValue(V&: A), Op2: m_One())) ||
1549 match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_ZeroInt(), Op2: m_VPValue()))) &&
1550 TypeInfo.inferScalarType(V: Def->getOperand(N: 1)) ==
1551 TypeInfo.inferScalarType(V: Def))
1552 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1553
1554 if (match(R: Def, P: m_VPInstruction<VPInstruction::WideIVStep>(Ops: m_VPValue(V&: X),
1555 Ops: m_One()))) {
1556 Type *WideStepTy = TypeInfo.inferScalarType(V: Def);
1557 if (TypeInfo.inferScalarType(V: X) != WideStepTy)
1558 X = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy);
1559 Def->replaceAllUsesWith(New: X);
1560 return;
1561 }
1562
1563 // For i1 vp.merges produced by AnyOf reductions:
1564 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1565 if (match(R: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A),
1566 Op2: m_VPValue(V&: X), Op3: m_VPValue())) &&
1567 match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) &&
1568 TypeInfo.inferScalarType(V: Def)->isIntegerTy(Bitwidth: 1)) {
1569 Def->setOperand(I: 1, New: Def->getOperand(N: 0));
1570 Def->setOperand(I: 0, New: Y);
1571 return;
1572 }
1573
1574 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) {
1575 if (Phi->getOperand(N: 0) == Phi->getOperand(N: 1))
1576 Phi->replaceAllUsesWith(New: Phi->getOperand(N: 0));
1577 return;
1578 }
1579
1580 // Simplify MaskedCond with no block mask to its single operand.
1581 if (match(R: Def, P: m_VPInstruction<VPInstruction::MaskedCond>()) &&
1582 !cast<VPInstruction>(Val: Def)->isMasked())
1583 return Def->replaceAllUsesWith(New: Def->getOperand(N: 0));
1584
1585 // Look through ExtractLastLane.
1586 if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A)))) {
1587 if (match(V: A, P: m_BuildVector())) {
1588 auto *BuildVector = cast<VPInstruction>(Val: A);
1589 Def->replaceAllUsesWith(
1590 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 1));
1591 return;
1592 }
1593 if (Plan->hasScalarVFOnly())
1594 return Def->replaceAllUsesWith(New: A);
1595 }
1596
1597 // Look through ExtractPenultimateElement (BuildVector ....).
1598 if (match(R: Def, P: m_ExtractPenultimateElement(Op0: m_BuildVector()))) {
1599 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1600 Def->replaceAllUsesWith(
1601 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 2));
1602 return;
1603 }
1604
1605 uint64_t Idx;
1606 if (match(R: Def, P: m_ExtractElement(Op0: m_BuildVector(), Op1: m_ConstantInt(C&: Idx)))) {
1607 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1608 Def->replaceAllUsesWith(New: BuildVector->getOperand(N: Idx));
1609 return;
1610 }
1611
1612 if (match(R: Def, P: m_BuildVector()) && all_equal(Range: Def->operands())) {
1613 Def->replaceAllUsesWith(
1614 New: Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Def->getOperand(N: 0)));
1615 return;
1616 }
1617
1618 // Look through broadcast of single-scalar when used as select conditions; in
1619 // that case the scalar condition can be used directly.
1620 if (match(R: Def,
1621 P: m_Select(Op0: m_Broadcast(Op0: m_VPValue(V&: C)), Op1: m_VPValue(), Op2: m_VPValue()))) {
1622 assert(vputils::isSingleScalar(C) &&
1623 "broadcast operand must be single-scalar");
1624 Def->setOperand(I: 0, New: C);
1625 return;
1626 }
1627
1628 if (isa<VPPhi, VPWidenPHIRecipe>(Val: Def)) {
1629 if (Def->getNumOperands() == 1)
1630 Def->replaceAllUsesWith(New: Def->getOperand(N: 0));
1631 return;
1632 }
1633
1634 VPIRValue *IRV;
1635 if (Def->getNumOperands() == 1 &&
1636 match(R: Def, P: m_ComputeReductionResult(Op0: m_VPIRValue(V&: IRV))))
1637 return Def->replaceAllUsesWith(New: IRV);
1638
1639 // Some simplifications can only be applied after unrolling. Perform them
1640 // below.
1641 if (!Plan->isUnrolled())
1642 return;
1643
1644 // After unrolling, extract-lane may be used to extract values from multiple
1645 // scalar sources. Only simplify when extracting from a single scalar source.
1646 VPValue *LaneToExtract;
1647 if (match(R: Def, P: m_ExtractLane(Op0: m_VPValue(V&: LaneToExtract), Op1: m_VPValue(V&: A)))) {
1648 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1649 if (vputils::isSingleScalar(VPV: A))
1650 return Def->replaceAllUsesWith(New: A);
1651
1652 // Simplify extract-lane with single source to extract-element.
1653 Def->replaceAllUsesWith(New: Builder.createNaryOp(
1654 Opcode: Instruction::ExtractElement, Operands: {A, LaneToExtract}, DL: Def->getDebugLoc()));
1655 return;
1656 }
1657
1658 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1659 if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y))) && isa<VPIRValue>(Val: Y) &&
1660 isa<VPPhi>(Val: X)) {
1661 auto *Phi = cast<VPPhi>(Val: X);
1662 if (Phi->getOperand(N: 1) != Def && match(V: Phi->getOperand(N: 0), P: m_ZeroInt()) &&
1663 Phi->getSingleUser() == Def) {
1664 Phi->setOperand(I: 0, New: Y);
1665 Def->replaceAllUsesWith(New: Phi);
1666 return;
1667 }
1668 }
1669
1670 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1671 // just the pointer operand.
1672 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: Def))
1673 if (!VPR->getOffset() || match(V: VPR->getOffset(), P: m_ZeroInt()))
1674 return VPR->replaceAllUsesWith(New: VPR->getOperand(N: 0));
1675
1676 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1677 // the start index is zero and only the first lane 0 is demanded.
1678 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) {
1679 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Def: Steps)) {
1680 Steps->replaceAllUsesWith(New: Steps->getOperand(N: 0));
1681 return;
1682 }
1683 }
1684 // Simplify redundant ReductionStartVector recipes after unrolling.
1685 VPValue *StartV;
1686 if (match(R: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>(
1687 Ops: m_VPValue(V&: StartV), Ops: m_VPValue(), Ops: m_VPValue()))) {
1688 Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) {
1689 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U);
1690 return PhiR && PhiR->isInLoop();
1691 });
1692 return;
1693 }
1694
1695 if (match(R: Def, P: m_ExtractLastLane(Op0: m_Broadcast(Op0: m_VPValue(V&: A))))) {
1696 Def->replaceAllUsesWith(New: A);
1697 return;
1698 }
1699
1700 if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A))) &&
1701 ((isa<VPInstruction>(Val: A) && vputils::isSingleScalar(VPV: A)) ||
1702 (isa<VPReplicateRecipe>(Val: A) &&
1703 cast<VPReplicateRecipe>(Val: A)->isSingleScalar())) &&
1704 all_of(Range: A->users(),
1705 P: [Def, A](VPUser *U) { return U->usesScalars(Op: A) || Def == U; })) {
1706 return Def->replaceAllUsesWith(New: A);
1707 }
1708
1709 if (Plan->getConcreteUF() == 1 && match(R: Def, P: m_ExtractLastPart(Op0: m_VPValue(V&: A))))
1710 return Def->replaceAllUsesWith(New: A);
1711}
1712
1713void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1714 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1715 Plan.getEntry());
1716 VPTypeAnalysis TypeInfo(Plan);
1717 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
1718 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
1719 if (auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R))
1720 simplifyRecipe(Def, TypeInfo);
1721 }
1722}
1723
1724/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1725/// header mask to be simplified further when tail folding, e.g. in
1726/// optimizeEVLMasks.
1727static void reassociateHeaderMask(VPlan &Plan) {
1728 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1729 if (!HeaderMask)
1730 return;
1731
1732 SmallVector<VPUser *> Worklist;
1733 for (VPUser *U : HeaderMask->users())
1734 if (match(U, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue())))
1735 append_range(C&: Worklist, R: cast<VPSingleDefRecipe>(Val: U)->users());
1736
1737 while (!Worklist.empty()) {
1738 auto *R = dyn_cast<VPSingleDefRecipe>(Val: Worklist.pop_back_val());
1739 VPValue *X, *Y;
1740 if (!R || !match(R, P: m_LogicalAnd(
1741 Op0: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: X)),
1742 Op1: m_VPValue(V&: Y))))
1743 continue;
1744 append_range(C&: Worklist, R: R->users());
1745 VPBuilder Builder(R);
1746 R->replaceAllUsesWith(
1747 New: Builder.createLogicalAnd(LHS: HeaderMask, RHS: Builder.createLogicalAnd(LHS: X, RHS: Y)));
1748 }
1749}
1750
/// Try to narrow wide recipes (VPWidenRecipe, VPWidenGEPRecipe,
/// VPReplicateRecipe, VPWidenStoreRecipe) that produce the same value for all
/// lanes to cheaper single-scalar recipes.
static void narrowToSingleScalarRecipes(VPlan &Plan) {
  // Nothing to narrow if only scalar VFs are possible.
  if (Plan.hasScalarVFOnly())
    return;

  // Try to narrow wide and replicating recipes to single scalar recipes,
  // based on VPlan analysis. Only process blocks in the loop region for now,
  // without traversing into nested regions, as recipes in replicate regions
  // cannot be converted yet.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    // Visit recipes bottom-up, so users are processed before the recipes
    // defining their operands.
    for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
      // Only these recipe kinds are candidates for narrowing.
      if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe,
               VPWidenStoreRecipe>(Val: &R))
        continue;
      auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
      // Already-single-scalar replicates need no narrowing; predicated ones
      // are not handled here.
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      // Convert an unmasked scatter with a uniform address into
      // extract-last-lane + scalar store.
      // TODO: Add a profitability check comparing the cost of a scatter vs.
      // extract + scalar store.
      auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(Val: &R);
      if (WidenStoreR && vputils::isSingleScalar(VPV: WidenStoreR->getAddr()) &&
          !WidenStoreR->isConsecutive()) {
        assert(!WidenStoreR->isReverse() &&
               "Not consecutive memory recipes shouldn't be reversed");
        VPValue *Mask = WidenStoreR->getMask();

        // Only convert the scatter to a scalar store if it is unmasked.
        // TODO: Support converting scatter masked by the header mask to scalar
        // store.
        if (Mask)
          continue;

        // Extract the last lane of the stored value (operand 1) and store
        // just that one value to the uniform address.
        auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane,
                                          {WidenStoreR->getOperand(N: 1)});
        Extract->insertBefore(InsertPos: WidenStoreR);

        // TODO: Sink the scalar store recipe to middle block if possible.
        auto *ScalarStore = new VPReplicateRecipe(
            &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
            true /*IsSingleScalar*/, nullptr /*Mask*/, {},
            *WidenStoreR /*Metadata*/);
        ScalarStore->insertBefore(InsertPos: WidenStoreR);
        WidenStoreR->eraseFromParent();
        continue;
      }

      auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(Val: &R);
      // Narrow a replicating store whose address (operand 1) is single-scalar:
      // extract the last part (if the stored value is uniform across VF/UF)
      // and the last lane of the stored value, and store it once.
      if (RepR && isa<StoreInst>(Val: RepR->getUnderlyingInstr()) &&
          vputils::isSingleScalar(VPV: RepR->getOperand(N: 1))) {
        auto *Clone = new VPReplicateRecipe(
            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
            *RepR /*Metadata*/, RepR->getDebugLoc());
        Clone->insertBefore(InsertPos: RepOrWidenR);
        VPBuilder Builder(Clone);
        VPValue *ExtractOp = Clone->getOperand(N: 0);
        if (vputils::isUniformAcrossVFsAndUFs(V: RepR->getOperand(N: 1)))
          ExtractOp =
              Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: ExtractOp);
        ExtractOp =
            Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: ExtractOp);
        Clone->setOperand(I: 0, New: ExtractOp);
        RepR->eraseFromParent();
        continue;
      }

      // Skip recipes that aren't single scalars.
      if (!RepOrWidenR || !vputils::isSingleScalar(VPV: RepOrWidenR))
        continue;

      // Predicate to check if a user of Op introduces extra broadcasts.
      auto IntroducesBCastOf = [](const VPValue *Op) {
        return [Op](const VPUser *U) {
          if (auto *VPI = dyn_cast<VPInstruction>(Val: U)) {
            // These extract opcodes never require a broadcast of Op.
            if (is_contained(Set: {VPInstruction::ExtractLastLane,
                              VPInstruction::ExtractLastPart,
                              VPInstruction::ExtractPenultimateElement},
                             Element: VPI->getOpcode()))
              return false;
          }
          return !U->usesScalars(Op);
        };
      };

      // Skip narrowing if it would introduce a broadcast of the result while
      // not saving a broadcast of any operand (i.e. no operand that would
      // otherwise need a broadcast has this recipe as its only
      // broadcast-inducing user).
      if (any_of(Range: RepOrWidenR->users(), P: IntroducesBCastOf(RepOrWidenR)) &&
          none_of(Range: RepOrWidenR->operands(), P: [&](VPValue *Op) {
            if (any_of(
                    Range: make_filter_range(Range: Op->users(), Pred: not_equal_to(Arg&: RepOrWidenR)),
                    P: IntroducesBCastOf(Op)))
              return false;
            // Non-constant live-ins require broadcasts, while constants do not
            // need explicit broadcasts.
            auto *IRV = dyn_cast<VPIRValue>(Val: Op);
            bool LiveInNeedsBroadcast = IRV && !isa<Constant>(Val: IRV->getValue());
            auto *OpR = dyn_cast<VPReplicateRecipe>(Val: Op);
            return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
          }))
        continue;

      // Replace the wide recipe with an equivalent single-scalar replicate
      // recipe and erase the original if it became dead.
      auto *Clone = new VPReplicateRecipe(
          RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
          true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
      Clone->insertBefore(InsertPos: RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(New: Clone);
      if (isDeadRecipe(R&: *RepOrWidenR))
        RepOrWidenR->eraseFromParent();
    }
  }
}
1863
1864/// Try to see if all of \p Blend's masks share a common value logically and'ed
1865/// and remove it from the masks.
1866static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1867 if (Blend->isNormalized())
1868 return;
1869 VPValue *CommonEdgeMask;
1870 if (!match(V: Blend->getMask(Idx: 0),
1871 P: m_LogicalAnd(Op0: m_VPValue(V&: CommonEdgeMask), Op1: m_VPValue())))
1872 return;
1873 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1874 if (!match(V: Blend->getMask(Idx: I),
1875 P: m_LogicalAnd(Op0: m_Specific(VPV: CommonEdgeMask), Op1: m_VPValue())))
1876 return;
1877 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1878 Blend->setMask(Idx: I, V: Blend->getMask(Idx: I)->getDefiningRecipe()->getOperand(N: 1));
1879}
1880
/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
static void simplifyBlends(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R);
      if (!Blend)
        continue;

      // Strip a mask value that is logically and'ed into all edge masks.
      removeCommonBlendMask(Blend);

      // Try to remove redundant blend recipes.
      // Collect the distinct incoming values that can actually be selected,
      // ignoring those whose mask is known false. For a normalized blend the
      // first incoming value has no mask and is always a candidate.
      SmallPtrSet<VPValue *, 4> UniqueValues;
      if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False()))
        UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0));
      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
        if (!match(V: Blend->getMask(Idx: I), P: m_False()))
          UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I));

      // A blend over a single distinct value is just that value.
      if (UniqueValues.size() == 1) {
        Blend->replaceAllUsesWith(New: *UniqueValues.begin());
        Blend->eraseFromParent();
        continue;
      }

      if (Blend->isNormalized())
        continue;

      // Normalize the blend so its first incoming value is used as the initial
      // value with the others blended into it.

      // Prefer as start value one whose mask is used only by this blend, so
      // the mask becomes dead after normalization.
      unsigned StartIndex = 0;
      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        // If a value's mask is used only by the blend then it can be deadcoded.
        // TODO: Find the most expensive mask that can be deadcoded, or a mask
        // that's used by multiple blends where it can be removed from them all.
        VPValue *Mask = Blend->getMask(Idx: I);
        if (Mask->getNumUsers() == 1 && !match(V: Mask, P: m_False())) {
          StartIndex = I;
          break;
        }
      }

      // Build the normalized operand list: the start value first, then each
      // remaining incoming value followed by its mask.
      SmallVector<VPValue *, 4> OperandsWithMask;
      OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex));

      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        if (I == StartIndex)
          continue;
        OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I));
        OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I));
      }

      auto *NewBlend =
          new VPBlendRecipe(cast_or_null<PHINode>(Val: Blend->getUnderlyingValue()),
                            OperandsWithMask, *Blend, Blend->getDebugLoc());
      NewBlend->insertBefore(InsertPos: &R);

      // The start value's mask was dropped; clean it up if it became dead.
      VPValue *DeadMask = Blend->getMask(Idx: StartIndex);
      Blend->replaceAllUsesWith(New: NewBlend);
      Blend->eraseFromParent();
      recursivelyDeleteDeadRecipes(V: DeadMask);

      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
      VPValue *NewMask;
      if (NewBlend->getNumOperands() == 3 &&
          match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) {
        VPValue *Inc0 = NewBlend->getOperand(N: 0);
        VPValue *Inc1 = NewBlend->getOperand(N: 1);
        VPValue *OldMask = NewBlend->getOperand(N: 2);
        NewBlend->setOperand(I: 0, New: Inc1);
        NewBlend->setOperand(I: 1, New: Inc0);
        NewBlend->setOperand(I: 2, New: NewMask);
        // The negation itself may now be dead; erase it if so.
        if (OldMask->getNumUsers() == 0)
          cast<VPInstruction>(Val: OldMask)->eraseFromParent();
      }
    }
  }
}
1961
/// Optimize the width of vector induction variables in \p Plan based on a known
/// constant Trip Count, \p BestVF and \p BestUF.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
                                                     ElementCount BestVF,
                                                     unsigned BestUF) {
  // Only proceed if we have not completely removed the vector region.
  if (!Plan.getVectorLoopRegion())
    return false;

  // Only handle fixed VFs with a compile-time constant trip count.
  const APInt *TC;
  if (!BestVF.isFixed() || !match(V: Plan.getTripCount(), P: m_APInt(C&: TC)))
    return false;

  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
  // and UF. Returns at least 8.
  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
    // Round TC up to a multiple of VF * UF, since the IV advances in steps of
    // that size; the widest value the IV reaches is AlignedTC - 1.
    APInt AlignedTC =
        Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align),
                                       RM: APInt::Rounding::UP);
    APInt MaxVal = AlignedTC - 1;
    return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8);
  };
  unsigned NewBitWidth =
      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);

  LLVMContext &Ctx = Plan.getContext();
  auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth);

  bool MadeChange = false;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);

    // Currently only handle canonical IVs as it is trivial to replace the start
    // and stop values, and we currently only perform the optimization when the
    // IV has a single use.
    if (!WideIV || !WideIV->isCanonical() ||
        WideIV->hasMoreThanOneUniqueUser() ||
        NewIVTy == WideIV->getScalarType())
      continue;

    // Currently only handle cases where the single user is a header-mask
    // comparison with the backedge-taken-count.
    VPUser *SingleUser = WideIV->getSingleUser();
    if (!SingleUser ||
        !match(U: SingleUser, P: m_ICmp(Op0: m_Specific(VPV: WideIV),
                                   Op1: m_Broadcast(Op0: m_Specific(
                                       VPV: Plan.getOrCreateBackedgeTakenCount())))))
      continue;

    // Update IV operands and comparison bound to use new narrower type.
    auto *NewStart = Plan.getZero(Ty: NewIVTy);
    WideIV->setStartValue(NewStart);
    auto *NewStep = Plan.getConstantInt(Ty: NewIVTy, Val: 1);
    WideIV->setStepValue(NewStep);

    // Truncate the backedge-taken count to the narrow IV type in the vector
    // preheader and use it as the new comparison bound.
    auto *NewBTC = new VPWidenCastRecipe(
        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
        nullptr, VPIRFlags::getDefaultFlags(Opcode: Instruction::Trunc));
    Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC);
    auto *Cmp = cast<VPInstruction>(Val: WideIV->getSingleUser());
    Cmp->setOperand(I: 1, New: NewBTC);

    MadeChange = true;
  }

  return MadeChange;
}
2031
2032/// Return true if \p Cond is known to be true for given \p BestVF and \p
2033/// BestUF.
2034static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
2035 ElementCount BestVF, unsigned BestUF,
2036 PredicatedScalarEvolution &PSE) {
2037 if (match(V: Cond, P: m_BinaryOr(Op0: m_VPValue(), Op1: m_VPValue())))
2038 return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
2039 &PSE](VPValue *C) {
2040 return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, PSE);
2041 });
2042
2043 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2044 if (!match(V: Cond, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_EQ,
2045 Op0: m_Specific(VPV: CanIV->getBackedgeValue()),
2046 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))))
2047 return false;
2048
2049 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2050 // count is not conveniently available as SCEV so far, so we compare directly
2051 // against the original trip count. This is stricter than necessary, as we
2052 // will only return true if the trip count == vector trip count.
2053 const SCEV *VectorTripCount =
2054 vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
2055 if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
2056 VectorTripCount = vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
2057 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2058 "Trip count SCEV must be computable");
2059 ScalarEvolution &SE = *PSE.getSE();
2060 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
2061 const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
2062 return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: VectorTripCount, RHS: C);
2063}
2064
/// Try to replace multiple active lane masks used for control flow with
/// a single, wide active lane mask instruction followed by multiple
/// extract subvector intrinsics. This applies to the active lane mask
/// instructions both in the loop and in the preheader.
/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
/// new extracts from the first active lane mask, which has its last
/// operand (multiplier) set to UF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
                                       unsigned UF) {
  // Requires the feature flag, a vector VF and interleaving (UF > 1).
  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
    return false;

  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();

  // Only apply when the latch terminator is BranchOnCond(Not(ActiveLaneMask)).
  using namespace llvm::VPlanPatternMatch;
  if (!match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
                        Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))))
    return false;

  auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
  LLVMContext &Ctx = Plan.getContext();

  // Emit one vector.extract per unroll part, each taking a slice of the wide
  // mask ALM at offset VF * Part, and record the extracts in Extracts.
  auto ExtractFromALM = [&](VPInstruction *ALM,
                            SmallVectorImpl<VPValue *> &Extracts) {
    DebugLoc DL = ALM->getDebugLoc();
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<VPValue *> Ops;
      Ops.append(IL: {ALM, Plan.getConstantInt(BitWidth: 64, Val: VF.getKnownMinValue() * Part)});
      auto *Ext =
          new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
                                     IntegerType::getInt1Ty(C&: Ctx), {}, {}, DL);
      Extracts[Part] = Ext;
      Ext->insertAfter(InsertPos: ALM);
    }
  };

  // Create a list of each active lane mask phi, ordered by unroll part.
  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
  for (VPRecipeBase &R : Header->phis()) {
    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(Val: &R);
    if (!Phi)
      continue;
    VPValue *Index = nullptr;
    match(V: Phi->getBackedgeValue(),
          P: m_ActiveLaneMask(Op0: m_VPValue(V&: Index), Op1: m_VPValue(), Op2: m_VPValue()));
    assert(Index && "Expected index from ActiveLaneMask instruction");

    // The unroll part is the constant multiplier of the per-part IV increment
    // feeding the mask's index operand.
    uint64_t Part;
    if (match(V: Index,
              P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                  Ops: m_VPValue(), Ops: m_Mul(Op0: m_VPValue(), Op1: m_ConstantInt(C&: Part)))))
      Phis[Part] = Phi;
    else {
      // Anything other than a CanonicalIVIncrementForPart is part 0
      assert(!match(
          Index,
          m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
      Phis[0] = Phi;
    }
  }

  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");

  // Part 0's incoming masks become the wide masks all parts extract from.
  auto *EntryALM = cast<VPInstruction>(Val: Phis[0]->getStartValue());
  auto *LoopALM = cast<VPInstruction>(Val: Phis[0]->getBackedgeValue());

  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
         "Expected incoming values of Phi to be ActiveLaneMasks");

  // When using wide lane masks, the return type of the get.active.lane.mask
  // intrinsic is VF x UF (last operand).
  VPValue *ALMMultiplier = Plan.getConstantInt(BitWidth: 64, Val: UF);
  EntryALM->setOperand(I: 2, New: ALMMultiplier);
  LoopALM->setOperand(I: 2, New: ALMMultiplier);

  // Create UF x extract vectors and insert into preheader.
  SmallVector<VPValue *> EntryExtracts(UF);
  ExtractFromALM(EntryALM, EntryExtracts);

  // Create UF x extract vectors and insert before the loop compare & branch,
  // updating the compare to use the first extract.
  SmallVector<VPValue *> LoopExtracts(UF);
  ExtractFromALM(LoopALM, LoopExtracts);
  VPInstruction *Not = cast<VPInstruction>(Val: Term->getOperand(N: 0));
  Not->setOperand(I: 0, New: LoopExtracts[0]);

  // Update the incoming values of active lane mask phis.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Phis[Part]->setStartValue(EntryExtracts[Part]);
    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
  }

  return true;
}
2163
/// Try to simplify the branch condition of \p Plan. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                              unsigned BestUF,
                                              PredicatedScalarEvolution &PSE) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();
  VPValue *Cond;
  if (match(V: Term,
            P: m_BranchOnCount(Op0: m_Add(Op0: m_VPValue(), Op1: m_Specific(VPV: &Plan.getVFxUF())),
                             Op1: m_VPValue())) ||
      match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
                        Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()))))) {
    // Try to simplify the branch condition if VectorTC <= VF * UF when the
    // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
    const SCEV *VectorTripCount =
        vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
    // Fall back to the original trip count if the vector trip count's SCEV is
    // not computable.
    if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
      VectorTripCount =
          vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
    assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
           "Trip count SCEV must be computable");
    ScalarEvolution &SE = *PSE.getSE();
    ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
    const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
    if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: VectorTripCount, RHS: C))
      return false;
  } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))) ||
             match(V: Term, P: m_BranchOnTwoConds(Op0: m_VPValue(), Op1: m_VPValue(V&: Cond)))) {
    // For BranchOnCond, check if we can prove the condition to be true using VF
    // and UF.
    if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
      return false;
  } else {
    return false;
  }

  // The vector loop region only executes once. If possible, completely remove
  // the region, otherwise replace the terminator controlling the latch with
  // (BranchOnCond true).
  // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
  // support for other non-canonical widen induction recipes (e.g.,
  // VPWidenPointerInductionRecipe).
  // TODO: fold branch-on-constant after dissolving region.
  auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
  // The region can only be dissolved if every header phi is of a kind we know
  // how to rewrite for a single iteration.
  if (all_of(Range: Header->phis(), P: [](VPRecipeBase &Phi) {
        if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi))
          return R->isCanonical();
        return isa<VPCanonicalIVPHIRecipe, VPCurrentIterationPHIRecipe,
                   VPFirstOrderRecurrencePHIRecipe, VPPhi>(Val: &Phi);
      })) {
    // Rewrite each header phi for the single iteration: canonical wide IVs
    // become a step-vector, other phis their first incoming value.
    for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) {
      if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &HeaderR)) {
        VPBuilder Builder(Plan.getVectorPreheader());
        VPValue *StepV = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {},
                                              ResultTy: R->getScalarType());
        HeaderR.getVPSingleValue()->replaceAllUsesWith(New: StepV);
        HeaderR.eraseFromParent();
        continue;
      }
      auto *Phi = cast<VPPhiAccessors>(Val: &HeaderR);
      HeaderR.getVPSingleValue()->replaceAllUsesWith(New: Phi->getIncomingValue(Idx: 0));
      HeaderR.eraseFromParent();
    }

    // Dissolve the region: detach it from its predecessor and successors,
    // clear the parent of all blocks inside, and wire the region's entry and
    // exiting blocks directly to the preheader and exits.
    VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
    SmallVector<VPBlockBase *> Exits = to_vector(Range&: VectorRegion->getSuccessors());
    VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion);
    for (VPBlockBase *Exit : Exits)
      VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit);

    for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry()))
      B->setParent(nullptr);

    VPBlockUtils::connectBlocks(From: Preheader, To: Header);

    for (VPBlockBase *Exit : Exits)
      VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit);

    // Replace terminating branch-on-two-conds with branch-on-cond to early
    // exit.
    if (Exits.size() != 1) {
      assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
             "BranchOnTwoConds needs 2 remaining exits");
      VPBuilder(Term).createNaryOp(Opcode: VPInstruction::BranchOnCond,
                                   Operands: Term->getOperand(N: 0));
    }
    VPlanTransforms::simplifyRecipes(Plan);
  } else {
    // The vector region contains header phis for which we cannot remove the
    // loop region yet.

    // For BranchOnTwoConds, set the latch exit condition to true directly.
    if (match(V: Term, P: m_BranchOnTwoConds())) {
      Term->setOperand(I: 1, New: Plan.getTrue());
      return true;
    }

    // Otherwise replace the terminator with BranchOnCond(true).
    auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
                                  {}, {}, Term->getDebugLoc());
    ExitingVPBB->appendRecipe(Recipe: BOC);
  }

  Term->eraseFromParent();

  return true;
}
2272
/// From the definition of llvm.experimental.get.vector.length,
/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Look for an EVL recipe whose AVL operand is provably <= VF (via SCEV) and
/// replace the EVL with the (possibly truncated) AVL itself. At most one EVL
/// is simplified per call; \returns true if a replacement was made.
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
                             PredicatedScalarEvolution &PSE) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_deep(G: Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      VPValue *AVL;
      if (!match(V: &R, P: m_EVL(Op0: m_VPValue(V&: AVL))))
        continue;

      // Without a computable SCEV for the AVL we cannot reason about its
      // range.
      const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(V: AVL, PSE);
      if (isa<SCEVCouldNotCompute>(Val: AVLSCEV))
        continue;
      ScalarEvolution &SE = *PSE.getSE();
      const SCEV *VFSCEV = SE.getElementCount(Ty: AVLSCEV->getType(), EC: VF);
      // The simplification only holds when AVL <= VF can be proven.
      if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: AVLSCEV, RHS: VFSCEV))
        continue;

      // The EVL produces an i32; zext/trunc the AVL to match.
      VPValue *Trunc = VPBuilder(&R).createScalarZExtOrTrunc(
          Op: AVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()), SrcTy: AVLSCEV->getType(),
          DL: R.getDebugLoc());
      if (Trunc != AVL) {
        // A new cast was created; try to constant-fold it if all of its
        // operands are live-ins.
        auto *TruncR = cast<VPSingleDefRecipe>(Val: Trunc);
        const DataLayout &DL = Plan.getDataLayout();
        VPTypeAnalysis TypeInfo(Plan);
        if (VPValue *Folded =
                tryToFoldLiveIns(R&: *TruncR, Operands: TruncR->operands(), DL, TypeInfo))
          Trunc = Folded;
      }
      R.getVPSingleValue()->replaceAllUsesWith(New: Trunc);
      return true;
    }
  }
  return false;
}
2309
2310void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
2311 unsigned BestUF,
2312 PredicatedScalarEvolution &PSE) {
2313 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2314 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2315
2316 bool MadeChange = tryToReplaceALMWithWideALM(Plan, VF: BestVF, UF: BestUF);
2317 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2318 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2319 MadeChange |= simplifyKnownEVL(Plan, VF: BestVF, PSE);
2320
2321 if (MadeChange) {
2322 Plan.setVF(BestVF);
2323 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2324 }
2325}
2326
/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // If Previous is a live-in (no defining recipe), it naturally dominates all
  // recipes in the loop, so no sinking is needed.
  if (!Previous)
    return true;

  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Ptr: Previous);
  // Returns false if sinking is impossible for \p SinkCandidate; otherwise
  // queues it for sinking unless it is already in the right place.
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi. In
    // that case, FOR is not a fixed order recurrence.
    if (SinkCandidate == Previous)
      return false;

    // Header phis, already-seen recipes, and recipes already dominated by
    // Previous need no sinking.
    if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) ||
        !Seen.insert(Ptr: SinkCandidate).second ||
        VPDT.properlyDominates(A: Previous, B: SinkCandidate))
      return true;

    if (cannotHoistOrSinkRecipe(R: *SinkCandidate))
      return false;

    WorkList.push_back(Elt: SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(Elt: FOR);
  // Note: WorkList may grow while iterating; indexing keeps iteration valid.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User)))
        return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    // Move each candidate directly after the previously moved one (initially
    // Previous), preserving their relative dominance order.
    SinkCandidate->moveAfter(MovePos: Previous);
    Previous = SinkCandidate;
  }
  return true;
}
2389
/// Try to hoist \p Previous and its operands before all users of \p FOR.
/// \returns true if hoisting succeeded (or was not needed), false otherwise.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
                                        VPRecipeBase *Previous,
                                        VPDominatorTree &VPDT) {
  if (cannotHoistOrSinkRecipe(R: *Previous))
    return false;

  // Collect recipes that need hoisting.
  SmallVector<VPRecipeBase *> HoistCandidates;
  SmallPtrSet<VPRecipeBase *, 8> Visited;
  VPRecipeBase *HoistPoint = nullptr;
  // Find the closest hoist point by looking at all users of FOR and selecting
  // the recipe dominating all other users.
  for (VPUser *U : FOR->users()) {
    auto *R = cast<VPRecipeBase>(Val: U);
    if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint))
      HoistPoint = R;
  }
  assert(all_of(FOR->users(),
                [&VPDT, HoistPoint](VPUser *U) {
                  auto *R = cast<VPRecipeBase>(U);
                  return HoistPoint == R ||
                         VPDT.properlyDominates(HoistPoint, R);
                }) &&
         "HoistPoint must dominate all users of FOR");

  // Returns the defining recipe of \p HoistCandidateV if it still needs to be
  // hoisted above HoistPoint, or nullptr if no hoisting is required for it.
  auto NeedsHoisting = [HoistPoint, &VPDT,
                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
    if (!HoistCandidate)
      return nullptr;
    VPRegionBlock *EnclosingLoopRegion =
        HoistCandidate->getParent()->getEnclosingLoopRegion();
    assert((!HoistCandidate->getRegion() ||
            HoistCandidate->getRegion() == EnclosingLoopRegion) &&
           "CFG in VPlan should still be flat, without replicate regions");
    // Hoist candidate was already visited, no need to hoist.
    if (!Visited.insert(Ptr: HoistCandidate).second)
      return nullptr;

    // Candidate is outside loop region or a header phi, dominates FOR users w/o
    // hoisting.
    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate))
      return nullptr;

    // If we reached a recipe that dominates HoistPoint, we don't need to
    // hoist the recipe.
    if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
      return nullptr;
    return HoistCandidate;
  };

  if (!NeedsHoisting(Previous->getVPSingleValue()))
    return true;

  // Recursively try to hoist Previous and its operands before all users of FOR.
  HoistCandidates.push_back(Elt: Previous);

  // Note: HoistCandidates may grow while iterating; indexing stays valid.
  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
    VPRecipeBase *Current = HoistCandidates[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");
    if (cannotHoistOrSinkRecipe(R: *Current))
      return false;

    for (VPValue *Op : Current->operands()) {
      // If we reach FOR, it means the original Previous depends on some other
      // recurrence that in turn depends on FOR. If that is the case, we would
      // also need to hoist recipes involving the other FOR, which may break
      // dependencies.
      if (Op == FOR)
        return false;

      if (auto *R = NeedsHoisting(Op)) {
        // Bail out if the recipe defines multiple values.
        // TODO: Hoisting such recipes requires additional handling.
        if (R->getNumDefinedValues() != 1)
          return false;
        HoistCandidates.push_back(Elt: R);
      }
    }
  }

  // Order recipes to hoist by dominance so earlier instructions are processed
  // first.
  sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
    HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
                               I: HoistPoint->getIterator());
  }

  return true;
}
2486
/// Adjust each fixed-order recurrence phi in \p Plan: position its users
/// relative to the recipe computing the previous value (by sinking the users
/// or hoisting the previous value), then materialize the recurrence with a
/// FirstOrderRecurrenceSplice created via \p LoopBuilder. \returns false if
/// the recipes could not be re-arranged legally.
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &LoopBuilder) {
  VPDominatorTree VPDT(Plan);

  // Collect the recurrence phis up front, as the transformations below modify
  // the plan.
  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
      RecurrencePhis.push_back(Elt: FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
    // to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    // First try sinking the users after Previous; if that fails, try hoisting
    // Previous before the users instead.
    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock =
        Previous ? Previous->getParent() : FOR->getParent();
    if (!Previous || isa<VPHeaderPHIRecipe>(Val: Previous))
      LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi());
    else
      LoopBuilder.setInsertPoint(TheBB: InsertBlock,
                                 IP: std::next(x: Previous->getIterator()));

    auto *RecurSplice =
        LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
                                 Operands: {FOR, FOR->getBackedgeValue()});

    FOR->replaceAllUsesWith(New: RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(I: 0, New: FOR);

    // Check for users extracting at the penultimate active lane of the FOR.
    // If only a single lane is active in the current iteration, we need to
    // select the last element from the previous iteration (from the FOR phi
    // directly).
    for (VPUser *U : RecurSplice->users()) {
      if (!match(U, P: m_ExtractLane(Op0: m_LastActiveLane(Op0: m_VPValue()),
                                  Op1: m_Specific(VPV: RecurSplice))))
        continue;

      VPBuilder B(cast<VPInstruction>(Val: U));
      VPValue *LastActiveLane = cast<VPInstruction>(Val: U)->getOperand(N: 0);
      VPValue *Zero = Plan.getConstantInt(BitWidth: 64, Val: 0);
      VPValue *One = Plan.getConstantInt(BitWidth: 64, Val: 1);
      // Lane just before the last active one, taken from the current
      // iteration's value.
      VPValue *PenultimateIndex = B.createSub(LHS: LastActiveLane, RHS: One);
      VPValue *PenultimateLastIter =
          B.createNaryOp(Opcode: VPInstruction::ExtractLane,
                         Operands: {PenultimateIndex, FOR->getBackedgeValue()});
      // Last lane of the previous iteration, i.e. the FOR phi itself.
      VPValue *LastPrevIter =
          B.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: FOR);

      // If the last active lane is lane 0, there is no penultimate lane in
      // the current iteration, so use the previous iteration's last lane.
      VPValue *Cmp = B.createICmp(Pred: CmpInst::ICMP_EQ, A: LastActiveLane, B: Zero);
      VPValue *Sel = B.createSelect(Cond: Cmp, TrueVal: LastPrevIter, FalseVal: PenultimateLastIter);
      cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: Sel);
    }
  }
  return true;
}
2559
2560void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
2561 for (VPRecipeBase &R :
2562 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2563 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
2564 if (!PhiR)
2565 continue;
2566 RecurKind RK = PhiR->getRecurrenceKind();
2567 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2568 RK != RecurKind::AddChainWithSubs)
2569 continue;
2570
2571 for (VPUser *U : collectUsersRecursively(V: PhiR))
2572 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
2573 RecWithFlags->dropPoisonGeneratingFlags();
2574 }
2575 }
2576}
2577
namespace {
/// DenseMap traits used for CSE: two VPSingleDefRecipes hash/compare equal
/// when they compute the same value, based on recipe ID, opcode/intrinsic ID,
/// GEP source element type, inferred scalar type, single-scalar-ness,
/// operands, predicate and (for replicate regions) parent block.
struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
  // True for the DenseMap empty/tombstone keys, which must never be
  // dereferenced.
  static bool isSentinel(const VPSingleDefRecipe *Def) {
    return Def == getEmptyKey() || Def == getTombstoneKey();
  }

  /// If recipe \p R will lower to a GEP with a non-i8 source element type,
  /// return that source element type.
  static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
    // All VPInstructions that lower to GEPs must have the i8 source element
    // type (as they are PtrAdds), so we omit it.
    return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
        .Case(caseFn: [](const VPReplicateRecipe *I) -> Type * {
          if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: I->getUnderlyingValue()))
            return GEP->getSourceElementType();
          return nullptr;
        })
        .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
            caseFn: [](auto *I) { return I->getSourceElementType(); })
        .Default(defaultFn: [](auto *) { return nullptr; });
  }

  /// Returns true if recipe \p Def can be safely handled for CSE.
  static bool canHandle(const VPSingleDefRecipe *Def) {
    // We can extend the list of handled recipes in the future,
    // provided we account for the data embedded in them while checking for
    // equality or hashing.
    auto C = getOpcodeOrIntrinsicID(R: Def);

    // The issue with (Insert|Extract)Value is that the index of the
    // insert/extract is not a proper operand in LLVM IR, and hence also not in
    // VPlan.
    if (!C || (!C->first && (C->second == Instruction::InsertValue ||
                             C->second == Instruction::ExtractValue)))
      return false;

    // During CSE, we can only handle recipes that don't read from memory: if
    // they read from memory, there could be an intervening write to memory
    // before the next instance is CSE'd, leading to an incorrect result.
    return !Def->mayReadFromMemory();
  }

  /// Hash the underlying data of \p Def.
  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
    const VPlan *Plan = Def->getParent()->getPlan();
    VPTypeAnalysis TypeInfo(*Plan);
    hash_code Result = hash_combine(
        args: Def->getVPRecipeID(), args: getOpcodeOrIntrinsicID(R: Def),
        args: getGEPSourceElementType(R: Def), args: TypeInfo.inferScalarType(V: Def),
        args: vputils::isSingleScalar(VPV: Def), args: hash_combine_range(R: Def->operands()));
    // Fold in the compare predicate, if any; isEqual checks it as well.
    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: Def))
      if (RFlags->hasPredicate())
        return hash_combine(args: Result, args: RFlags->getPredicate());
    return Result;
  }

  /// Check equality of underlying data of \p L and \p R.
  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
    if (isSentinel(Def: L) || isSentinel(Def: R))
      return L == R;
    if (L->getVPRecipeID() != R->getVPRecipeID() ||
        getOpcodeOrIntrinsicID(R: L) != getOpcodeOrIntrinsicID(R) ||
        getGEPSourceElementType(R: L) != getGEPSourceElementType(R) ||
        vputils::isSingleScalar(VPV: L) != vputils::isSingleScalar(VPV: R) ||
        !equal(LRange: L->operands(), RRange: R->operands()))
      return false;
    assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
           "must have valid opcode info for both recipes");
    if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(Val: L))
      if (LFlags->hasPredicate() &&
          LFlags->getPredicate() !=
              cast<VPRecipeWithIRFlags>(Val: R)->getPredicate())
        return false;
    // Recipes in replicate regions implicitly depend on predicate. If either
    // recipe is in a replicate region, only consider them equal if both have
    // the same parent.
    const VPRegionBlock *RegionL = L->getRegion();
    const VPRegionBlock *RegionR = R->getRegion();
    if (((RegionL && RegionL->isReplicator()) ||
         (RegionR && RegionR->isReplicator())) &&
        L->getParent() != R->getParent())
      return false;
    const VPlan *Plan = L->getParent()->getPlan();
    VPTypeAnalysis TypeInfo(*Plan);
    return TypeInfo.inferScalarType(V: L) == TypeInfo.inferScalarType(V: R);
  }
};
} // end anonymous namespace
2666
2667/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2668/// Plan.
2669void VPlanTransforms::cse(VPlan &Plan) {
2670 VPDominatorTree VPDT(Plan);
2671 DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2672
2673 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2674 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2675 for (VPRecipeBase &R : *VPBB) {
2676 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R);
2677 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2678 continue;
2679 if (VPSingleDefRecipe *V = CSEMap.lookup(Val: Def)) {
2680 // V must dominate Def for a valid replacement.
2681 if (!VPDT.dominates(A: V->getParent(), B: VPBB))
2682 continue;
2683 // Only keep flags present on both V and Def.
2684 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: V))
2685 RFlags->intersectFlags(Other: *cast<VPRecipeWithIRFlags>(Val: Def));
2686 Def->replaceAllUsesWith(New: V);
2687 continue;
2688 }
2689 CSEMap[Def] = Def;
2690 }
2691 }
2692}
2693
/// Move loop-invariant recipes out of the vector loop region in \p Plan, and
/// sink recipes whose users all live in the same dedicated exit block of the
/// region into that block.
static void licm(VPlan &Plan) {
  VPBasicBlock *Preheader = Plan.getVectorPreheader();

  // Hoist any loop invariant recipes from the vector loop region to the
  // preheader. Perform a shallow traversal of the vector loop region, to
  // exclude recipes in replicate regions. Since the top-level blocks in the
  // vector loop region are guaranteed to execute if the vector pre-header is,
  // we don't need to check speculation safety.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  assert(Preheader->getSingleSuccessor() == LoopRegion &&
         "Expected vector prehader's successor to be the vector loop region");
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (cannotHoistOrSinkRecipe(R))
        continue;
      // A recipe is invariant if all of its operands are defined outside any
      // loop region.
      if (any_of(Range: R.operands(), P: [](VPValue *Op) {
            return !Op->isDefinedOutsideLoopRegions();
          }))
        continue;
      R.moveBefore(BB&: *Preheader, I: Preheader->end());
    }
  }

#ifndef NDEBUG
  // Only needed for the dominance assertion below.
  VPDominatorTree VPDT(Plan);
#endif
  // Sink recipes with no users inside the vector loop region if all users are
  // in the same exit block of the region.
  // TODO: Extend to sink recipes from inner loops.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_post_order_shallow(G: LoopRegion->getEntry()))) {
    // Visit the recipes of each block bottom-up.
    for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
      if (cannotHoistOrSinkRecipe(R))
        continue;

      // TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
      // handles sunk recipes correctly.
      if (isa<VPReplicateRecipe>(Val: &R))
        continue;

      // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
      // support recipes with multiple defined values (e.g., interleaved loads).
      auto *Def = cast<VPSingleDefRecipe>(Val: &R);
      // Skip recipes without users as we cannot determine a sink block.
      // TODO: Clone sinkable recipes without users to all exit blocks to reduce
      // their execution frequency.
      if (Def->getNumUsers() == 0)
        continue;

      VPBasicBlock *SinkBB = nullptr;
      // Cannot sink the recipe if any user
      // * is defined in any loop region, or
      // * is a phi, or
      // * multiple users in different blocks.
      // As a side effect, SinkBB captures the single block all users share.
      if (any_of(Range: Def->users(), P: [&SinkBB](VPUser *U) {
            auto *UserR = cast<VPRecipeBase>(Val: U);
            VPBasicBlock *Parent = UserR->getParent();
            // TODO: If the user is a PHI node, we should check the block of
            // incoming value. Support PHI node users if needed.
            if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
              return true;
            // TODO: Support sinking when users are in multiple blocks.
            if (SinkBB && SinkBB != Parent)
              return true;
            SinkBB = Parent;
            return false;
          }))
        continue;

      // Only sink to dedicated exit blocks of the loop region.
      if (SinkBB->getSinglePredecessor() != LoopRegion)
        continue;

      // TODO: This will need to be a check instead of a assert after
      // conditional branches in vectorized loops are supported.
      assert(VPDT.properlyDominates(VPBB, SinkBB) &&
             "Defining block must dominate sink block");
      // TODO: Clone the recipe if users are on multiple exit paths, instead of
      // just moving.
      Def->moveBefore(BB&: *SinkBB, I: SinkBB->getFirstNonPhi());
    }
  }
}
2779
/// Shrink recipes in \p Plan to the minimal bitwidths recorded in \p MinBWs,
/// truncating operands and zero-extending results as needed so surrounding
/// recipes keep seeing the original types.
void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
  // With only scalar VFs there are no widened values to shrink.
  if (Plan.hasScalarVFOnly())
    return;
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make
  // other uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  VPTypeAnalysis TypeInfo(Plan);
  VPBasicBlock *PH = Plan.getVectorPreheader();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      // Only the recipe kinds listed here are handled for shrinking.
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
      // A zero lookup result means no minimal bitwidth was recorded for UI.
      unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
      if (!NewResSizeInBits)
        continue;

      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here. Also skip casts which do not need to be handled
      // explicitly here, as redundant casts will be removed during recipe
      // simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
        continue;

      Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(C&: Plan.getContext(), NumBits: NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
        VPW->dropPoisonGeneratingFlags();

      if (OldResSizeInBits != NewResSizeInBits &&
          !match(V: &R, P: m_ICmp(Op0: m_VPValue(), Op1: m_VPValue()))) {
        // Extend result to original width.
        auto *Ext = new VPWidenCastRecipe(
            Instruction::ZExt, ResultVPV, OldResTy, nullptr,
            VPIRFlags::getDefaultFlags(Opcode: Instruction::ZExt));
        Ext->insertAfter(InsertPos: &R);
        ResultVPV->replaceAllUsesWith(New: Ext);
        // Re-attach the narrowed result as the extend's input after RAUW.
        Ext->setOperand(I: 0, New: ResultVPV);
        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      } else {
        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
               "Only ICmps should not need extending the result.");
      }

      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
      // Loads and intrinsics only get their result narrowed; their operands
      // keep the original types.
      if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
        continue;

      // Shrink operands by introducing truncates as needed. For selects, skip
      // the condition operand (index 0), which keeps its type.
      unsigned StartIdx =
          match(V: &R, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(N: Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op);
        // Re-use a truncate created earlier for the same operand.
        if (!IterIsEmpty) {
          R.setOperand(I: Idx, New: ProcessedIter->second);
          continue;
        }

        // Truncates of live-ins go in the preheader; others right before R.
        VPBuilder Builder;
        if (isa<VPIRValue>(Val: Op))
          Builder.setInsertPoint(PH);
        else
          Builder.setInsertPoint(&R);
        VPWidenCastRecipe *NewOp =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op, ResultTy: NewResTy);
        ProcessedIter->second = NewOp;
        R.setOperand(I: Idx, New: NewOp);
      }

    }
  }
}
2873
2874void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2875 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2876 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
2877 VPValue *Cond;
2878 // Skip blocks that are not terminated by BranchOnCond.
2879 if (VPBB->empty() || !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
2880 continue;
2881
2882 assert(VPBB->getNumSuccessors() == 2 &&
2883 "Two successors expected for BranchOnCond");
2884 unsigned RemovedIdx;
2885 if (match(V: Cond, P: m_True()))
2886 RemovedIdx = 1;
2887 else if (match(V: Cond, P: m_False()))
2888 RemovedIdx = 0;
2889 else
2890 continue;
2891
2892 VPBasicBlock *RemovedSucc =
2893 cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
2894 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2895 "There must be a single edge between VPBB and its successor");
2896 // Values coming from VPBB into phi recipes of RemoveSucc are removed from
2897 // these recipes.
2898 for (VPRecipeBase &R : RemovedSucc->phis())
2899 cast<VPPhiAccessors>(Val: &R)->removeIncomingValueFor(IncomingBlock: VPBB);
2900
2901 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2902 // automatically on VPlan destruction if it becomes unreachable.
2903 VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
2904 VPBB->back().eraseFromParent();
2905 }
2906}
2907
/// Run the main VPlan-to-VPlan optimization pipeline on \p Plan.
void VPlanTransforms::optimize(VPlan &Plan) {
  // Remove redundant induction-related recipes first.
  RUN_VPLAN_PASS(removeRedundantCanonicalIVs, Plan);
  RUN_VPLAN_PASS(removeRedundantInductionCasts, Plan);

  // Simplification and cleanup. reassociateHeaderMask/simplifyRecipes are run
  // a second time after the induction and single-scalar passes, which can
  // expose further opportunities.
  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
  RUN_VPLAN_PASS(simplifyRecipes, Plan);
  RUN_VPLAN_PASS(removeDeadRecipes, Plan);
  RUN_VPLAN_PASS(simplifyBlends, Plan);
  RUN_VPLAN_PASS(legalizeAndOptimizeInductions, Plan);
  RUN_VPLAN_PASS(narrowToSingleScalarRecipes, Plan);
  RUN_VPLAN_PASS(removeRedundantExpandSCEVRecipes, Plan);
  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
  RUN_VPLAN_PASS(simplifyRecipes, Plan);
  RUN_VPLAN_PASS(removeBranchOnConst, Plan);
  RUN_VPLAN_PASS(removeDeadRecipes, Plan);

  // Structural transforms: form and optimize replicate regions, hoist
  // invariant loads, merge blocks and run loop-invariant code motion.
  RUN_VPLAN_PASS(createAndOptimizeReplicateRegions, Plan);
  RUN_VPLAN_PASS(hoistInvariantLoads, Plan);
  RUN_VPLAN_PASS(mergeBlocksIntoPredecessors, Plan);
  RUN_VPLAN_PASS(licm, Plan);
}
2929
// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other existing
// recipes/users remain unchanged, except for poison-generating flags being
// dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function adds the following recipes:
//
// vector.ph:
//   %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
//   %EntryALM = active-lane-mask %EntryInc, TC
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
//   %ALM = active-lane-mask %InLoopInc, TC
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *
addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  //   Part * VF
  auto *VecPreheader = Plan.getVectorPreheader();
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();
  VPValue *VF = &Plan.getVF();

  auto *EntryIncrement = Builder.createOverflowingOp(
      Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV, VF}, WrapFlags: {false, false},
      DL, Name: "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  // The multiplier (lanes per element) is the constant 1 here.
  VPValue *ALMMultiplier =
      Plan.getConstantInt(Ty: TopRegion->getCanonicalIVType(), Val: 1);
  auto *EntryALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
                                        Operands: {EntryIncrement, TC, ALMMultiplier}, DL,
                                        Name: "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  auto *LaneMaskPhi =
      new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
  LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement = Builder.createOverflowingOp(
      Opcode: VPInstruction::CanonicalIVIncrementForPart,
      Operands: {CanonicalIVIncrement, &Plan.getVF()}, WrapFlags: {false, false}, DL);
  auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
                                   Operands: {InLoopIncrement, TC, ALMMultiplier}, DL,
                                   Name: "active.lane.mask.next");
  // The next-iteration mask becomes the phi's backedge value.
  LaneMaskPhi->addOperand(Operand: ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(Operand: ALM, DL);
  Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}
3011
/// Replace the header mask in \p Plan with an active-lane-mask based on the
/// widened canonical IV; when \p UseActiveLaneMaskForControlFlow is set, the
/// mask is instead a phi that also drives the loop's exit branch (see
/// addVPLaneMaskPhiAndUpdateExitBranch).
void VPlanTransforms::addActiveLaneMask(VPlan &Plan,
                                        bool UseActiveLaneMaskForControlFlow) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *FoundWidenCanonicalIVUser = find_if(
      Range: LoopRegion->getCanonicalIV()->users(), P: IsaPred<VPWidenCanonicalIVRecipe>);
  assert(FoundWidenCanonicalIVUser &&
         "Must have widened canonical IV when tail folding!");
  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
  VPSingleDefRecipe *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
  } else {
    // Insert the mask right after the widened canonical IV it is based on.
    VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
    VPValue *ALMMultiplier =
        Plan.getConstantInt(Ty: LoopRegion->getCanonicalIVType(), Val: 1);
    LaneMask =
        B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
                       Operands: {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
                       DL: nullptr, Name: "active.lane.mask");
  }

  // Walk users of WideCanonicalIV and replace the header mask of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
  // removing the old one to ensure there is always only a single header mask.
  HeaderMask->replaceAllUsesWith(New: LaneMask);
  HeaderMask->eraseFromParent();
}
3041
/// Matcher used by m_RemoveMask: accepts either the mask \p In by itself (the
/// remainder \p Out is then set to null) or a logical-and of \p In with some
/// other value, which is captured into \p Out.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    // If the value is exactly the mask, nothing remains once it is removed.
    bool MatchesWholeMask = m_Specific(In).match(V);
    if (MatchesWholeMask)
      Out = nullptr;
    // Otherwise require (logical-and In, Rest) and report Rest via Out.
    return MatchesWholeMask ||
           m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
  }
};
3056
3057/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3058/// Returns the remaining part \p Out if so, or nullptr otherwise.
3059template <typename Op0_t, typename Op1_t>
3060static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3061 Op1_t &Out) {
3062 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3063}
3064
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
/// \p HeaderMask Header Mask.
/// \p CurRecipe Recipe to be transformed.
/// \p TypeInfo VPlan-based type analysis.
/// \p EVL The explicit vector length parameter of vector-predication
/// intrinsics.
static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
                                       VPRecipeBase &CurRecipe,
                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) {
  VPlan *Plan = CurRecipe.getParent()->getPlan();
  DebugLoc DL = CurRecipe.getDebugLoc();
  VPValue *Addr, *Mask, *EndPtr;

  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(Val: EndPtr)->clone();
    EVLEndPtr->insertBefore(InsertPos: &CurRecipe);
    EVLEndPtr->setOperand(I: 1, New: &EVL);
    return EVLEndPtr;
  };

  // Consecutive (non-reverse) masked load: strip the header mask and emit an
  // EVL-governed load instead.
  if (match(V: &CurRecipe,
            P: m_MaskedLoad(Addr: m_VPValue(V&: Addr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      !cast<VPWidenLoadRecipe>(Val&: CurRecipe).isReverse())
    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(Val&: CurRecipe), Addr,
                                    EVL, Mask);

  // Reversed masked load: load through the end pointer adjusted to EVL lanes,
  // then reverse the loaded value with experimental.vp.reverse.
  VPValue *ReversedVal;
  if (match(V: &CurRecipe, P: m_Reverse(Op0: m_VPValue(V&: ReversedVal))) &&
      match(V: ReversedVal,
            P: m_MaskedLoad(Addr: m_VPValue(V&: EndPtr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
      cast<VPWidenLoadRecipe>(Val: ReversedVal)->isReverse()) {
    auto *LoadR = new VPWidenLoadEVLRecipe(
        *cast<VPWidenLoadRecipe>(Val: ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
    LoadR->insertBefore(InsertPos: &CurRecipe);
    return new VPWidenIntrinsicRecipe(
        Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
        TypeInfo.inferScalarType(V: LoadR), {}, {}, DL);
  }

  // Consecutive (non-reverse) masked store -> EVL-governed store.
  VPValue *StoredVal;
  if (match(V: &CurRecipe, P: m_MaskedStore(Addr: m_VPValue(V&: Addr), Val: m_VPValue(V&: StoredVal),
                                        Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      !cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse())
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe), Addr,
                                     StoredVal, EVL, Mask);

  // Reversed masked store: reverse the value with experimental.vp.reverse
  // first, then store it through the EVL-adjusted end pointer.
  if (match(V: &CurRecipe,
            P: m_MaskedStore(Addr: m_VPValue(V&: EndPtr), Val: m_Reverse(Op0: m_VPValue(V&: ReversedVal)),
                           Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
      cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse()) {
    auto *NewReverse = new VPWidenIntrinsicRecipe(
        Intrinsic::experimental_vp_reverse,
        {ReversedVal, Plan->getTrue(), &EVL},
        TypeInfo.inferScalarType(V: ReversedVal), {}, {}, DL);
    NewReverse->insertBefore(InsertPos: &CurRecipe);
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe),
                                     AdjustEndPtr(EndPtr), NewReverse, EVL,
                                     Mask);
  }

  // Conditional reduction whose condition contains the header mask: switch to
  // the EVL reduction recipe with the remaining mask.
  if (auto *Rdx = dyn_cast<VPReductionRecipe>(Val: &CurRecipe))
    if (Rdx->isConditional() &&
        match(V: Rdx->getCondOp(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);

  // Masked interleave group: switch to the EVL interleave recipe.
  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(Val: &CurRecipe))
    if (Interleave->getMask() &&
        match(V: Interleave->getMask(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);

  // select(header-mask, LHS, RHS) -> vp.merge with an all-true mask; EVL
  // itself bounds the active lanes.
  VPValue *LHS, *RHS;
  if (match(V: &CurRecipe,
            P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS), Op2: m_VPValue(V&: RHS))))
    return new VPWidenIntrinsicRecipe(
        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
        TypeInfo.inferScalarType(V: LHS), {}, {}, DL);

  // select((header-mask & M), LHS, RHS) -> vp.merge on the remaining mask M.
  if (match(V: &CurRecipe, P: m_Select(Op0: m_RemoveMask(In: HeaderMask, Out&: Mask), Op1: m_VPValue(V&: LHS),
                                  Op2: m_VPValue(V&: RHS))))
    return new VPWidenIntrinsicRecipe(
        Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
        TypeInfo.inferScalarType(V: LHS), {}, {}, DL);

  // last-active-lane(header-mask): with EVL predication the last active lane
  // is simply EVL - 1 (zero-extended to the result type).
  if (match(V: &CurRecipe, P: m_LastActiveLane(Op0: m_Specific(VPV: HeaderMask)))) {
    Type *Ty = TypeInfo.inferScalarType(V: CurRecipe.getVPSingleValue());
    VPValue *ZExt =
        VPBuilder(&CurRecipe).createScalarCast(Opcode: Instruction::ZExt, Op: &EVL, ResultTy: Ty, DL);
    return new VPInstruction(
        Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, Val: 1)},
        VPIRFlags::getDefaultFlags(Opcode: Instruction::Sub), {}, DL);
  }

  // No EVL-based equivalent for this recipe.
  return nullptr;
}
3164
/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
/// The transforms here need to preserve the original semantics.
void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
  // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
  VPValue *HeaderMask = nullptr, *EVL = nullptr;
  for (VPRecipeBase &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
    if (match(V: &R, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_ULT, Op0: m_StepVector(),
                                  Op1: m_VPValue(V&: EVL))) &&
        match(V: EVL, P: m_EVL(Op0: m_VPValue()))) {
      HeaderMask = R.getVPSingleValue();
      break;
    }
  }
  // No EVL-based header mask in this plan; nothing to do.
  if (!HeaderMask)
    return;

  VPTypeAnalysis TypeInfo(Plan);
  SmallVector<VPRecipeBase *> OldRecipes;
  // Try to rewrite every (transitive) user of the header mask into an
  // EVL-based recipe; users that cannot be converted are left unchanged.
  for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
    VPRecipeBase *R = cast<VPRecipeBase>(Val: U);
    if (auto *NewR = optimizeMaskToEVL(HeaderMask, CurRecipe&: *R, TypeInfo, EVL&: *EVL)) {
      NewR->insertBefore(InsertPos: R);
      // Rewire all results of the old recipe to the new one, pairwise.
      for (auto [Old, New] :
           zip_equal(t: R->definedValues(), u: NewR->definedValues()))
        Old->replaceAllUsesWith(New);
      OldRecipes.push_back(Elt: R);
    }
  }
  // Erase old recipes at the end so we don't invalidate TypeInfo.
  for (VPRecipeBase *R : reverse(C&: OldRecipes)) {
    // Capture the operands first; they may become dead once R is gone.
    SmallVector<VPValue *> PossiblyDead(R->operands());
    R->eraseFromParent();
    for (VPValue *Op : PossiblyDead)
      recursivelyDeleteDeadRecipes(V: Op);
  }
}
3201
/// After replacing the canonical IV with an EVL-based IV, fixup recipes that
/// use VF to use the EVL instead to avoid incorrect updates on the penultimate
/// iteration.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

  assert(all_of(Plan.getVF().users(),
                IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
                        VPWidenIntOrFpInductionRecipe>) &&
         "User of VF that we can't transform to EVL.");
  // Replace VF with EVL in induction/step recipes. VPVectorEndPointerRecipe
  // users are deliberately excluded here.
  Plan.getVF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
    return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(Val: U);
  });

  assert(all_of(Plan.getVFxUF().users(),
                [&LoopRegion, &Plan](VPUser *U) {
                  return match(U,
                               m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
                                       m_Specific(&Plan.getVFxUF()))) ||
                         isa<VPWidenPointerInductionRecipe>(U);
                }) &&
         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
         "increment of the canonical induction.");
  Plan.getVFxUF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
    // canonical induction must not be updated.
    return isa<VPWidenPointerInductionRecipe>(Val: U);
  });

  // Create a scalar phi to track the previous EVL if fixed-order recurrence is
  // contained.
  bool ContainsFORs =
      any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
  if (ContainsFORs) {
    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
    VPValue *MaxEVL = &Plan.getVF();
    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
    MaxEVL = Builder.createScalarZExtOrTrunc(
        Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()),
        SrcTy: TypeInfo.inferScalarType(V: MaxEVL), DL: DebugLoc::getUnknown());

    Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
    VPValue *PrevEVL = Builder.createScalarPhi(
        IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc::getUnknown(), Name: "prev.evl");

    // Rewrite each FirstOrderRecurrenceSplice to an experimental.vp.splice
    // that combines the previous iteration's EVL with the current EVL.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
      for (VPRecipeBase &R : *VPBB) {
        VPValue *V1, *V2;
        if (!match(V: &R,
                   P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                       Ops: m_VPValue(V&: V1), Ops: m_VPValue(V&: V2))))
          continue;
        // Splice offset of -1: take the last element of the previous vector.
        VPValue *Imm = Plan.getOrAddLiveIn(
            V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Plan.getContext()), V: -1));
        VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
            Intrinsic::experimental_vp_splice,
            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
            TypeInfo.inferScalarType(V: R.getVPSingleValue()), {}, {},
            R.getDebugLoc());
        VPSplice->insertBefore(InsertPos: &R);
        R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice);
      }
    }
  }

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  if (!HeaderMask)
    return;

  // Replace header masks with a mask equivalent to predicating by EVL:
  //
  // icmp ule widen-canonical-iv backedge-taken-count
  // ->
  // icmp ult step-vector, EVL
  VPRecipeBase *EVLR = EVL.getDefiningRecipe();
  // Insert the new mask right after the EVL definition.
  VPBuilder Builder(EVLR->getParent(), std::next(x: EVLR->getIterator()));
  Type *EVLType = TypeInfo.inferScalarType(V: &EVL);
  VPValue *EVLMask = Builder.createICmp(
      Pred: CmpInst::ICMP_ULT,
      A: Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: EVLType), B: &EVL);
  HeaderMask->replaceAllUsesWith(New: EVLMask);
}
3288
/// Converts a tail folded vector loop region to step by
/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
/// iteration.
///
/// - Adds a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
///   replaces all uses except the canonical IV increment of
///   VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
///   VPCanonicalIVPHIRecipe is used only for loop iterations counting after
///   this transformation.
///
/// - The header mask is replaced with a header mask based on the EVL.
///
/// - Plans with FORs have a new phi added to keep track of the EVL of the
///   previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
///   @llvm.vp.splice.
///
/// The function uses the following definitions:
///  %StartV is the canonical induction start value.
///
/// The function adds the following recipes:
///
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
///                                      [ %NextIter, %vector.body ]
/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
/// ...
/// %OpEVL = cast i32 %VPEVL to IVSize
/// %NextIter = add IVSize %OpEVL, %CurrentIter
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
/// ...
///
/// If MaxSafeElements is provided, the function adds the following recipes:
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
///                                      [ %NextIter, %vector.body ]
/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
/// %cmp = cmp ult %AVL, MaxSafeElements
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
/// ...
/// %OpEVL = cast i32 %VPEVL to IVSize
/// %NextIter = add IVSize %OpEVL, %CurrentIter
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
/// ...
///
void VPlanTransforms::addExplicitVectorLength(
    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
  // EVL stepping only applies to vector plans.
  if (Plan.hasScalarVFOnly())
    return;
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

  auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
  auto *CanIVTy = LoopRegion->getCanonicalIVType();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  // Create the CurrentIteration recipe in the vector loop.
  auto *CurrentIteration =
      new VPCurrentIterationPHIRecipe(StartV, DebugLoc::getUnknown());
  CurrentIteration->insertAfter(InsertPos: CanonicalIVPHI);
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  // Create the AVL (application vector length), starting from TC -> 0 in steps
  // of EVL.
  VPPhi *AVLPhi = Builder.createScalarPhi(
      IncomingValues: {Plan.getTripCount()}, DL: DebugLoc::getCompilerGenerated(), Name: "avl");
  VPValue *AVL = AVLPhi;

  if (MaxSafeElements) {
    // Support for MaxSafeDist for correct loop emission.
    // Cap the AVL at MaxSafeElements: AVL = min(AVL, MaxSafeElements).
    VPValue *AVLSafe = Plan.getConstantInt(Ty: CanIVTy, Val: *MaxSafeElements);
    VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
    AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc::getUnknown(),
                               Name: "safe_avl");
  }
  auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
                                     DL: DebugLoc::getUnknown(), Name: "evl");

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
  Builder.setInsertPoint(CanonicalIVIncrement);
  VPValue *OpVPEVL = VPEVL;

  // The EVL is i32; convert it to the canonical IV type before using it to
  // advance the iteration and shrink the AVL.
  auto *I32Ty = Type::getInt32Ty(C&: Plan.getContext());
  OpVPEVL = Builder.createScalarZExtOrTrunc(
      Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc());

  // %NextIter = add %OpEVL, %CurrentIter, preserving the original increment's
  // wrap flags.
  auto *NextIter = Builder.createAdd(LHS: OpVPEVL, RHS: CurrentIteration,
                                     DL: CanonicalIVIncrement->getDebugLoc(),
                                     Name: "current.iteration.next",
                                     WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(),
                                      CanonicalIVIncrement->hasNoSignedWrap()});
  CurrentIteration->addOperand(Operand: NextIter);

  // %NextAVL = sub nuw %AVL, %OpEVL.
  VPValue *NextAVL =
      Builder.createSub(LHS: AVLPhi, RHS: OpVPEVL, DL: DebugLoc::getCompilerGenerated(),
                        Name: "avl.next", WrapFlags: {/*NUW=*/true, /*NSW=*/false});
  AVLPhi->addOperand(Operand: NextAVL);

  fixupVFUsersForEVL(Plan, EVL&: *VPEVL);
  removeDeadRecipes(Plan);

  // Replace all uses of VPCanonicalIVPHIRecipe by
  // VPCurrentIterationPHIRecipe except for the canonical IV increment.
  CanonicalIVPHI->replaceAllUsesWith(New: CurrentIteration);
  CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI);
  // TODO: support unroll factor > 1.
  Plan.setUF(1);
}
3406
3407void VPlanTransforms::convertToVariableLengthStep(VPlan &Plan) {
3408 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3409 // There should be only one VPCurrentIteration in the entire plan.
3410 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3411
3412 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3413 Range: vp_depth_first_shallow(G: Plan.getEntry())))
3414 for (VPRecipeBase &R : VPBB->phis())
3415 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(Val: &R)) {
3416 assert(!CurrentIteration &&
3417 "Found multiple CurrentIteration. Only one expected");
3418 CurrentIteration = PhiR;
3419 }
3420
3421 // Early return if it is not variable-length stepping.
3422 if (!CurrentIteration)
3423 return;
3424
3425 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3426 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3427
3428 // Convert CurrentIteration to concrete recipe.
3429 auto *ScalarR =
3430 VPBuilder(CurrentIteration)
3431 .createScalarPhi(
3432 IncomingValues: {CurrentIteration->getStartValue(), CurrentIterationIncr},
3433 DL: CurrentIteration->getDebugLoc(), Name: "current.iteration.iv");
3434 CurrentIteration->replaceAllUsesWith(New: ScalarR);
3435 CurrentIteration->eraseFromParent();
3436
3437 // Replace CanonicalIVInc with CurrentIteration increment.
3438 auto *CanonicalIV = cast<VPPhi>(Val: &*HeaderVPBB->begin());
3439 VPValue *Backedge = CanonicalIV->getIncomingValue(Idx: 1);
3440 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3441 m_Specific(&Plan.getVFxUF()))) &&
3442 "Unexpected canonical iv");
3443 Backedge->replaceAllUsesWith(New: CurrentIterationIncr);
3444
3445 // Remove unused phi and increment.
3446 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3447 CanonicalIVIncrement->eraseFromParent();
3448 CanonicalIV->eraseFromParent();
3449}
3450
/// Rewrite the latch exit condition of an EVL tail-folded loop: instead of
/// comparing the canonical IV increment against the vector trip count, exit
/// when the next AVL reaches zero.
void VPlanTransforms::convertEVLExitCond(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  // The canonical IV may not exist at this stage.
  if (!LoopRegion ||
      !isa<VPCanonicalIVPHIRecipe>(Val: LoopRegion->getEntryBasicBlock()->front()))
    return;
  VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
  if (std::next(x: CanIV->getIterator()) == CanIV->getParent()->end())
    return;
  // The EVL IV is always immediately after the canonical IV.
  auto *EVLPhi = dyn_cast_or_null<VPCurrentIterationPHIRecipe>(
      Val: std::next(x: CanIV->getIterator()));
  if (!EVLPhi)
    return;

  // Bail if not an EVL tail folded loop.
  VPValue *AVL;
  if (!match(V: EVLPhi->getBackedgeValue(),
             P: m_c_Add(Op0: m_ZExtOrSelf(Op0: m_EVL(Op0: m_VPValue(V&: AVL))), Op1: m_Specific(VPV: EVLPhi))))
    return;

  // The AVL may be capped to a safe distance.
  VPValue *SafeAVL;
  if (match(V: AVL, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: SafeAVL), Op2: m_VPValue())))
    AVL = SafeAVL;

  // Recover the AVL's backedge value from its phi (start is the trip count).
  VPValue *AVLNext;
  [[maybe_unused]] bool FoundAVLNext =
      match(V: AVL, P: m_VPInstruction<Instruction::PHI>(
                      Ops: m_Specific(VPV: Plan.getTripCount()), Ops: m_VPValue(V&: AVLNext)));
  assert(FoundAVLNext && "Didn't find AVL backedge?");

  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
  auto *LatchBr = cast<VPInstruction>(Val: Latch->getTerminator());
  // A branch on a constant true condition needs no rewriting.
  if (match(V: LatchBr, P: m_BranchOnCond(Op0: m_True())))
    return;

  assert(
      match(LatchBr,
            m_BranchOnCond(m_SpecificCmp(
                CmpInst::ICMP_EQ, m_Specific(CanIV->getIncomingValue(1)),
                m_Specific(&Plan.getVectorTripCount())))) &&
      "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
      "trip count");

  // Replace the branch condition with (AVLNext == 0).
  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(V: AVLNext);
  VPBuilder Builder(LatchBr);
  LatchBr->setOperand(
      I: 0, New: Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: AVLNext, B: Plan.getZero(Ty: AVLTy)));
}
3501
/// Replace symbolic stride VPValues that predicated scalar evolution has
/// versioned to constants with constant VPValues, and rewrite expanded SCEVs
/// (including the trip count) in terms of the versioned strides.
void VPlanTransforms::replaceSymbolicStrides(
    VPlan &Plan, PredicatedScalarEvolution &PSE,
    const DenseMap<Value *, const SCEV *> &StridesMap) {
  // Replace VPValues for known constant strides guaranteed by predicate scalar
  // evolution.
  // Only recipes inside a region or in the loop region's single predecessor
  // may use the versioned stride.
  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
    auto *R = cast<VPRecipeBase>(Val: &U);
    return R->getRegion() ||
           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
  };
  ValueToSCEVMapTy RewriteMap;
  for (const SCEV *Stride : StridesMap.values()) {
    using namespace SCEVPatternMatch;
    auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
    const APInt *StrideConst;
    if (!match(S: PSE.getSCEV(V: StrideV), P: m_scev_APInt(C&: StrideConst)))
      // Only handle constant strides for now.
      continue;

    auto *CI = Plan.getConstantInt(Val: *StrideConst);
    if (VPValue *StrideVPV = Plan.getLiveIn(V: StrideV))
      StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);

    // The versioned value may not be used in the loop directly but through a
    // sext/zext. Add new live-ins in those cases.
    for (Value *U : StrideV->users()) {
      if (!isa<SExtInst, ZExtInst>(Val: U))
        continue;
      VPValue *StrideVPV = Plan.getLiveIn(V: U);
      if (!StrideVPV)
        continue;
      // Extend the constant to the width of the sext/zext user.
      unsigned BW = U->getType()->getScalarSizeInBits();
      APInt C =
          isa<SExtInst>(Val: U) ? StrideConst->sext(width: BW) : StrideConst->zext(width: BW);
      VPValue *CI = Plan.getConstantInt(Val: C);
      StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
    }
    RewriteMap[StrideV] = PSE.getSCEV(V: StrideV);
  }

  // Rewrite any SCEV expansions in the entry block that mention the versioned
  // strides, updating the trip count if it was one of them.
  for (VPRecipeBase &R : *Plan.getEntry()) {
    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpSCEV)
      continue;
    const SCEV *ScevExpr = ExpSCEV->getSCEV();
    auto *NewSCEV =
        SCEVParameterRewriter::rewrite(Scev: ScevExpr, SE&: *PSE.getSE(), Map&: RewriteMap);
    if (NewSCEV != ScevExpr) {
      VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: NewSCEV);
      ExpSCEV->replaceAllUsesWith(New: NewExp);
      if (Plan.getTripCount() == ExpSCEV)
        Plan.resetTripCount(NewTripCount: NewExp);
    }
  }
}
3557
/// Drop poison-generating flags from recipes in the backward slice of the
/// address computation of consecutive widened memory accesses (and interleave
/// groups) whose underlying blocks need predication.
void VPlanTransforms::dropPoisonGeneratingRecipes(
    VPlan &Plan,
    const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Elt: Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.pop_back_val();

      // Visited is shared across roots; a recipe is processed at most once.
      if (!Visited.insert(Ptr: CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
              VPHeaderPHIRecipe>(Val: CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
        VPValue *A, *B;
        // Dropping disjoint from an OR may yield incorrect results, as some
        // analysis may have converted it to an Add implicitly (e.g. SCEV used
        // for dependence analysis). Instead, replace it with an equivalent Add.
        // This is possible as all users of the disjoint OR only access lanes
        // where the operands are disjoint or poison otherwise.
        if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
            RecWithFlags->isDisjoint()) {
          VPBuilder Builder(RecWithFlags);
          VPInstruction *New =
              Builder.createAdd(LHS: A, RHS: B, DL: RecWithFlags->getDebugLoc());
          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
          RecWithFlags->replaceAllUsesWith(New);
          RecWithFlags->eraseFromParent();
          // Continue the traversal through the replacement recipe.
          CurRec = New;
        } else
          RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        Instruction *Instr = dyn_cast_or_null<Instruction>(
            Val: CurRec->getVPSingleValue()->getUnderlyingValue());
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *Operand : CurRec->operands())
        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
          Worklist.push_back(Elt: OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(G: Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            BlockNeedsPredication(UnderlyingInstr.getParent()))
          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(Index: I);
            if (Member)
              NeedPredication |= BlockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}
3653
/// For each interleave group in \p InterleaveGroups, replace the recipes
/// widening its member memory instructions with a single VPInterleaveRecipe
/// at the group's insert position, building a shared (possibly adjusted)
/// start address.
void VPlanTransforms::createInterleaveGroups(
    VPlan &Plan,
    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
        &InterleaveGroups,
    VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
  if (InterleaveGroups.empty())
    return;

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  VPDominatorTree VPDT(Plan);
  for (const auto *IG : InterleaveGroups) {
    // Gather stored values (for store groups) and intersect the metadata of
    // all members, starting from member zero.
    auto *Start =
        cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0)));
    VPIRMetadata InterleaveMD(*Start);
    SmallVector<VPValue *, 4> StoredValues;
    if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: Start))
      StoredValues.push_back(Elt: StoreR->getStoredValue());
    for (unsigned I = 1; I < IG->getFactor(); ++I) {
      Instruction *MemberI = IG->getMember(Index: I);
      if (!MemberI)
        continue;
      VPWidenMemoryRecipe *MemoryR =
          cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: MemberI));
      if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: MemoryR))
        StoredValues.push_back(Elt: StoreR->getStoredValue());
      InterleaveMD.intersect(MD: *MemoryR);
    }

    // A mask for gaps is needed when the scalar epilogue that would handle
    // them is not allowed, or when storing a group with gaps.
    bool NeedsMaskForGaps =
        (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
        (!StoredValues.empty() && !IG->isFull());

    Instruction *IRInsertPos = IG->getInsertPos();
    auto *InsertPos =
        cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos));

    // Transfer GEP no-wrap flags from the insert position's pointer, dropping
    // nuw.
    GEPNoWrapFlags NW = GEPNoWrapFlags::none();
    if (auto *Gep = dyn_cast<GetElementPtrInst>(
            Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
      NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();

    // Get or create the start address for the interleave group.
    VPValue *Addr = Start->getAddr();
    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
    if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) {
      // We cannot re-use the address of member zero because it does not
      // dominate the insert position. Instead, use the address of the insert
      // position and create a PtrAdd adjusting it to the address of member
      // zero.
      // TODO: Hoist Addr's defining recipe (and any operands as needed) to
      // InsertPos or sink loads above zero members to join it.
      assert(IG->getIndex(IRInsertPos) != 0 &&
             "index of insert position shouldn't be zero");
      auto &DL = IRInsertPos->getDataLayout();
      APInt Offset(32,
                   DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
                       IG->getIndex(Instr: IRInsertPos),
                   /*IsSigned=*/true);
      VPValue *OffsetVPV = Plan.getConstantInt(Val: -Offset);
      VPBuilder B(InsertPos);
      Addr = B.createNoWrapPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV, GEPFlags: NW);
    }
    // If the group is reverse, adjust the index to refer to the last vector
    // lane instead of the first. We adjust the index from the first vector
    // lane, rather than directly getting the pointer for lane VF - 1, because
    // the pointer operand of the interleaved access is supposed to be uniform.
    if (IG->isReverse()) {
      auto *ReversePtr = new VPVectorEndPointerRecipe(
          Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos),
          -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
      ReversePtr->insertBefore(InsertPos);
      Addr = ReversePtr;
    }
    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
                                        InsertPos->getMask(), NeedsMaskForGaps,
                                        InterleaveMD, InsertPos->getDebugLoc());
    VPIG->insertBefore(InsertPos);

    // Rewire each member's result to the corresponding value produced by the
    // interleave recipe, then erase the member recipes.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(Index: i)) {
        VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = MemberR->getVPSingleValue();
          OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
          J++;
        }
        MemberR->eraseFromParent();
      }
  }
}
3747
3748/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3749/// value, phi and backedge value. In the following example:
3750///
3751/// vector.ph:
3752/// Successor(s): vector loop
3753///
3754/// <x1> vector loop: {
3755/// vector.body:
3756/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3757/// ...
3758/// EMIT branch-on-count ...
3759/// No successors
3760/// }
3761///
3762/// WIDEN-INDUCTION will get expanded to:
3763///
3764/// vector.ph:
3765/// ...
3766/// vp<%induction.start> = ...
3767/// vp<%induction.increment> = ...
3768///
3769/// Successor(s): vector loop
3770///
3771/// <x1> vector loop: {
3772/// vector.body:
3773/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3774/// ...
3775/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3776/// EMIT branch-on-count ...
3777/// No successors
3778/// }
static void
expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
                              VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = WidenIVR->getParent()->getPlan();
  VPValue *Start = WidenIVR->getStartValue();
  VPValue *Step = WidenIVR->getStepValue();
  VPValue *VF = WidenIVR->getVFValue();
  DebugLoc DL = WidenIVR->getDebugLoc();

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Type *Ty = TypeInfo.inferScalarType(V: WidenIVR);

  const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  // Pick integer or floating-point opcodes for the expansion based on the
  // induction kind; FP inductions use the descriptor's own add/sub opcode.
  VPIRFlags Flags = *WidenIVR;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // If the phi is truncated, truncate the start and step values.
  VPBuilder Builder(Plan->getVectorPreheader());
  Type *StepTy = TypeInfo.inferScalarType(V: Step);
  if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL);
    Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL);
    // Truncation doesn't preserve WrapFlags.
    Flags.dropPoisonGeneratingFlags();
    StepTy = Ty;
  }

  // Construct the initial value of the vector IV in the vector loop preheader:
  // splat(Start) + StepVector * splat(Step).
  Type *IVIntTy =
      IntegerType::get(C&: Plan->getContext(), NumBits: StepTy->getScalarSizeInBits());
  VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy);
  // StepVector is always integer; convert for FP inductions.
  if (StepTy->isFloatingPointTy())
    Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy);

  VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start);
  VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);

  Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags);
  Init = Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags,
                              DL: DebugLoc::getUnknown(), Name: "induction");

  // Create the widened phi of the vector IV.
  auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
                                       WidenIVR->getDebugLoc(), "vec.ind");
  WidePHI->insertBefore(InsertPos: WidenIVR);

  // Create the backedge value for the vector IV.
  VPValue *Inc;
  VPValue *Prev;
  // If unrolled, use the increment and prev value from the operands.
  if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
    Inc = SplatVF;
    Prev = WidenIVR->getLastUnrolledPartOperand();
  } else {
    // Place the increment right after VF's defining recipe (if any), so the
    // new recipes are dominated by their VF operand.
    if (VPRecipeBase *R = VF->getDefiningRecipe())
      Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
    // Multiply the vectorization factor by the step using integer or
    // floating-point arithmetic as appropriate.
    if (StepTy->isFloatingPointTy())
      VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy,
                                    DL);
    else
      VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy,
                                           SrcTy: TypeInfo.inferScalarType(V: VF), DL);

    Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags);
    Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc);
    Prev = WidePHI;
  }

  // Emit the per-iteration increment in the exiting block and wire it up as
  // the phi's backedge value.
  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
  Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
  auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags,
                                    DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next");

  WidePHI->addOperand(Operand: Next);

  WidenIVR->replaceAllUsesWith(New: WidePHI);
}
3868
3869/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3870/// initial value, phi and backedge value. In the following example:
3871///
3872/// <x1> vector loop: {
3873/// vector.body:
3874/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3875/// ...
3876/// EMIT branch-on-count ...
3877/// }
3878///
3879/// WIDEN-POINTER-INDUCTION will get expanded to:
3880///
3881/// <x1> vector loop: {
3882/// vector.body:
3883/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3884/// EMIT %mul = mul %stepvector, %step
3885/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3886/// ...
3887/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3888/// EMIT branch-on-count ...
3889/// }
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
                                          VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = R->getParent()->getPlan();
  VPValue *Start = R->getStartValue();
  VPValue *Step = R->getStepValue();
  VPValue *VF = R->getVFValue();

  assert(R->getInductionDescriptor().getKind() ==
             InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
  assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
         "Recipe should have been replaced");

  VPBuilder Builder(R);
  DebugLoc DL = R->getDebugLoc();

  // Build a scalar pointer phi.
  VPPhi *ScalarPtrPhi = Builder.createScalarPhi(IncomingValues: Start, DL, Name: "pointer.phi");

  // Create actual address geps that use the pointer phi as base and a
  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
  // Insert after all phis in the block.
  Builder.setInsertPoint(TheBB: R->getParent(), IP: R->getParent()->getFirstNonPhi());
  Type *StepTy = TypeInfo.inferScalarType(V: Step);
  VPValue *Offset = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: StepTy);
  Offset = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Offset, Step});
  VPValue *PtrAdd =
      Builder.createWidePtrAdd(Ptr: ScalarPtrPhi, Offset, DL, Name: "vector.gep");
  R->replaceAllUsesWith(New: PtrAdd);

  // Create the backedge value for the scalar pointer phi: advance the phi by
  // Step * VF each iteration, emitted in the exiting block.
  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
  Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
  // Match VF's width to the step type before multiplying.
  VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy, SrcTy: TypeInfo.inferScalarType(V: VF),
                                       DL);
  VPValue *Inc = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Step, VF});

  VPValue *InductionGEP =
      Builder.createPtrAdd(Ptr: ScalarPtrPhi, Offset: Inc, DL, Name: "ptr.ind");
  ScalarPtrPhi->addOperand(Operand: InductionGEP);
}
3931
3932void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3933 // Replace loop regions with explicity CFG.
3934 SmallVector<VPRegionBlock *> LoopRegions;
3935 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3936 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
3937 if (!R->isReplicator())
3938 LoopRegions.push_back(Elt: R);
3939 }
3940 for (VPRegionBlock *R : LoopRegions)
3941 R->dissolveToCFGLoop();
3942}
3943
3944void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
3945 SmallVector<VPInstruction *> WorkList;
3946 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3947 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3948 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3949 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
3950 if (!VPBB->empty() && match(V: &VPBB->back(), P: m_BranchOnTwoConds()))
3951 WorkList.push_back(Elt: cast<VPInstruction>(Val: &VPBB->back()));
3952 }
3953
3954 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3955 // single-condition branches:
3956 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3957 // the first condition is true, and otherwise jumps to a new interim block.
3958 // 2. A branch that ends the interim block, jumps to the second successor if
3959 // the second condition is true, and otherwise jumps to the third
3960 // successor.
3961 for (VPInstruction *Br : WorkList) {
3962 assert(Br->getNumOperands() == 2 &&
3963 "BranchOnTwoConds must have exactly 2 conditions");
3964 DebugLoc DL = Br->getDebugLoc();
3965 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3966 const auto Successors = to_vector(Range&: BrOnTwoCondsBB->getSuccessors());
3967 assert(Successors.size() == 3 &&
3968 "BranchOnTwoConds must have exactly 3 successors");
3969
3970 for (VPBlockBase *Succ : Successors)
3971 VPBlockUtils::disconnectBlocks(From: BrOnTwoCondsBB, To: Succ);
3972
3973 VPValue *Cond0 = Br->getOperand(N: 0);
3974 VPValue *Cond1 = Br->getOperand(N: 1);
3975 VPBlockBase *Succ0 = Successors[0];
3976 VPBlockBase *Succ1 = Successors[1];
3977 VPBlockBase *Succ2 = Successors[2];
3978 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3979 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3980
3981 VPBasicBlock *InterimBB =
3982 Plan.createVPBasicBlock(Name: BrOnTwoCondsBB->getName() + ".interim");
3983
3984 VPBuilder(BrOnTwoCondsBB)
3985 .createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond0}, DL);
3986 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: Succ0);
3987 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: InterimBB);
3988
3989 VPBuilder(InterimBB).createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond1}, DL);
3990 VPBlockUtils::connectBlocks(From: InterimBB, To: Succ1);
3991 VPBlockUtils::connectBlocks(From: InterimBB, To: Succ2);
3992 Br->eraseFromParent();
3993 }
3994}
3995
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
  VPTypeAnalysis TypeInfo(Plan);
  // Recipes are unlinked after the loop; collecting them here keeps iteration
  // over the blocks safe while replacements are inserted.
  SmallVector<VPRecipeBase *> ToRemove;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_deep(G: Plan.getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) {
        expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
        ToRemove.push_back(Elt: WidenIVR);
        continue;
      }

      if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) {
        // If the recipe only generates scalars, scalarize it instead of
        // expanding it.
        if (WidenIVR->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF())) {
          VPBuilder Builder(WidenIVR);
          VPValue *PtrAdd =
              scalarizeVPWidenPointerInduction(PtrIV: WidenIVR, Plan, Builder);
          WidenIVR->replaceAllUsesWith(New: PtrAdd);
          ToRemove.push_back(Elt: WidenIVR);
          continue;
        }
        expandVPWidenPointerInduction(R: WidenIVR, TypeInfo);
        ToRemove.push_back(Elt: WidenIVR);
        continue;
      }

      // Builder for the remaining lowerings below; inserts before R.
      // Expand VPBlendRecipe into VPInstruction::Select.
      VPBuilder Builder(&R);
      if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R)) {
        // Chain selects: later incoming values take priority when their mask
        // is set.
        VPValue *Select = Blend->getIncomingValue(Idx: 0);
        for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
          Select = Builder.createSelect(Cond: Blend->getMask(Idx: I),
                                        TrueVal: Blend->getIncomingValue(Idx: I), FalseVal: Select,
                                        DL: R.getDebugLoc(), Name: "predphi", Flags: *Blend);
        Blend->replaceAllUsesWith(New: Select);
        ToRemove.push_back(Elt: Blend);
      }

      if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
        if (!VEPR->getOffset()) {
          assert(Plan.getConcreteUF() == 1 &&
                 "Expected unroller to have materialized offset for UF != 1");
          VEPR->materializeOffset();
        }
      }

      if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) {
        Expr->decompose();
        ToRemove.push_back(Elt: Expr);
      }

      // Expand LastActiveLane into Not + FirstActiveLane + Sub.
      auto *LastActiveL = dyn_cast<VPInstruction>(Val: &R);
      if (LastActiveL &&
          LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
        // Create Not(Mask) for all operands.
        SmallVector<VPValue *, 2> NotMasks;
        for (VPValue *Op : LastActiveL->operands()) {
          VPValue *NotMask = Builder.createNot(Operand: Op, DL: LastActiveL->getDebugLoc());
          NotMasks.push_back(Elt: NotMask);
        }

        // Create FirstActiveLane on the inverted masks.
        VPValue *FirstInactiveLane = Builder.createNaryOp(
            Opcode: VPInstruction::FirstActiveLane, Operands: NotMasks,
            DL: LastActiveL->getDebugLoc(), Name: "first.inactive.lane");

        // Subtract 1 to get the last active lane.
        VPValue *One = Plan.getConstantInt(BitWidth: 64, Val: 1);
        VPValue *LastLane =
            Builder.createSub(LHS: FirstInactiveLane, RHS: One,
                              DL: LastActiveL->getDebugLoc(), Name: "last.active.lane");

        LastActiveL->replaceAllUsesWith(New: LastLane);
        ToRemove.push_back(Elt: LastActiveL);
        continue;
      }

      // Lower MaskedCond with block mask to LogicalAnd.
      if (match(V: &R, P: m_VPInstruction<VPInstruction::MaskedCond>())) {
        auto *VPI = cast<VPInstruction>(Val: &R);
        assert(VPI->isMasked() &&
               "Unmasked MaskedCond should be simplified earlier");
        VPI->replaceAllUsesWith(New: Builder.createNaryOp(
            Opcode: VPInstruction::LogicalAnd, Operands: {VPI->getOperand(N: 0), VPI->getMask()}));
        ToRemove.push_back(Elt: VPI);
        continue;
      }

      // Lower BranchOnCount to ICmp + BranchOnCond.
      VPValue *IV, *TC;
      if (match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(V&: IV), Op1: m_VPValue(V&: TC)))) {
        auto *BranchOnCountInst = cast<VPInstruction>(Val: &R);
        DebugLoc DL = BranchOnCountInst->getDebugLoc();
        VPValue *Cond = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: IV, B: TC, DL);
        Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: Cond, DL);
        ToRemove.push_back(Elt: BranchOnCountInst);
        continue;
      }

      VPValue *VectorStep;
      VPValue *ScalarStep;
      if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
                     Ops: m_VPValue(V&: VectorStep), Ops: m_VPValue(V&: ScalarStep))))
        continue;

      // Expand WideIVStep into (possibly cast) VectorStep * ScalarStep.
      auto *VPI = cast<VPInstruction>(Val: &R);
      Type *IVTy = TypeInfo.inferScalarType(V: VPI);
      // Convert the vector step to the IV type first (UIToFP for FP IVs,
      // Trunc otherwise).
      if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) {
        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
                                          ? Instruction::UIToFP
                                          : Instruction::Trunc;
        VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
      }

      assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
      if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) {
        ScalarStep =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
      }

      // FP multiplies carry the instruction's fast-math flags; integer
      // multiplies use the opcode's default flags.
      VPIRFlags Flags;
      unsigned MulOpc;
      if (IVTy->isFloatingPointTy()) {
        MulOpc = Instruction::FMul;
        Flags = VPI->getFastMathFlags();
      } else {
        MulOpc = Instruction::Mul;
        Flags = VPIRFlags::getDefaultFlags(Opcode: MulOpc);
      }

      VPInstruction *Mul = Builder.createNaryOp(
          Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
      VectorStep = Mul;
      VPI->replaceAllUsesWith(New: VectorStep);
      ToRemove.push_back(Elt: VPI);
    }
  }

  for (VPRecipeBase *R : ToRemove)
    R->eraseFromParent();
}
4141
void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
                                                  VPBasicBlock *HeaderVPBB,
                                                  VPBasicBlock *LatchVPBB,
                                                  VPBasicBlock *MiddleVPBB) {
  // Per-exit bookkeeping: the exiting block inside the loop, the IR exit
  // block it targets, and the (masked) condition under which it is taken.
  struct EarlyExitInfo {
    VPBasicBlock *EarlyExitingVPBB;
    VPIRBasicBlock *EarlyExitVPBB;
    VPValue *CondToExit;
  };

  VPDominatorTree VPDT(Plan);
  VPBuilder Builder(LatchVPBB->getTerminator());
  SmallVector<EarlyExitInfo> Exits;
  for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
    // Copy the predecessor list, as it is modified while iterating below.
    for (VPBlockBase *Pred : to_vector(Range&: ExitBlock->getPredecessors())) {
      if (Pred == MiddleVPBB)
        continue;
      // Collect condition for this early exit.
      auto *EarlyExitingVPBB = cast<VPBasicBlock>(Val: Pred);
      VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
      VPValue *CondOfEarlyExitingVPBB;
      [[maybe_unused]] bool Matched =
          match(V: EarlyExitingVPBB->getTerminator(),
                P: m_BranchOnCond(Op0: m_VPValue(V&: CondOfEarlyExitingVPBB)));
      assert(Matched && "Terminator must be BranchOnCond");

      // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
      // the correct block mask. Negate the branch condition if the exit is
      // taken on the false edge.
      VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
      auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
          Opcode: VPInstruction::MaskedCond,
          Operands: TrueSucc == ExitBlock
              ? CondOfEarlyExitingVPBB
              : EarlyExitingBuilder.createNot(Operand: CondOfEarlyExitingVPBB));
      assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
              !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
              VPDT.properlyDominates(
                  CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
                  LatchVPBB)) &&
             "exit condition must dominate the latch");
      Exits.push_back(Elt: {
          .EarlyExitingVPBB: EarlyExitingVPBB,
          .EarlyExitVPBB: ExitBlock,
          .CondToExit: CondToEarlyExit,
      });
    }
  }

  assert(!Exits.empty() && "must have at least one early exit");
  // Sort exits by RPO order to get correct program order. RPO gives a
  // topological ordering of the CFG, ensuring upstream exits are checked
  // before downstream exits in the dispatch chain.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      HeaderVPBB);
  DenseMap<VPBlockBase *, unsigned> RPOIdx;
  for (const auto &[Num, VPB] : enumerate(First&: RPOT))
    RPOIdx[VPB] = Num;
  llvm::sort(C&: Exits, Comp: [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
    return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
  });
#ifndef NDEBUG
  // After RPO sorting, verify that for any pair where one exit dominates
  // another, the dominating exit comes first. This is guaranteed by RPO
  // (topological order) and is required for the dispatch chain correctness.
  for (unsigned I = 0; I + 1 < Exits.size(); ++I)
    for (unsigned J = I + 1; J < Exits.size(); ++J)
      assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
                                     Exits[I].EarlyExitingVPBB) &&
             "RPO sort must place dominating exits before dominated ones");
#endif

  // Build the AnyOf condition for the latch terminator using logical OR
  // to avoid poison propagation from later exit conditions when an earlier
  // exit is taken.
  VPValue *Combined = Exits[0].CondToExit;
  for (const EarlyExitInfo &Info : drop_begin(RangeOrContainer&: Exits))
    Combined = Builder.createLogicalOr(LHS: Combined, RHS: Info.CondToExit);

  VPValue *IsAnyExitTaken =
      Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {Combined});

  // Create the vector.early.exit blocks.
  SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
  for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
    // Only number the blocks when there is more than one exit.
    Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
    VPBasicBlock *VectorEarlyExitVPBB =
        Plan.createVPBasicBlock(Name: "vector.early.exit" + BlockSuffix);
    VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
  }

  // Create the dispatch block (or reuse the single exit block if only one
  // exit). The dispatch block computes the first active lane of the combined
  // condition and, for multiple exits, chains through conditions to determine
  // which exit to take.
  VPBasicBlock *DispatchVPBB =
      Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
                        : Plan.createVPBasicBlock(Name: "vector.early.exit.check");
  VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
  VPValue *FirstActiveLane =
      DispatchBuilder.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: {Combined},
                                   DL: DebugLoc::getUnknown(), Name: "first.active.lane");

  // For each early exit, disconnect the original exiting block
  // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
  // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
  // values at the first active lane:
  //
  // Input:
  //  early.exiting.I:
  //    ...
  //    EMIT branch-on-cond vp<%cond.I>
  //  Successor(s): in.loop.succ, ir-bb<exit.I>
  //
  //  ir-bb<exit.I>:
  //    IR   %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
  //
  // Output:
  //  early.exiting.I:
  //    ...
  //  Successor(s): in.loop.succ
  //
  //  vector.early.exit.I:
  //    EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
  //  Successor(s): ir-bb<exit.I>
  //
  //  ir-bb<exit.I>:
  //    IR   %phi = phi ... (extra operand: vp<%exit.val> from
  //    vector.early.exit.I)
  //
  for (auto [Exit, VectorEarlyExitVPBB] :
       zip_equal(t&: Exits, u&: VectorEarlyExitVPBBs)) {
    auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
    // Adjust the phi nodes in EarlyExitVPBB.
    //  1. remove incoming values from EarlyExitingVPBB,
    //  2. extract the incoming value at FirstActiveLane
    //  3. add back the extracts as last operands for the phis
    // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
    // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
    // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
    // values from VectorEarlyExitVPBB.
    for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(Val: &R);
      VPValue *IncomingVal =
          ExitIRI->getIncomingValueForBlock(VPBB: EarlyExitingVPBB);
      VPValue *NewIncoming = IncomingVal;
      // VPIRValues are uniform; only non-uniform incoming values need a
      // per-lane extract.
      if (!isa<VPIRValue>(Val: IncomingVal)) {
        VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
        NewIncoming = EarlyExitBuilder.createNaryOp(
            Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, IncomingVal},
            DL: DebugLoc::getUnknown(), Name: "early.exit.value");
      }
      ExitIRI->removeIncomingValueFor(IncomingBlock: EarlyExitingVPBB);
      ExitIRI->addOperand(Operand: NewIncoming);
    }

    EarlyExitingVPBB->getTerminator()->eraseFromParent();
    VPBlockUtils::disconnectBlocks(From: EarlyExitingVPBB, To: EarlyExitVPBB);
    VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
  }

  // Chain through exits: for each exit, check if its condition is true at
  // the first active lane. If so, take that exit; otherwise, try the next.
  // The last exit needs no check since it must be taken if all others fail.
  //
  // For 3 exits (cond.0, cond.1, cond.2), this creates:
  //
  //  latch:
  //    ...
  //    EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
  //    ...
  //
  //  vector.early.exit.check:
  //    EMIT vp<%first.lane> = first-active-lane vp<%combined>
  //    EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
  //    EMIT branch-on-cond vp<%at.cond.0>
  //  Successor(s): vector.early.exit.0, vector.early.exit.check.0
  //
  //  vector.early.exit.check.0:
  //    EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
  //    EMIT branch-on-cond vp<%at.cond.1>
  //  Successor(s): vector.early.exit.1, vector.early.exit.2
  VPBasicBlock *CurrentBB = DispatchVPBB;
  for (auto [I, Exit] : enumerate(First: ArrayRef(Exits).drop_back())) {
    VPValue *LaneVal = DispatchBuilder.createNaryOp(
        Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, Exit.CondToExit},
        DL: DebugLoc::getUnknown(), Name: "exit.cond.at.lane");

    // For the last dispatch, branch directly to the last exit on false;
    // otherwise, create a new check block.
    bool IsLastDispatch = (I + 2 == Exits.size());
    VPBasicBlock *FalseBB =
        IsLastDispatch ? VectorEarlyExitVPBBs.back()
                       : Plan.createVPBasicBlock(
                             Name: Twine("vector.early.exit.check.") + Twine(I));

    DispatchBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {LaneVal});
    CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
    VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
    FalseBB->setPredecessors({CurrentBB});

    CurrentBB = FalseBB;
    DispatchBuilder.setInsertPoint(CurrentBB);
  }

  // Replace the latch terminator with the new branching logic: exit to the
  // dispatch block if any early exit was taken, to the middle block if the
  // trip count is reached, and to the header otherwise.
  auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
         "Unexpected terminator");
  auto *IsLatchExitTaken =
      Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
                         B: LatchExitingBranch->getOperand(N: 1));

  DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
  LatchExitingBranch->eraseFromParent();
  Builder.setInsertPoint(LatchVPBB);
  Builder.createNaryOp(Opcode: VPInstruction::BranchOnTwoConds,
                       Operands: {IsAnyExitTaken, IsLatchExitTaken}, DL: LatchDL);
  LatchVPBB->clearSuccessors();
  LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
  DispatchVPBB->setPredecessors({LatchVPBB});
}
4363
/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
/// valid. The created recipe must be decomposed to its constituent
/// recipes before execution.
static VPExpressionRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
                                     VFRange &Range) {
  Type *RedTy = Ctx.Types.inferScalarType(V: Red);
  VPValue *VecOp = Red->getVecOp();

  // Clamp the range if using extended-reduction is profitable, i.e. the
  // combined ext+reduce cost beats the cost of the separate ext and reduce.
  auto IsExtendedRedValidAndClampRange =
      [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
    return LoopVectorizationPlanner::getDecisionAndClampRange(
        Predicate: [&](ElementCount VF) {
          auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

          // Invalid unless a supported cost query below provides a value.
          InstructionCost ExtRedCost = InstructionCost::getInvalid();
          InstructionCost ExtCost =
              cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);

          if (Red->isPartialReduction()) {
            TargetTransformInfo::PartialReductionExtendKind ExtKind =
                TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtOpc);
            // FIXME: Move partial reduction creation, costing and clamping
            // here from LoopVectorize.cpp.
            ExtRedCost = Ctx.TTI.getPartialReductionCost(
                Opcode, InputTypeA: SrcTy, InputTypeB: nullptr, AccumType: RedTy, VF, OpAExtend: ExtKind,
                OpBExtend: llvm::TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
                FMF: RedTy->isFloatingPointTy()
                    ? std::optional{Red->getFastMathFlags()}
                    : std::nullopt);
          } else if (!RedTy->isFloatingPointTy()) {
            // TTI::getExtendedReductionCost only supports integer types.
            ExtRedCost = Ctx.TTI.getExtendedReductionCost(
                Opcode, IsUnsigned: ExtOpc == Instruction::CastOps::ZExt, ResTy: RedTy, Ty: SrcVecTy,
                FMF: Red->getFastMathFlags(), CostKind);
          }
          return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
        },
        Range);
  };

  VPValue *A;
  // Match reduce(ext(A)).
  if (isa<VPWidenCastRecipe>(Val: VecOp) &&
      (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) ||
       match(V: VecOp, P: m_FPExt(Op0: m_VPValue(V&: A)))) &&
      IsExtendedRedValidAndClampRange(
          RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
          cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode(),
          Ctx.Types.inferScalarType(V: A)))
    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);

  return nullptr;
}
4422
/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial
/// and valid. The created VPExpressionRecipe must be decomposed to its
/// constituent recipes before execution. Patterns of the
/// VPExpressionRecipe:
///   reduce.add(mul(...)),
///   reduce.add(mul(ext(A), ext(B))),
///   reduce.add(ext(mul(ext(A), ext(B)))),
///   reduce.fadd(fmul(ext(A), ext(B)))
4432static VPExpressionRecipe *
4433tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
4434 VPCostContext &Ctx, VFRange &Range) {
4435 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
4436 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4437 Opcode != Instruction::FAdd)
4438 return nullptr;
4439
4440 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
4441
4442 // Clamp the range if using multiply-accumulate-reduction is profitable.
4443 auto IsMulAccValidAndClampRange =
4444 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
4445 VPWidenCastRecipe *OuterExt) -> bool {
4446 return LoopVectorizationPlanner::getDecisionAndClampRange(
4447 Predicate: [&](ElementCount VF) {
4448 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4449 Type *SrcTy =
4450 Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy;
4451 InstructionCost MulAccCost;
4452
4453 if (Red->isPartialReduction()) {
4454 Type *SrcTy2 =
4455 Ext1 ? Ctx.Types.inferScalarType(V: Ext1->getOperand(N: 0)) : nullptr;
4456 // FIXME: Move partial reduction creation, costing and clamping
4457 // here from LoopVectorize.cpp.
4458 MulAccCost = Ctx.TTI.getPartialReductionCost(
4459 Opcode, InputTypeA: SrcTy, InputTypeB: SrcTy2, AccumType: RedTy, VF,
4460 OpAExtend: Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
4461 CastOpc: Ext0->getOpcode())
4462 : TargetTransformInfo::PR_None,
4463 OpBExtend: Ext1 ? TargetTransformInfo::getPartialReductionExtendKind(
4464 CastOpc: Ext1->getOpcode())
4465 : TargetTransformInfo::PR_None,
4466 BinOp: Mul->getOpcode(), CostKind,
4467 FMF: RedTy->isFloatingPointTy()
4468 ? std::optional{Red->getFastMathFlags()}
4469 : std::nullopt);
4470 } else {
4471 // Only partial reductions support mixed or floating-point extends
4472 // at the moment.
4473 if (Ext0 && Ext1 &&
4474 (Ext0->getOpcode() != Ext1->getOpcode() ||
4475 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4476 return false;
4477
4478 bool IsZExt =
4479 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4480 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
4481 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsUnsigned: IsZExt, RedOpcode: Opcode, ResTy: RedTy,
4482 Ty: SrcVecTy, CostKind);
4483 }
4484
4485 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4486 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4487 InstructionCost ExtCost = 0;
4488 if (Ext0)
4489 ExtCost += Ext0->computeCost(VF, Ctx);
4490 if (Ext1)
4491 ExtCost += Ext1->computeCost(VF, Ctx);
4492 if (OuterExt)
4493 ExtCost += OuterExt->computeCost(VF, Ctx);
4494
4495 return MulAccCost.isValid() &&
4496 MulAccCost < ExtCost + MulCost + RedCost;
4497 },
4498 Range);
4499 };
4500
4501 VPValue *VecOp = Red->getVecOp();
4502 VPRecipeBase *Sub = nullptr;
4503 VPValue *A, *B;
4504 VPValue *Tmp = nullptr;
4505
4506 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4507 if (match(V: VecOp, P: m_FMul(Op0: m_FPExt(Op0: m_VPValue()), Op1: m_FPExt(Op0: m_VPValue())))) {
4508 assert(Opcode == Instruction::FAdd &&
4509 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4510 "instruction");
4511 auto *FMul = dyn_cast<VPWidenRecipe>(Val: VecOp);
4512 if (!FMul)
4513 return nullptr;
4514
4515 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 0));
4516 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 1));
4517
4518 if (RecipeA && RecipeB &&
4519 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4520 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4521 }
4522 }
4523 if (RedTy->isFloatingPointTy())
4524 return nullptr;
4525
4526 // Sub reductions could have a sub between the add reduction and vec op.
4527 if (match(V: VecOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: Tmp)))) {
4528 Sub = VecOp->getDefiningRecipe();
4529 VecOp = Tmp;
4530 }
4531
4532 // If ValB is a constant and can be safely extended, truncate it to the same
4533 // type as ExtA's operand, then extend it to the same type as ExtA. This
4534 // creates two uniform extends that can more easily be matched by the rest of
4535 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4536 // replaced with the new extend of the constant.
4537 auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4538 VPWidenCastRecipe *&ExtB,
4539 VPValue *&ValB,
4540 VPWidenRecipe *Mul) {
4541 if (!ExtA || ExtB || !isa<VPIRValue>(Val: ValB) || Red->isPartialReduction())
4542 return;
4543 Type *NarrowTy = Ctx.Types.inferScalarType(V: ExtA->getOperand(N: 0));
4544 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4545 const APInt *Const;
4546 if (!match(V: ValB, P: m_APInt(C&: Const)) ||
4547 !llvm::canConstantBeExtended(
4548 C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
4549 return;
4550 // The truncate ensures that the type of each extended operand is the
4551 // same, and it's been proven that the constant can be extended from
4552 // NarrowTy safely. Necessary since ExtA's extended operand would be
4553 // e.g. an i8, while the const will likely be an i32. This will be
4554 // elided by later optimisations.
4555 VPBuilder Builder(Mul);
4556 auto *Trunc =
4557 Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc, Op: ValB, ResultTy: NarrowTy);
4558 Type *WideTy = Ctx.Types.inferScalarType(V: ExtA);
4559 ValB = ExtB = Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy);
4560 Mul->setOperand(I: 1, New: ExtB);
4561 };
4562
4563 // Try to match reduce.add(mul(...)).
4564 if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
4565 auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
4566 auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);
4567 auto *Mul = cast<VPWidenRecipe>(Val: VecOp);
4568
4569 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4570 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4571
4572 // Match reduce.add/sub(mul(ext, ext)).
4573 if (RecipeA && RecipeB && match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4574 match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4575 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4576 if (Sub)
4577 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4578 cast<VPWidenRecipe>(Val: Sub), Red);
4579 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4580 }
4581 // TODO: Add an expression type for this variant with a negated mul
4582 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4583 return new VPExpressionRecipe(Mul, Red);
4584 }
4585 // TODO: Add an expression type for negated versions of other expression
4586 // variants.
4587 if (Sub)
4588 return nullptr;
4589
4590 // Match reduce.add(ext(mul(A, B))).
4591 if (!Red->isPartialReduction() &&
4592 match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))))) {
4593 auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp);
4594 auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0));
4595 auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
4596 auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);
4597
4598 // reduce.add(ext(mul(ext, const)))
4599 // -> reduce.add(ext(mul(ext, ext(const))))
4600 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4601
4602 // reduce.add(ext(mul(ext(A), ext(B))))
4603 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4604 // The inner extends must either have the same opcode as the outer extend or
4605 // be the same, in which case the multiply can never result in a negative
4606 // value and the outer extend can be folded away by doing wider
4607 // extends for the operands of the mul.
4608 if (Ext0 && Ext1 &&
4609 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4610 Ext0->getOpcode() == Ext1->getOpcode() &&
4611 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4612 auto *NewExt0 = new VPWidenCastRecipe(
4613 Ext0->getOpcode(), Ext0->getOperand(N: 0), Ext->getResultType(), nullptr,
4614 *Ext0, *Ext0, Ext0->getDebugLoc());
4615 NewExt0->insertBefore(InsertPos: Ext0);
4616
4617 VPWidenCastRecipe *NewExt1 = NewExt0;
4618 if (Ext0 != Ext1) {
4619 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(N: 0),
4620 Ext->getResultType(), nullptr, *Ext1,
4621 *Ext1, Ext1->getDebugLoc());
4622 NewExt1->insertBefore(InsertPos: Ext1);
4623 }
4624 Mul->setOperand(I: 0, New: NewExt0);
4625 Mul->setOperand(I: 1, New: NewExt1);
4626 Red->setOperand(I: 1, New: Mul);
4627 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4628 }
4629 }
4630 return nullptr;
4631}
4632
4633/// This function tries to create abstract recipes from the reduction recipe for
4634/// following optimizations and cost estimation.
4635static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
4636 VPCostContext &Ctx,
4637 VFRange &Range) {
4638 VPExpressionRecipe *AbstractR = nullptr;
4639 auto IP = std::next(x: Red->getIterator());
4640 auto *VPBB = Red->getParent();
4641 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4642 AbstractR = MulAcc;
4643 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4644 AbstractR = ExtRed;
4645 // Cannot create abstract inloop reduction recipes.
4646 if (!AbstractR)
4647 return;
4648
4649 AbstractR->insertBefore(BB&: *VPBB, IP);
4650 Red->replaceAllUsesWith(New: AbstractR);
4651}
4652
4653void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
4654 VFRange &Range) {
4655 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4656 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
4657 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4658 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
4659 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
4660 }
4661 }
4662}
4663
// Insert explicit VPInstruction::Broadcast recipes for live-ins and entry-block
// definitions that have vector users, placing each broadcast in the vector
// preheader at a point dominating all of its vector users.
void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
  // With only scalar VFs no vector values exist, so no broadcasts are needed.
  if (Plan.hasScalarVFOnly())
    return;

#ifndef NDEBUG
  // Only needed for the dominance assertion on user blocks below.
  VPDominatorTree VPDT(Plan);
#endif

  // Candidates: the backedge-taken count (if used), all live-ins, and all
  // values defined by recipes in the plan's entry block.
  SmallVector<VPValue *> VPValues;
  if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)
    VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount());
  append_range(C&: VPValues, R: Plan.getLiveIns());
  for (VPRecipeBase &R : *Plan.getEntry())
    append_range(C&: VPValues, R: R.definedValues());

  auto *VectorPreheader = Plan.getVectorPreheader();
  for (VPValue *VPV : VPValues) {
    // Skip values only used as scalars, and constant live-ins (which don't
    // need an explicit broadcast recipe).
    if (vputils::onlyScalarValuesUsed(Def: VPV) ||
        (isa<VPIRValue>(Val: VPV) && isa<Constant>(Val: VPV->getLiveInIRValue())))
      continue;

    // Add explicit broadcast at the insert point that dominates all users.
    VPBasicBlock *HoistBlock = VectorPreheader;
    VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
    for (VPUser *User : VPV->users()) {
      if (User->usesScalars(Op: VPV))
        continue;
      // A vector user inside the preheader forces the broadcast to the start
      // of the preheader so it precedes that user.
      if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
        HoistPoint = HoistBlock->begin();
      else
        assert(VPDT.dominates(VectorPreheader,
                              cast<VPRecipeBase>(User)->getParent()) &&
               "All users must be in the vector preheader or dominated by it");
    }

    VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
    auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
    // Rewire vector users (excluding the broadcast itself) to the broadcast;
    // scalar users keep using the original value.
    VPV->replaceUsesWithIf(New: Broadcast,
                           ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
                             return Broadcast != &U && !U.usesScalars(Op: VPV);
                           });
  }
}
4707
// Hoist single-scalar replicated loads with loop-invariant addresses out of
// the vector loop region into the vector preheader, using scoped-noalias
// metadata to prove independence from all stores in the loop.
void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();

  // Collect candidate loads with invariant addresses and noalias scopes
  // metadata and memory-writing recipes with noalias metadata.
  SmallVector<std::pair<VPRecipeBase *, MemoryLocation>> CandidateLoads;
  SmallVector<MemoryLocation> Stores;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Only handle single-scalar replicated loads with invariant addresses.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
        if (RepR->isPredicated() || !RepR->isSingleScalar() ||
            RepR->getOpcode() != Instruction::Load)
          continue;

        VPValue *Addr = RepR->getOperand(N: 0);
        if (Addr->isDefinedOutsideLoopRegions()) {
          MemoryLocation Loc = *vputils::getMemoryLocation(R: *RepR);
          // Without alias-scope metadata there is nothing to disambiguate
          // against the stores collected below; skip this load.
          if (!Loc.AATags.Scope)
            continue;
          CandidateLoads.push_back(Elt: {RepR, Loc});
        }
      }
      if (R.mayWriteToMemory()) {
        auto Loc = vputils::getMemoryLocation(R);
        // Bail out of the entire transform if any write lacks scope/noalias
        // metadata: independence from such a write cannot be proven.
        if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
          return;
        Stores.push_back(Elt: *Loc);
      }
    }
  }

  VPBasicBlock *Preheader = Plan.getVectorPreheader();
  for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
    // Hoist the load to the preheader if it doesn't alias with any stores
    // according to the noalias metadata. Other loads should have been hoisted
    // by other passes
    const AAMDNodes &LoadAA = LoadLoc.AATags;
    if (all_of(Range&: Stores, P: [&](const MemoryLocation &StoreLoc) {
          return !ScopedNoAliasAAResult::mayAliasInScopes(
              Scopes: LoadAA.Scope, NoAlias: StoreLoc.AATags.NoAlias);
        })) {
      LoadRecipe->moveBefore(BB&: *Preheader, I: Preheader->getFirstNonPhi());
    }
  }
}
4755
4756// Collect common metadata from a group of replicate recipes by intersecting
4757// metadata from all recipes in the group.
4758static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4759 VPIRMetadata CommonMetadata = *Recipes.front();
4760 for (VPReplicateRecipe *Recipe : drop_begin(RangeOrContainer&: Recipes))
4761 CommonMetadata.intersect(MD: *Recipe);
4762 return CommonMetadata;
4763}
4764
// Collect groups of predicated replicated memory operations (loads when
// \p Opcode is Instruction::Load, stores for Instruction::Store) accessing the
// same address with the same value type, keeping only groups that contain at
// least one pair of complementary masks (M1 == NOT(M2)).
template <unsigned Opcode>
static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
collectComplementaryPredicatedMemOps(VPlan &Plan,
                                     PredicatedScalarEvolution &PSE,
                                     const Loop *L) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  constexpr bool IsLoad = (Opcode == Instruction::Load);
  VPTypeAnalysis TypeInfo(Plan);

  // For each address, collect operations with the same or complementary masks.
  SmallVector<SmallVector<VPReplicateRecipe *, 4>> AllGroups;
  // For loads the value type is the type of the load's result; for stores it
  // is the type of the stored operand (operand 0).
  auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
    return TypeInfo.inferScalarType(V: IsLoad ? Recipe : Recipe->getOperand(N: 0));
  };
  auto Groups = collectGroupedReplicateMemOps<Opcode>(
      Plan, PSE, L,
      [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
  for (auto Recipes : Groups) {
    if (Recipes.size() < 2)
      continue;

    // Collect groups with the same or complementary masks. Entries are
    // nulled out once assigned to a group so each recipe is grouped at most
    // once.
    for (VPReplicateRecipe *&RecipeI : Recipes) {
      if (!RecipeI)
        continue;

      VPValue *MaskI = RecipeI->getMask();
      Type *TypeI = GetLoadStoreValueType(RecipeI);
      SmallVector<VPReplicateRecipe *, 4> Group;
      Group.push_back(Elt: RecipeI);
      RecipeI = nullptr;

      // Find all operations with the same or complementary masks.
      bool HasComplementaryMask = false;
      for (VPReplicateRecipe *&RecipeJ : Recipes) {
        if (!RecipeJ)
          continue;

        VPValue *MaskJ = RecipeJ->getMask();
        Type *TypeJ = GetLoadStoreValueType(RecipeJ);
        if (TypeI == TypeJ) {
          // Check if any operation in the group has a complementary mask with
          // another, that is M1 == NOT(M2) or M2 == NOT(M1).
          HasComplementaryMask |= match(V: MaskI, P: m_Not(Op0: m_Specific(VPV: MaskJ))) ||
                                  match(V: MaskJ, P: m_Not(Op0: m_Specific(VPV: MaskI)));
          Group.push_back(Elt: RecipeJ);
          RecipeJ = nullptr;
        }
      }

      // Only groups containing a complementary pair are useful to the
      // hoist/sink transforms; discard the rest.
      if (HasComplementaryMask) {
        assert(Group.size() >= 2 && "must have at least 2 entries");
        AllGroups.push_back(Elt: std::move(Group));
      }
    }
  }

  return AllGroups;
}
4825
4826// Find the recipe with minimum alignment in the group.
4827template <typename InstType>
4828static VPReplicateRecipe *
4829findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
4830 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4831 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4832 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4833 });
4834}
4835
// Replace groups of predicated loads from the same address (with
// complementary masks) by a single unpredicated load at the position of the
// group's first member, when no aliasing store lies between the first and
// last member.
void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan,
                                           PredicatedScalarEvolution &PSE,
                                           const Loop *L) {
  auto Groups =
      collectComplementaryPredicatedMemOps<Instruction::Load>(Plan, PSE, L);
  if (Groups.empty())
    return;

  // Process each group of loads.
  for (auto &Group : Groups) {
    // Try to use the earliest (most dominating) load to replace all others.
    VPReplicateRecipe *EarliestLoad = Group[0];
    VPBasicBlock *FirstBB = EarliestLoad->getParent();
    VPBasicBlock *LastBB = Group.back()->getParent();

    // Check that the load doesn't alias with stores between first and last.
    auto LoadLoc = vputils::getMemoryLocation(R: *EarliestLoad);
    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(MemLoc: *LoadLoc, FirstBB, LastBB))
      continue;

    // Collect common metadata from all loads in the group.
    VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);

    // Find the load with minimum alignment to use.
    auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);

    bool IsSingleScalar = EarliestLoad->isSingleScalar();
    assert(all_of(Group,
                  [IsSingleScalar](VPReplicateRecipe *R) {
                    return R->isSingleScalar() == IsSingleScalar;
                  }) &&
           "all members in group must agree on IsSingleScalar");

    // Create an unpredicated version of the earliest load with common
    // metadata. Using the minimum-alignment underlying instruction keeps the
    // new load's alignment conservatively correct for all members.
    auto *UnpredicatedLoad = new VPReplicateRecipe(
        LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(N: 0)},
        IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);

    UnpredicatedLoad->insertBefore(InsertPos: EarliestLoad);

    // Replace all loads in the group with the unpredicated load.
    for (VPReplicateRecipe *Load : Group) {
      Load->replaceAllUsesWith(New: UnpredicatedLoad);
      Load->eraseFromParent();
    }
  }
}
4884
/// Returns true if all stores in \p StoresToSink can safely be sunk to the
/// position of the group's last member: the first store must carry alias-scope
/// metadata, and no memory access between the first and last member's blocks
/// may alias the store's location (group members, which alias each other by
/// construction, are excluded from the check).
static bool
canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
                             PredicatedScalarEvolution &PSE, const Loop &L,
                             VPTypeAnalysis &TypeInfo) {
  auto StoreLoc = vputils::getMemoryLocation(R: *StoresToSink.front());
  if (!StoreLoc || !StoreLoc->AATags.Scope)
    return false;

  // When sinking a group of stores, all members of the group alias each other.
  // Skip them during the alias checks.
  SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
                                                 StoresToSink.end());

  VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
  VPBasicBlock *LastBB = StoresToSink.back()->getParent();
  SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
  return canHoistOrSinkWithNoAliasCheck(MemLoc: *StoreLoc, FirstBB, LastBB, SinkInfo);
}
4903
// Sink groups of predicated stores to the same address (with complementary
// masks) to the position of the group's last member, replacing the group with
// a chain of selects over the stored values feeding one unpredicated store.
void VPlanTransforms::sinkPredicatedStores(VPlan &Plan,
                                           PredicatedScalarEvolution &PSE,
                                           const Loop *L) {
  auto Groups =
      collectComplementaryPredicatedMemOps<Instruction::Store>(Plan, PSE, L);
  if (Groups.empty())
    return;

  VPTypeAnalysis TypeInfo(Plan);

  for (auto &Group : Groups) {
    if (!canSinkStoreWithNoAliasCheck(StoresToSink: Group, PSE, L: *L, TypeInfo))
      continue;

    // Use the last (most dominated) store's location for the unconditional
    // store.
    VPReplicateRecipe *LastStore = Group.back();
    VPBasicBlock *InsertBB = LastStore->getParent();

    // Collect common alias metadata from all stores in the group.
    VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);

    // Build select chain for stored values. Each successive member's mask
    // selects that member's value over the accumulated value.
    VPValue *SelectedValue = Group[0]->getOperand(N: 0);
    VPBuilder Builder(InsertBB, LastStore->getIterator());

    bool IsSingleScalar = Group[0]->isSingleScalar();
    for (unsigned I = 1; I < Group.size(); ++I) {
      assert(IsSingleScalar == Group[I]->isSingleScalar() &&
             "all members in group must agree on IsSingleScalar");
      VPValue *Mask = Group[I]->getMask();
      VPValue *Value = Group[I]->getOperand(N: 0);
      SelectedValue = Builder.createSelect(Cond: Mask, TrueVal: Value, FalseVal: SelectedValue,
                                           DL: Group[I]->getDebugLoc());
    }

    // Find the store with minimum alignment to use.
    auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);

    // Create unconditional store with selected value and common metadata.
    auto *UnpredicatedStore = new VPReplicateRecipe(
        StoreWithMinAlign->getUnderlyingInstr(),
        {SelectedValue, LastStore->getOperand(N: 1)}, IsSingleScalar,
        /*Mask=*/nullptr, *LastStore, CommonMetadata);
    UnpredicatedStore->insertBefore(BB&: *InsertBB, IP: LastStore->getIterator());

    // Remove all predicated stores from the group.
    for (VPReplicateRecipe *Store : Group)
      Store->eraseFromParent();
  }
}
4955
// Set the underlying IR value of the vector trip count early when the scalar
// trip count is a known constant and the vector trip count folds to
// (TC udiv (VF * UF)) * (VF * UF).
void VPlanTransforms::materializeConstantVectorTripCount(
    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
    PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

  VPValue *TC = Plan.getTripCount();
  if (TC->getNumUsers() == 0)
    return;

  // Skip cases for which the trip count may be non-trivial to materialize.
  // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
  // tail is required.
  if (!Plan.hasScalarTail() ||
      Plan.getMiddleBlock()->getSingleSuccessor() ==
          Plan.getScalarPreheader() ||
      !isa<VPIRValue>(Val: TC))
    return;

  // Materialize vector trip counts for constants early if it can simply
  // be computed as (Original TC / VF * UF) * VF * UF.
  // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
  // tail-folded loops.
  ScalarEvolution &SE = *PSE.getSE();
  auto *TCScev = SE.getSCEV(V: TC->getLiveInIRValue());
  if (!isa<SCEVConstant>(Val: TCScev))
    return;
  // Only set the underlying value if the rounded-down product folds to a
  // constant SCEV.
  const SCEV *VFxUF = SE.getElementCount(Ty: TCScev->getType(), EC: BestVF * BestUF);
  auto VecTCScev = SE.getMulExpr(LHS: SE.getUDivExpr(LHS: TCScev, RHS: VFxUF), RHS: VFxUF);
  if (auto *ConstVecTC = dyn_cast<SCEVConstant>(Val: VecTCScev))
    Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
}
4988
4989void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4990 VPBasicBlock *VectorPH) {
4991 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
4992 if (BTC->getNumUsers() == 0)
4993 return;
4994
4995 VPBuilder Builder(VectorPH, VectorPH->begin());
4996 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
4997 auto *TCMO =
4998 Builder.createSub(LHS: Plan.getTripCount(), RHS: Plan.getConstantInt(Ty: TCTy, Val: 1),
4999 DL: DebugLoc::getCompilerGenerated(), Name: "trip.count.minus.1");
5000 BTC->replaceAllUsesWith(New: TCMO);
5001}
5002
// Insert explicit pack (BuildVector/BuildStructVector) and unpack (Unpack)
// VPInstructions to make conversions between per-lane scalar values and
// vector values explicit in the plan.
void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
  // Nothing to pack/unpack if only scalar VFs are possible.
  if (Plan.hasScalarVFOnly())
    return;

  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: LoopRegion->getEntry()));
  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
  // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
  // regions. Those are not materialized explicitly yet. Those vector users are
  // still handled in VPReplicateRegion::execute(), via shouldPack().
  // TODO: materialize build vectors for replicating recipes in replicating
  // regions.
  for (VPBasicBlock *VPBB :
       concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion)) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(Val: &R))
        continue;
      auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
      // True if \p U needs DefR's packed vector value: either it uses DefR as
      // a vector, or it is not directly inside the loop region.
      auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
        VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
        return !U->usesScalars(Op: DefR) || ParentRegion != LoopRegion;
      };
      // Skip definitions that produce a single scalar (or only have their
      // first lane used) or that have no users needing a packed vector.
      if ((isa<VPReplicateRecipe>(Val: DefR) &&
           cast<VPReplicateRecipe>(Val: DefR)->isSingleScalar()) ||
          (isa<VPInstruction>(Val: DefR) &&
           (vputils::onlyFirstLaneUsed(Def: DefR) ||
            !cast<VPInstruction>(Val: DefR)->doesGeneratePerAllLanes())) ||
          none_of(Range: DefR->users(), P: UsesVectorOrInsideReplicateRegion))
        continue;

      // Struct-typed results need BuildStructVector; everything else uses
      // BuildVector.
      Type *ScalarTy = TypeInfo.inferScalarType(V: DefR);
      unsigned Opcode = ScalarTy->isStructTy()
                            ? VPInstruction::BuildStructVector
                            : VPInstruction::BuildVector;
      auto *BuildVector = new VPInstruction(Opcode, {DefR});
      BuildVector->insertAfter(InsertPos: DefR);

      // Rewire only the users that need the packed value; scalar users inside
      // the loop region keep using DefR directly.
      DefR->replaceUsesWithIf(
          New: BuildVector, ShouldReplace: [BuildVector, &UsesVectorOrInsideReplicateRegion](
                              VPUser &U, unsigned) {
            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
          });
    }
  }

  // Create explicit VPInstructions to convert vectors to scalars. The current
  // implementation is conservative - it may miss some cases that may or may not
  // be vector values. TODO: introduce Unpacks speculatively - remove them later
  // if they are known to operate on scalar values.
  for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
              VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(Val: &R))
        continue;
      for (VPValue *Def : R.definedValues()) {
        // Skip recipes that are single-scalar or only have their first lane
        // used.
        // TODO: The Defs skipped here may or may not be vector values.
        // Introduce Unpacks, and remove them later, if they are guaranteed to
        // produce scalar values.
        if (vputils::isSingleScalar(VPV: Def) || vputils::onlyFirstLaneUsed(Def))
          continue;

        // At the moment, we create unpacks only for scalar users outside
        // replicate regions. Recipes inside replicate regions still extract the
        // required lanes implicitly.
        // TODO: Remove once replicate regions are unrolled completely.
        auto IsCandidateUnpackUser = [Def](VPUser *U) {
          VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
          return U->usesScalars(Op: Def) &&
                 (!ParentRegion || !ParentRegion->isReplicator());
        };
        if (none_of(Range: Def->users(), P: IsCandidateUnpackUser))
          continue;

        // Unpacks of phi results must come after all phis in the block.
        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
        if (R.isPhi())
          Unpack->insertBefore(BB&: *VPBB, IP: VPBB->getFirstNonPhi());
        else
          Unpack->insertAfter(InsertPos: &R);
        Def->replaceUsesWithIf(New: Unpack,
                               ShouldReplace: [&IsCandidateUnpackUser](VPUser &U, unsigned) {
                                 return IsCandidateUnpackUser(&U);
                               });
      }
    }
  }
}
5095
5096void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
5097 VPBasicBlock *VectorPHVPBB,
5098 bool TailByMasking,
5099 bool RequiresScalarEpilogue,
5100 VPValue *Step) {
5101 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5102 // There's nothing to do if there are no users of the vector trip count or its
5103 // IR value has already been set.
5104 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5105 return;
5106
5107 VPValue *TC = Plan.getTripCount();
5108 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: TC);
5109 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5110 if (auto *StepR = Step->getDefiningRecipe()) {
5111 assert(StepR->getParent() == VectorPHVPBB &&
5112 "Step must be defined in VectorPHVPBB");
5113 // Insert after Step's definition to maintain valid def-use ordering.
5114 InsertPt = std::next(x: StepR->getIterator());
5115 }
5116 VPBuilder Builder(VectorPHVPBB, InsertPt);
5117
5118 // If the tail is to be folded by masking, round the number of iterations N
5119 // up to a multiple of Step instead of rounding down. This is done by first
5120 // adding Step-1 and then rounding down. Note that it's ok if this addition
5121 // overflows: the vector induction variable will eventually wrap to zero given
5122 // that it starts at zero and its Step is a power of two; the loop will then
5123 // exit, with the last early-exit vector comparison also producing all-true.
5124 if (TailByMasking) {
5125 TC = Builder.createAdd(
5126 LHS: TC, RHS: Builder.createSub(LHS: Step, RHS: Plan.getConstantInt(Ty: TCTy, Val: 1)),
5127 DL: DebugLoc::getCompilerGenerated(), Name: "n.rnd.up");
5128 }
5129
5130 // Now we need to generate the expression for the part of the loop that the
5131 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5132 // iterations are not required for correctness, or N - Step, otherwise. Step
5133 // is equal to the vectorization factor (number of SIMD elements) times the
5134 // unroll factor (number of SIMD instructions).
5135 VPValue *R =
5136 Builder.createNaryOp(Opcode: Instruction::URem, Operands: {TC, Step},
5137 DL: DebugLoc::getCompilerGenerated(), Name: "n.mod.vf");
5138
5139 // There are cases where we *must* run at least one iteration in the remainder
5140 // loop. See the cost model for when this can happen. If the step evenly
5141 // divides the trip count, we set the remainder to be equal to the step. If
5142 // the step does not evenly divide the trip count, no adjustment is necessary
5143 // since there will already be scalar iterations. Note that the minimum
5144 // iterations check ensures that N >= Step.
5145 if (RequiresScalarEpilogue) {
5146 assert(!TailByMasking &&
5147 "requiring scalar epilogue is not supported with fail folding");
5148 VPValue *IsZero =
5149 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: R, B: Plan.getZero(Ty: TCTy));
5150 R = Builder.createSelect(Cond: IsZero, TrueVal: Step, FalseVal: R);
5151 }
5152
5153 VPValue *Res =
5154 Builder.createSub(LHS: TC, RHS: R, DL: DebugLoc::getCompilerGenerated(), Name: "n.vec");
5155 VectorTC.replaceAllUsesWith(New: Res);
5156}
5157
// Materialize the symbolic VF and VFxUF values as explicit computations in the
// vector preheader for the chosen vectorization factor \p VFEC and the plan's
// concrete UF.
void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
                                         ElementCount VFEC) {
  // If VF and VFxUF have already been materialized (no remaining users),
  // there's nothing more to do.
  if (Plan.getVF().isMaterialized()) {
    assert(Plan.getVFxUF().isMaterialized() &&
           "VF and VFxUF must be materialized together");
    return;
  }

  VPBuilder Builder(VectorPH, VectorPH->begin());
  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
  VPValue &VF = Plan.getVF();
  VPValue &VFxUF = Plan.getVFxUF();
  // If there are no users of the runtime VF, compute VFxUF by constant folding
  // the multiplication of VF and UF.
  if (VF.getNumUsers() == 0) {
    VPValue *RuntimeVFxUF =
        Builder.createElementCount(Ty: TCTy, EC: VFEC * Plan.getConcreteUF());
    VFxUF.replaceAllUsesWith(New: RuntimeVFxUF);
    return;
  }

  // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
  // vscale) * UF.
  VPValue *RuntimeVF = Builder.createElementCount(Ty: TCTy, EC: VFEC);
  // Vector users of VF need an explicit broadcast of the scalar runtime VF;
  // scalar users keep the scalar value.
  if (!vputils::onlyScalarValuesUsed(Def: &VF)) {
    VPValue *BC = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: RuntimeVF);
    VF.replaceUsesWithIf(
        New: BC, ShouldReplace: [&VF](VPUser &U, unsigned) { return !U.usesScalars(Op: &VF); });
  }
  VF.replaceAllUsesWith(New: RuntimeVF);

  // VFxUF = RuntimeVF * UF. NOTE(review): the wrap flags {true, false}
  // presumably mean {NUW, NSW} — confirm against createOverflowingOp.
  VPValue *MulByUF = Builder.createOverflowingOp(
      Opcode: Instruction::Mul,
      Operands: {RuntimeVF, Plan.getConstantInt(Ty: TCTy, Val: Plan.getConcreteUF())},
      WrapFlags: {true, false});
  VFxUF.replaceAllUsesWith(New: MulByUF);
}
5197
// Expand all VPExpandSCEVRecipes in the plan's entry block into IR before the
// entry basic block's terminator, replacing each recipe with a live-in VPValue
// for the expanded IR. Also wraps IR instructions present in the entry basic
// block but missing from the VPIRBasicBlock. Returns a map from each expanded
// SCEV to its generated IR value.
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
  SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);

  auto *Entry = cast<VPIRBasicBlock>(Val: Plan.getEntry());
  BasicBlock *EntryBB = Entry->getIRBasicBlock();
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
  for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
    // Skip recipes wrapping pre-existing IR; stop at the first recipe that is
    // neither an IR wrapper nor a VPExpandSCEVRecipe.
    if (isa<VPIRInstruction, VPIRPhi>(Val: &R))
      continue;
    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpSCEV)
      break;
    const SCEV *Expr = ExpSCEV->getSCEV();
    Value *Res =
        Expander.expandCodeFor(SH: Expr, Ty: Expr->getType(), I: EntryBB->getTerminator());
    ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
    VPValue *Exp = Plan.getOrAddLiveIn(V: Res);
    ExpSCEV->replaceAllUsesWith(New: Exp);
    // Keep the plan's trip count pointing at the expanded live-in.
    if (Plan.getTripCount() == ExpSCEV)
      Plan.resetTripCount(NewTripCount: Exp);
    ExpSCEV->eraseFromParent();
  }
  assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
         "VPExpandSCEVRecipes must be at the beginning of the entry block, "
         "before any VPIRInstructions");
  // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
  // to the VPIRBasicBlock.
  auto EI = Entry->begin();
  for (Instruction &I : drop_end(RangeOrContainer&: *EntryBB)) {
    // Advance past recipes that already wrap this instruction.
    if (EI != Entry->end() && isa<VPIRInstruction>(Val: *EI) &&
        &cast<VPIRInstruction>(Val: &*EI)->getInstruction() == &I) {
      EI++;
      continue;
    }
    VPIRInstruction::create(I)->insertBefore(BB&: *Entry, IP: EI);
  }

  return ExpandedSCEVs;
}
5238
5239/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5240/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5241/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5242/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5243/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5244/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5245/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5246/// is defined at \p Idx of a load interleave group.
5247static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5248 VPValue *OpV, unsigned Idx, bool IsScalable) {
5249 VPValue *Member0Op = WideMember0->getOperand(N: OpIdx);
5250 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5251 if (!Member0OpR)
5252 return Member0Op == OpV;
5253 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: Member0OpR))
5254 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5255 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5256 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5257 Member0Op == OpV;
5258 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: Member0OpR))
5259 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(I: Idx) == OpV;
5260 return false;
5261}
5262
5263static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5264 SmallVector<VPValue *> Ops0;
5265 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Val: Ops[0]);
5266 if (!WideMember0)
5267 return false;
5268 for (VPValue *V : Ops) {
5269 if (!isa<VPWidenRecipe, VPWidenCastRecipe>(Val: V))
5270 return false;
5271 auto *R = cast<VPSingleDefRecipe>(Val: V);
5272 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(R: WideMember0))
5273 return false;
5274 }
5275
5276 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5277 SmallVector<VPValue *> OpsI;
5278 for (VPValue *Op : Ops)
5279 OpsI.push_back(Elt: Op->getDefiningRecipe()->getOperand(N: Idx));
5280
5281 if (canNarrowOps(Ops: OpsI, IsScalable))
5282 continue;
5283
5284 if (any_of(Range: enumerate(First&: OpsI), P: [WideMember0, Idx, IsScalable](const auto &P) {
5285 const auto &[OpIdx, OpV] = P;
5286 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5287 }))
5288 return false;
5289 }
5290
5291 return true;
5292}
5293
5294/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5295/// number of members both equal to VF. The interleave group must also access
5296/// the full vector width.
5297static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5298 VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
5299 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5300 if (!InterleaveR || InterleaveR->getMask())
5301 return std::nullopt;
5302
5303 Type *GroupElementTy = nullptr;
5304 if (InterleaveR->getStoredValues().empty()) {
5305 GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0));
5306 if (!all_of(Range: InterleaveR->definedValues(),
5307 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5308 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5309 }))
5310 return std::nullopt;
5311 } else {
5312 GroupElementTy =
5313 TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]);
5314 if (!all_of(Range: InterleaveR->getStoredValues(),
5315 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5316 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5317 }))
5318 return std::nullopt;
5319 }
5320
5321 auto IG = InterleaveR->getInterleaveGroup();
5322 if (IG->getFactor() != IG->getNumMembers())
5323 return std::nullopt;
5324
5325 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5326 TypeSize Size = TTI.getRegisterBitWidth(
5327 K: VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
5328 : TargetTransformInfo::RGK_ScalableVector);
5329 assert(Size.isScalable() == VF.isScalable() &&
5330 "if Size is scalable, VF must be scalable and vice versa");
5331 return Size.getKnownMinValue();
5332 };
5333
5334 for (ElementCount VF : VFs) {
5335 unsigned MinVal = VF.getKnownMinValue();
5336 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5337 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5338 return {VF};
5339 }
5340 return std::nullopt;
5341}
5342
5343/// Returns true if \p VPValue is a narrow VPValue.
5344static bool isAlreadyNarrow(VPValue *VPV) {
5345 if (isa<VPIRValue>(Val: VPV))
5346 return true;
5347 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: VPV);
5348 return RepR && RepR->isSingleScalar();
5349}
5350
// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
// a narrow variant. Already-narrowed values are tracked in \p NarrowedOps to
// avoid converting the same recipe twice. Returns the (possibly new) narrow
// value to use in place of \p V.
static VPValue *
narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
  auto *R = V->getDefiningRecipe();
  // Live-ins and values we already narrowed need no further work.
  if (!R || NarrowedOps.contains(Ptr: V))
    return V;

  if (isAlreadyNarrow(VPV: V))
    return V;

  // Wide arithmetic/cast recipes stay in place; recursively narrow their
  // operands instead.
  if (isa<VPWidenRecipe, VPWidenCastRecipe>(Val: R)) {
    auto *WideMember0 = cast<VPSingleDefRecipe>(Val: R);
    for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
      WideMember0->setOperand(
          I: Idx,
          New: narrowInterleaveGroupOp(V: WideMember0->getOperand(N: Idx), NarrowedOps));
    return V;
  }

  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
    // Narrow interleave group to wide load, as transformed VPlan will only
    // process one original iteration.
    auto *LI = cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos());
    auto *L = new VPWidenLoadRecipe(
        *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
        /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
    L->insertBefore(InsertPos: LoadGroup);
    NarrowedOps.insert(Ptr: L);
    return L;
  }

  if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: R)) {
    // Single-scalar loads are already narrow; just record them.
    assert(RepR->isSingleScalar() &&
           isa<LoadInst>(RepR->getUnderlyingInstr()) &&
           "must be a single scalar load");
    NarrowedOps.insert(Ptr: RepR);
    return RepR;
  }

  // Remaining case: a wide load. Strip a vector-pointer recipe, if present,
  // to get the scalar base address.
  auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
  VPValue *PtrOp = WideLoad->getAddr();
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Val: PtrOp))
    PtrOp = VecPtr->getOperand(N: 0);
  // Narrow wide load to uniform scalar load, as transformed VPlan will only
  // process one original iteration.
  auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
                                  /*IsUniform*/ true,
                                  /*Mask*/ nullptr, {}, *WideLoad);
  N->insertBefore(InsertPos: WideLoad);
  NarrowedOps.insert(Ptr: N);
  return N;
}
5404
/// Try to narrow all interleave groups in \p Plan for a single suitable VF,
/// so that each transformed vector iteration processes one original
/// iteration. On success, \p Plan is modified in place (restricted to the
/// narrowed VF) and a clone containing the remaining VFs is returned; returns
/// nullptr if the transform does not apply.
std::unique_ptr<VPlan>
VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
                                        const TargetTransformInfo &TTI) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();

  if (!VectorLoop)
    return nullptr;

  // Only handle single-block loops for now.
  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
    return nullptr;

  // Skip plans when we may not be able to properly narrow.
  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
  if (!match(V: &Exiting->back(), P: m_BranchOnCount()))
    return nullptr;

  assert(match(&Exiting->back(),
               m_BranchOnCount(m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())),
                               m_Specific(&Plan.getVectorTripCount()))) &&
         "unexpected branch-on-count");

  VPTypeAnalysis TypeInfo(Plan);
  SmallVector<VPInterleaveRecipe *> StoreGroups;
  std::optional<ElementCount> VFToOptimize;
  // Scan the loop body, collecting store interleave groups that can be
  // narrowed and bailing out on anything unsupported.
  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
    if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
      continue;

    // Scalar IV steps used only for their first lane are unaffected.
    if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(Val: &R) &&
        vputils::onlyFirstLaneUsed(Def: cast<VPSingleDefRecipe>(Val: &R)))
      continue;

    // Bail out on recipes not supported at the moment:
    // * phi recipes other than the canonical induction
    // * recipes writing to memory except interleave groups
    // Only support plans with a canonical induction phi.
    if (R.isPhi())
      return nullptr;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
    if (R.mayWriteToMemory() && !InterleaveR)
      return nullptr;

    // All other ops are allowed, but we reject uses that cannot be converted
    // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;

    // Try to find a single VF, where all interleave groups are consecutive and
    // saturate the full vector width. If we already have a candidate VF, check
    // if it is applicable for the current InterleaveR, otherwise look for a
    // suitable VF across the Plan's VFs.
    SmallVector<ElementCount> VFs =
        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
                     : to_vector(Range: Plan.vectorFactors());
    std::optional<ElementCount> NarrowedVF =
        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
      return nullptr;
    VFToOptimize = NarrowedVF;

    // Skip read interleave groups.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Narrow interleave groups, if all operands are already matching narrow
    // ops.
    auto *Member0 = InterleaveR->getStoredValues()[0];
    if (isAlreadyNarrow(VPV: Member0) &&
        all_of(Range: InterleaveR->getStoredValues(), P: equal_to(Arg&: Member0))) {
      StoreGroups.push_back(Elt: InterleaveR);
      continue;
    }

    // For now, we only support full interleave groups storing load interleave
    // groups.
    if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
          if (!DefR)
            return false;
          auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
          return IR && IR->getInterleaveGroup()->isFull() &&
                 IR->getVPValue(Op.index()) == Op.value();
        })) {
      StoreGroups.push_back(Elt: InterleaveR);
      continue;
    }

    // Check if all values feeding InterleaveR are matching wide recipes, which
    // operands that can be narrowed.
    if (!canNarrowOps(Ops: InterleaveR->getStoredValues(),
                      IsScalable: VFToOptimize->isScalable()))
      return nullptr;
    StoreGroups.push_back(Elt: InterleaveR);
  }

  if (StoreGroups.empty())
    return nullptr;

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  bool RequiresScalarEpilogue =
      MiddleVPBB->getNumSuccessors() == 1 &&
      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
  // Bail out for tail-folding (middle block with a single successor to exit).
  if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
    return nullptr;

  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
  // TODO: Handle cases where only some interleave groups can be narrowed.
  std::unique_ptr<VPlan> NewPlan;
  if (size(Range: Plan.vectorFactors()) != 1) {
    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
    Plan.setVF(*VFToOptimize);
    NewPlan->removeVF(VF: *VFToOptimize);
  }

  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  // Narrow operation tree rooted at store groups.
  for (auto *StoreGroup : StoreGroups) {
    VPValue *Res =
        narrowInterleaveGroupOp(V: StoreGroup->getStoredValues()[0], NarrowedOps);
    // Replace the store group with a single consecutive wide store of the
    // narrowed value.
    auto *SI =
        cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos());
    auto *S = new VPWidenStoreRecipe(
        *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
    S->insertBefore(InsertPos: StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration.
  auto *CanIV = VectorLoop->getCanonicalIV();
  auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue());
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPBuilder PHBuilder(VectorPH, VectorPH->begin());

  VPValue *UF = &Plan.getUF();
  VPValue *Step;
  if (VFToOptimize->isScalable()) {
    // Scalable case: the narrowed plan still processes vscale elements per
    // iteration, so the step is vscale * UF.
    VPValue *VScale = PHBuilder.createElementCount(
        Ty: VectorLoop->getCanonicalIVType(), EC: ElementCount::getScalable(MinVal: 1));
    Step = PHBuilder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VScale, UF},
                                         WrapFlags: {true, false});
    Plan.getVF().replaceAllUsesWith(New: VScale);
  } else {
    // Fixed case: one original iteration per part, so the step is just UF.
    Step = UF;
    Plan.getVF().replaceAllUsesWith(
        New: Plan.getConstantInt(Ty: CanIV->getScalarType(), Val: 1));
  }
  // Materialize vector trip count with the narrowed step.
  materializeVectorTripCount(Plan, VectorPHVPBB: VectorPH, /*TailByMasking=*/false,
                             RequiresScalarEpilogue, Step);

  Inc->setOperand(I: 1, New: Step);
  Plan.getVFxUF().replaceAllUsesWith(New: Step);

  removeDeadRecipes(Plan);
  assert(none_of(*VectorLoop->getEntryBasicBlock(),
                 IsaPred<VPVectorPointerRecipe>) &&
         "All VPVectorPointerRecipes should have been removed");
  return NewPlan;
}
5572
5573/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5574/// BranchOnCond recipe.
5575void VPlanTransforms::addBranchWeightToMiddleTerminator(
5576 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5577 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5578 auto *MiddleTerm =
5579 dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator());
5580 // Only add branch metadata if there is a (conditional) terminator.
5581 if (!MiddleTerm)
5582 return;
5583
5584 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5585 "must have a BranchOnCond");
5586 // Assume that `TripCount % VectorStep ` is equally distributed.
5587 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5588 if (VF.isScalable() && VScaleForTuning.has_value())
5589 VectorStep *= *VScaleForTuning;
5590 assert(VectorStep > 0 && "trip count should not be zero");
5591 MDBuilder MDB(Plan.getContext());
5592 MDNode *BranchWeights =
5593 MDB.createBranchWeights(Weights: {1, VectorStep - 1}, /*IsExpected=*/false);
5594 MiddleTerm->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
5595}
5596
/// For each first-order recurrence phi in \p Plan's vector loop, create
/// middle-block extracts for users outside the loop and rewire the VPIRPhis
/// modeling LCSSA phis in the exit block to use them. \p Range is clamped
/// when the transform cannot be applied for some VFs (vscale x 1).
void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
                                                           VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Predicate used to clamp the VF range below: VF == vscale x 1.
  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(MinVal: 1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences, creating
    // extract for users outside the loop. An overview of the transformation is
    // described below. Suppose we have the following loop with some use after
    // the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because it's value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     b[i] = v2 - v1
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand for
    // the VPIRInstruction modeling the phi.
    for (VPRecipeBase &R : make_early_inc_range(
             Range: make_range(x: MiddleVPBB->getFirstNonPhi(), y: MiddleVPBB->end()))) {
      if (!match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_Specific(VPV: FOR))))
        continue;

      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead we rely on the existing
      // extract of the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice.
      // TODO: Consider vscale_range info and UF.
      if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
                                                             Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          Opcode: VPInstruction::ExtractPenultimateElement, Operands: FOR->getBackedgeValue(), DL: {},
          Name: "vector.recur.extract.for.phi");
      // Rewire all exit-block phis fed by this extract to use the
      // penultimate element instead.
      for (VPUser *U : to_vector(Range: cast<VPInstruction>(Val: &R)->users())) {
        auto *ExitPhi = dyn_cast<VPIRPhi>(Val: U);
        if (!ExitPhi)
          continue;
        ExitPhi->replaceUsesOfWith(From: cast<VPInstruction>(Val: &R), To: PenultimateElement);
      }
    }
  }
}
5710
/// Rewrite FindLast-style IV reductions in \p Plan into integer min/max
/// reductions over the IV. Uses a sentinel value when SCEV proves the IV's
/// range excludes it; otherwise falls back to an auxiliary boolean AnyOf
/// reduction tracking whether the select condition was ever true.
void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                                               PredicatedScalarEvolution &PSE,
                                               Loop &L) {
  ScalarEvolution &SE = *PSE.getSE();
  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();

  // Helper lambda to check if the IV range excludes the sentinel value.
  auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
                             bool Signed) -> std::optional<APInt> {
    // The sentinel is the identity of the min/max reduction: the smallest
    // value for a max reduction and the largest for a min reduction.
    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
    APInt Sentinel =
        UseMax
            ? (Signed ? APInt::getSignedMinValue(numBits: BW) : APInt::getMinValue(numBits: BW))
            : (Signed ? APInt::getSignedMaxValue(numBits: BW) : APInt::getMaxValue(numBits: BW));

    ConstantRange IVRange =
        Signed ? SE.getSignedRange(S: IVSCEV) : SE.getUnsignedRange(S: IVSCEV);
    if (!IVRange.contains(Val: Sentinel))
      return Sentinel;
    return std::nullopt;
  };

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  for (VPRecipeBase &Phi :
       make_early_inc_range(Range: VectorLoopRegion->getEntryBasicBlock()->phis())) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &Phi);
    if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind(
                     Kind: PhiR->getRecurrenceKind()))
      continue;

    // Only integer IVs are handled here.
    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(V: PhiR);
    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
      continue;

    // If there's a header mask, the backedge select will not be the find-last
    // select.
    VPValue *BackedgeVal = PhiR->getBackedgeValue();
    VPValue *CondSelect = BackedgeVal;
    if (HeaderMask &&
        !match(V: BackedgeVal, P: m_Select(Op0: m_Specific(VPV: HeaderMask),
                                      Op1: m_VPValue(V&: CondSelect), Op2: m_Specific(VPV: PhiR))))
      llvm_unreachable("expected header mask select");

    // Get the IV from the conditional select of the reduction phi.
    // The conditional select should be a select between the phi and the IV.
    VPValue *Cond, *TrueVal, *FalseVal;
    if (!match(V: CondSelect, P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_VPValue(V&: TrueVal),
                                     Op2: m_VPValue(V&: FalseVal))))
      continue;

    // The non-phi operand of the select is the IV.
    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;

    // The IV must be an affine add-recurrence in this loop.
    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(V: IV, PSE, L: &L);
    const SCEV *Step;
    if (!match(S: IVSCEV, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V&: Step))))
      continue;

    // Determine direction from SCEV step.
    if (!SE.isKnownNonZero(S: Step))
      continue;

    // Positive step means we need UMax/SMax to find the last IV value, and
    // UMin/SMin otherwise.
    bool UseMax = SE.isKnownPositive(S: Step);
    bool UseSigned = true;
    // Prefer the signed sentinel; fall back to the unsigned one.
    std::optional<APInt> SentinelVal =
        CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
    if (!SentinelVal) {
      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
      UseSigned = false;
    }

    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
    // if the condition was ever true. Requires the IV to not wrap, otherwise we
    // cannot use min/max.
    if (!SentinelVal) {
      auto *AR = cast<SCEVAddRecExpr>(Val: IVSCEV);
      if (AR->hasNoSignedWrap())
        UseSigned = true;
      else if (AR->hasNoUnsignedWrap())
        UseSigned = false;
      else
        continue;
    }

    // Locate the existing reduction-result computation in the middle block.
    VPInstruction *RdxResult = cast<VPInstruction>(Val: vputils::findRecipe(
        Start: BackedgeVal,
        Pred: match_fn(P: m_VPInstruction<VPInstruction::ComputeReductionResult>())));

    RecurKind MinMaxKind =
        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
                    FastMathFlags());
    DebugLoc ExitDL = RdxResult->getDebugLoc();
    VPBuilder MiddleBuilder(RdxResult);
    // Reduce the vector of IVs to a single value with the chosen min/max.
    VPValue *ReducedIV =
        MiddleBuilder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                                   Operands: RdxResult->getOperand(N: 0), Flags, DL: ExitDL);

    VPValue *NewRdxResult;
    VPValue *StartVPV = PhiR->getStartValue();
    if (SentinelVal) {
      // Sentinel-based approach: reduce IVs with min/max, compare against
      // sentinel to detect if condition was ever true, select accordingly.
      VPValue *Sentinel = Plan.getConstantInt(Val: *SentinelVal);
      auto *Cmp = MiddleBuilder.createICmp(Pred: CmpInst::ICMP_NE, A: ReducedIV,
                                           B: Sentinel, DL: ExitDL);
      NewRdxResult =
          MiddleBuilder.createSelect(Cond: Cmp, TrueVal: ReducedIV, FalseVal: StartVPV, DL: ExitDL);
      StartVPV = Sentinel;
    } else {
      // Introduce a boolean AnyOf reduction to track if the condition was ever
      // true in the loop. Use it to select the initial start value, if it was
      // never true.
      auto *AnyOfPhi = new VPReductionPHIRecipe(
          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
          RdxUnordered{.VFScaleFactor: 1}, {}, /*HasUsesOutsideReductionChain=*/false);
      AnyOfPhi->insertAfter(InsertPos: PhiR);

      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
      // Negate the condition if the select keeps the phi on the true arm.
      VPValue *AnyOfCond = Cond;
      if (TrueVal == PhiR)
        AnyOfCond = LoopBuilder.createNot(Operand: Cond);
      VPValue *OrVal = LoopBuilder.createOr(LHS: AnyOfPhi, RHS: AnyOfCond);
      AnyOfPhi->setOperand(I: 1, New: OrVal);

      NewRdxResult =
          MiddleBuilder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                                     Operands: {StartVPV, ReducedIV, OrVal}, Flags: {}, DL: ExitDL);

      // Initialize the IV reduction phi with the neutral element, not the
      // original start value, to ensure correct min/max reduction results.
      StartVPV = Plan.getOrAddLiveIn(
          V: getRecurrenceIdentity(K: MinMaxKind, Tp: IVSCEV->getType(), FMF: {}));
    }
    RdxResult->replaceAllUsesWith(New: NewRdxResult);
    RdxResult->eraseFromParent();

    // Replace the FindLast phi with a FindIV reduction phi using the chosen
    // start value.
    auto *NewPhiR = new VPReductionPHIRecipe(
        cast<PHINode>(Val: PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
        *CondSelect, RdxUnordered{.VFScaleFactor: 1}, {}, PhiR->hasUsesOutsideReductionChain());
    NewPhiR->insertBefore(InsertPos: PhiR);
    PhiR->replaceAllUsesWith(New: NewPhiR);
    PhiR->eraseFromParent();
  }
}
5860
5861namespace {
5862
/// Holds the binary operation used to compute the extended operand and the
/// casts that feed into it.
struct ExtendedReductionOperand {
  // The binary operation combining the extended (cast) operands.
  VPWidenRecipe *BinOp = nullptr;
  // The extends feeding BinOp, in operand order.
  // Note: The second cast recipe may be null.
  std::array<VPWidenCastRecipe *, 2> CastRecipes = {};
};
5870
/// A chain of recipes that form a partial reduction. Matches either
/// reduction_bin_op (extend (A), accumulator), or
/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp;
  /// The user of the extends that is then reduced.
  ExtendedReductionOperand ExtendedOp;
  /// Factor by which the partial reduction shrinks the vector width.
  unsigned ScaleFactor;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is a Instruction::Sub.
  RecurKind RK;
};
5886
/// Rewrite the extends feeding \p BinOp into a shape better suited for a
/// partial reduction, when one of the supported patterns matches. Returns the
/// recipe whose result should be reduced (either \p BinOp itself or an inner
/// multiply).
static VPSingleDefRecipe *
optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
                                   VPTypeAnalysis &TypeInfo) {
  // reduce.add(mul(ext(A), C))
  // -> reduce.add(mul(ext(A), ext(trunc(C))))
  const APInt *Const;
  if (match(R: BinOp, P: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()), Op1: m_APInt(C&: Const)))) {
    auto *ExtA = cast<VPWidenCastRecipe>(Val: BinOp->getOperand(N: 0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    Type *NarrowTy = TypeInfo.inferScalarType(V: ExtA->getOperand(N: 0));
    // Only legal if the constant round-trips through the narrow type under
    // the same extension kind.
    if (!BinOp->hasOneUse() ||
        !llvm::canConstantBeExtended(
            C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
      return BinOp;

    // Rewrite C as ext(trunc(C)) so both mul operands are extends.
    VPBuilder Builder(BinOp);
    auto *Trunc = Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc,
                                          Op: BinOp->getOperand(N: 1), ResultTy: NarrowTy);
    Type *WideTy = TypeInfo.inferScalarType(V: ExtA);
    BinOp->setOperand(I: 1, New: Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy));
    return BinOp;
  }

  // reduce.add(ext(mul(ext(A), ext(B))))
  // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
  if (match(R: BinOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
                                        Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
    auto *Ext = cast<VPWidenCastRecipe>(Val: BinOp);
    auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0));
    auto *MulLHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0));
    auto *MulRHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1));
    // Bail out unless the extend kinds are compatible and the multiply has a
    // single use.
    if (!Mul->hasOneUse() ||
        (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
        MulLHS->getOpcode() != MulRHS->getOpcode())
      return BinOp;
    // Extend A and B directly to the outer extend's result type, folding the
    // outer extend away; reuse the LHS cast when both operands are the same.
    VPBuilder Builder(Mul);
    Mul->setOperand(I: 0, New: Builder.createWidenCast(Opcode: MulLHS->getOpcode(),
                                                 Op: MulLHS->getOperand(N: 0),
                                                 ResultTy: Ext->getResultType()));
    Mul->setOperand(I: 1, New: MulLHS == MulRHS
                           ? Mul->getOperand(N: 0)
                           : Builder.createWidenCast(Opcode: MulRHS->getOpcode(),
                                                     Op: MulRHS->getOperand(N: 0),
                                                     ResultTy: Ext->getResultType()));
    return Mul;
  }

  return BinOp;
}
5936
5937// Helper to transform a partial reduction chain into a partial reduction
5938// recipe. Assumes profitability has been checked.
5939static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5940 VPTypeAnalysis &TypeInfo, VPlan &Plan,
5941 VPReductionPHIRecipe *RdxPhi) {
5942 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5943 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5944
5945 VPValue *BinOpVal = WidenRecipe->getOperand(N: 0);
5946 VPValue *Accumulator = WidenRecipe->getOperand(N: 1);
5947
5948 // Swap if needed to ensure Accumulator is the PHI or partial reduction.
5949 if (isa<VPReductionPHIRecipe, VPReductionRecipe>(Val: BinOpVal) ||
5950 isa<VPExpressionRecipe>(Val: BinOpVal))
5951 std::swap(a&: BinOpVal, b&: Accumulator);
5952 auto *BinOp = cast<VPSingleDefRecipe>(Val: BinOpVal->getDefiningRecipe());
5953
5954 // Sub-reductions can be implemented in two ways:
5955 // (1) negate the operand in the vector loop (the default way).
5956 // (2) subtract the reduced value from the init value in the middle block.
5957 // Both ways keep the reduction itself as an 'add' reduction.
5958 //
5959 // The ISD nodes for partial reductions don't support folding the
5960 // sub/negation into its operands because the following is not a valid
5961 // transformation:
5962 // sub(0, mul(ext(a), ext(b)))
5963 // -> mul(ext(a), ext(sub(0, b)))
5964 //
5965 // It's therefore better to choose option (2) such that the partial
5966 // reduction is always positive (starting at '0') and to do a final
5967 // subtract in the middle block.
5968 if (WidenRecipe->getOpcode() == Instruction::Sub &&
5969 Chain.RK != RecurKind::Sub) {
5970 VPBuilder Builder(WidenRecipe);
5971 Type *ElemTy = TypeInfo.inferScalarType(V: BinOp);
5972 auto *Zero = Plan.getZero(Ty: ElemTy);
5973 VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5974 ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
5975 : VPIRFlags();
5976 auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
5977 VPIRMetadata(), DebugLoc::getUnknown());
5978 Builder.insert(R: NegRecipe);
5979 BinOp = NegRecipe;
5980 }
5981
5982 // FIXME: Do these transforms before invoking the cost-model.
5983 BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
5984
5985 // Check if WidenRecipe is the final result of the reduction. If so look
5986 // through selects for predicated reductions.
5987 VPValue *Cond = nullptr;
5988 VPValue *ExitValue = cast_or_null<VPInstruction>(Val: vputils::findUserOf(
5989 V: WidenRecipe,
5990 P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_Specific(VPV: WidenRecipe), Op2: m_Specific(VPV: RdxPhi))));
5991 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
5992 RdxPhi->getBackedgeValue() == ExitValue;
5993 assert((!ExitValue || IsLastInChain) &&
5994 "if we found ExitValue, it must match RdxPhi's backedge value");
5995
5996 Type *PhiType = TypeInfo.inferScalarType(V: RdxPhi);
5997 RecurKind RdxKind =
5998 PhiType->isFloatingPointTy() ? RecurKind::FAdd : RecurKind::Add;
5999 auto *PartialRed = new VPReductionRecipe(
6000 RdxKind,
6001 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6002 : FastMathFlags(),
6003 WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
6004 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6005 PartialRed->insertBefore(InsertPos: WidenRecipe);
6006
6007 if (Cond)
6008 ExitValue->replaceAllUsesWith(New: PartialRed);
6009 WidenRecipe->replaceAllUsesWith(New: PartialRed);
6010
6011 // We only need to update the PHI node once, which is when we find the
6012 // last reduction in the chain.
6013 if (!IsLastInChain)
6014 return;
6015
6016 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6017 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6018 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6019
6020 auto *StartInst = cast<VPInstruction>(Val: RdxPhi->getStartValue());
6021 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6022 auto *NewScaleFactor = Plan.getConstantInt(BitWidth: 32, Val: Chain.ScaleFactor);
6023 StartInst->setOperand(I: 2, New: NewScaleFactor);
6024
6025 // If this is the last value in a sub-reduction chain, then update the PHI
6026 // node to start at `0` and update the reduction-result to subtract from
6027 // the PHI's start value.
6028 if (Chain.RK != RecurKind::Sub)
6029 return;
6030
6031 VPValue *OldStartValue = StartInst->getOperand(N: 0);
6032 StartInst->setOperand(I: 0, New: StartInst->getOperand(N: 1));
6033
6034 // Replace reduction_result by 'sub (startval, reductionresult)'.
6035 VPInstruction *RdxResult = vputils::findComputeReductionResult(PhiR: RdxPhi);
6036 assert(RdxResult && "Could not find reduction result");
6037
6038 VPBuilder Builder = VPBuilder::getToInsertAfter(R: RdxResult);
6039 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6040 VPInstruction *NewResult = Builder.createNaryOp(
6041 Opcode: SubOpc, Operands: {OldStartValue, RdxResult}, Flags: VPIRFlags::getDefaultFlags(Opcode: SubOpc),
6042 DL: RdxPhi->getDebugLoc());
6043 RdxResult->replaceUsesWithIf(
6044 New: NewResult,
6045 ShouldReplace: [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6046}
6047
6048/// Check if a partial reduction chain is is supported by the target (i.e. does
6049/// not have an invalid cost) for the given VF range. Clamps the range and
6050/// returns true if profitable for any VF.
6051static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
6052 Type *PhiType, VPCostContext &CostCtx,
6053 VFRange &Range) {
6054 auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
6055 -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
6056 if (!Ext)
6057 return {nullptr, TargetTransformInfo::PR_None};
6058 Type *ExtOpType = CostCtx.Types.inferScalarType(V: Ext->getOperand(N: 0));
6059 auto ExtKind = TargetTransformInfo::getPartialReductionExtendKind(
6060 CastOpc: static_cast<Instruction::CastOps>(Ext->getOpcode()));
6061 return {ExtOpType, ExtKind};
6062 };
6063 ExtendedReductionOperand ExtendedOp = Chain.ExtendedOp;
6064 VPWidenCastRecipe *ExtendA = ExtendedOp.CastRecipes[0];
6065 VPWidenCastRecipe *ExtendB = ExtendedOp.CastRecipes[1];
6066
6067 Type *ExtOpTypeA, *ExtOpTypeB;
6068 TargetTransformInfo::PartialReductionExtendKind ExtKindA, ExtKindB;
6069 std::tie(args&: ExtOpTypeA, args&: ExtKindA) = GetExtInfo(ExtendA);
6070 std::tie(args&: ExtOpTypeB, args&: ExtKindB) = GetExtInfo(ExtendB);
6071
6072 // If ExtendB is nullptr but there's a separate BinOp, the second operand
6073 // was a constant that can use the same extend kind as the first.
6074 if (!ExtendB && ExtendedOp.BinOp &&
6075 ExtendedOp.BinOp != Chain.ReductionBinOp) {
6076 const APInt *Const = nullptr;
6077 for (VPValue *Op : ExtendedOp.BinOp->operands()) {
6078 if (match(V: Op, P: m_APInt(C&: Const)))
6079 break;
6080 }
6081 if (!Const || !canConstantBeExtended(C: Const, NarrowType: ExtOpTypeA, ExtKind: ExtKindA))
6082 return false;
6083 ExtOpTypeB = ExtOpTypeA;
6084 ExtKindB = ExtKindA;
6085 }
6086
6087 std::optional<unsigned> BinOpc;
6088 if (ExtendedOp.BinOp && ExtendedOp.BinOp != Chain.ReductionBinOp)
6089 BinOpc = ExtendedOp.BinOp->getOpcode();
6090
6091 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6092 return LoopVectorizationPlanner::getDecisionAndClampRange(
6093 Predicate: [&](ElementCount VF) {
6094 return CostCtx.TTI
6095 .getPartialReductionCost(
6096 Opcode: WidenRecipe->getOpcode(), InputTypeA: ExtOpTypeA, InputTypeB: ExtOpTypeB, AccumType: PhiType, VF,
6097 OpAExtend: ExtKindA, OpBExtend: ExtKindB, BinOp: BinOpc, CostKind: CostCtx.CostKind,
6098 FMF: PhiType->isFloatingPointTy()
6099 ? std::optional{WidenRecipe->getFastMathFlags()}
6100 : std::nullopt)
6101 .isValid();
6102 },
6103 Range);
6104}
6105
6106/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6107/// operand. This is an operand where the source of the value (e.g. a load) has
6108/// been extended (sext, zext, or fpext) before it is used in the reduction.
6109///
6110/// Possible forms matched by this function:
6111/// - UpdateR(PrevValue, ext(...))
6112/// - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
6113/// - UpdateR(PrevValue, BinOp(ext(...), Constant))
6114/// - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
6115/// - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
6116/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6117/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6118///
6119/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6120static std::optional<ExtendedReductionOperand>
6121matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6122 assert(is_contained(UpdateR->operands(), Op) &&
6123 "Op should be operand of UpdateR");
6124
6125 // If Op is an extend, then it's still a valid partial reduction if the
6126 // extended mul fulfills the other requirements.
6127 // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
6128 // reduction since the inner extends will be widened. We already have oneUse
6129 // checks on the inner extends so widening them is safe.
6130 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6131 if (match(V: Op, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(), Op1: m_VPValue()))) ||
6132 match(V: Op, P: m_FPExt(Op0: m_FMul(Op0: m_VPValue(), Op1: m_VPValue())))) {
6133 auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Val: Op);
6134 if (!CastRecipe)
6135 return std::nullopt;
6136 auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
6137 OuterExtKind = TTI::getPartialReductionExtendKind(CastOpc: CastOp);
6138 Op = CastRecipe->getOperand(N: 0);
6139 }
6140
6141 // If the update is a binary op, check both of its operands to see if
6142 // they are extends. Otherwise, see if the update comes directly from an
6143 // extend.
6144 std::array<VPWidenCastRecipe *, 2> CastRecipes = {};
6145
6146 // Match extends and populate CastRecipes. Returns false if matching fails.
6147 auto MatchExtends = [OuterExtKind,
6148 &CastRecipes](ArrayRef<VPValue *> Operands) {
6149 assert(Operands.size() <= 2 && "expected at most 2 operands");
6150
6151 for (const auto &[I, OpVal] : enumerate(First&: Operands)) {
6152 // Allow constant as second operand - validation happens in
6153 // isValidPartialReduction.
6154 const APInt *Unused;
6155 if (I > 0 && CastRecipes[0] && match(V: OpVal, P: m_APInt(C&: Unused)))
6156 continue;
6157
6158 VPValue *ExtInput;
6159 if (!match(V: OpVal, P: m_ZExtOrSExt(Op0: m_VPValue(V&: ExtInput))) &&
6160 !match(V: OpVal, P: m_FPExt(Op0: m_VPValue(V&: ExtInput))))
6161 return false;
6162
6163 CastRecipes[I] = dyn_cast<VPWidenCastRecipe>(Val: OpVal);
6164 if (!CastRecipes[I])
6165 return false;
6166
6167 // The outer extend kind must match the inner extends for folding.
6168 if (OuterExtKind) {
6169 auto CastOp =
6170 static_cast<Instruction::CastOps>(CastRecipes[I]->getOpcode());
6171 if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOpc: CastOp))
6172 return false;
6173 }
6174 }
6175 return CastRecipes[0] != nullptr;
6176 };
6177
6178 // If Op is a binary operator, check both of its operands to see if they are
6179 // extends. Otherwise, see if the update comes directly from an extend.
6180 auto *BinOp = dyn_cast<VPWidenRecipe>(Val: Op);
6181 if (BinOp && Instruction::isBinaryOp(Opcode: BinOp->getOpcode())) {
6182 if (!BinOp->hasOneUse())
6183 return std::nullopt;
6184
6185 // Handle neg(binop(ext, ext)) pattern.
6186 VPValue *OtherOp = nullptr;
6187 if (match(V: BinOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: OtherOp))))
6188 BinOp = dyn_cast<VPWidenRecipe>(Val: OtherOp);
6189
6190 if (!BinOp || !Instruction::isBinaryOp(Opcode: BinOp->getOpcode()) ||
6191 !MatchExtends(BinOp->operands()))
6192 return std::nullopt;
6193 } else if (match(V: UpdateR, P: m_Add(Op0: m_VPValue(), Op1: m_VPValue())) ||
6194 match(V: UpdateR, P: m_FAdd(Op0: m_VPValue(), Op1: m_VPValue()))) {
6195 // We already know Op is an operand of UpdateR.
6196 if (!MatchExtends({Op}))
6197 return std::nullopt;
6198 BinOp = UpdateR;
6199 } else {
6200 return std::nullopt;
6201 }
6202
6203 return ExtendedReductionOperand{.BinOp: BinOp, .CastRecipes: CastRecipes};
6204}
6205
6206/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6207/// and determines if the target can use a cheaper operation with a wider
6208/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6209/// of operations in the reduction.
6210static std::optional<SmallVector<VPPartialReductionChain>>
6211getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6212 VFRange &Range) {
6213 // Get the backedge value from the reduction PHI and find the
6214 // ComputeReductionResult that uses it (directly or through a select for
6215 // predicated reductions).
6216 auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR);
6217 if (!RdxResult)
6218 return std::nullopt;
6219 VPValue *ExitValue = RdxResult->getOperand(N: 0);
6220 match(V: ExitValue, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: ExitValue), Op2: m_VPValue()));
6221
6222 SmallVector<VPPartialReductionChain> Chains;
6223 RecurKind RK = RedPhiR->getRecurrenceKind();
6224 Type *PhiType = CostCtx.Types.inferScalarType(V: RedPhiR);
6225 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6226
6227 // Work backwards from the ExitValue examining each reduction operation.
6228 VPValue *CurrentValue = ExitValue;
6229 while (CurrentValue != RedPhiR) {
6230 auto *UpdateR = dyn_cast<VPWidenRecipe>(Val: CurrentValue);
6231 if (!UpdateR || !Instruction::isBinaryOp(Opcode: UpdateR->getOpcode()))
6232 return std::nullopt;
6233
6234 VPValue *Op = UpdateR->getOperand(N: 1);
6235 VPValue *PrevValue = UpdateR->getOperand(N: 0);
6236
6237 // Find the extended operand. The other operand (PrevValue) is the next link
6238 // in the reduction chain.
6239 std::optional<ExtendedReductionOperand> ExtendedOp =
6240 matchExtendedReductionOperand(UpdateR, Op);
6241 if (!ExtendedOp) {
6242 ExtendedOp = matchExtendedReductionOperand(UpdateR, Op: PrevValue);
6243 if (!ExtendedOp)
6244 return std::nullopt;
6245 std::swap(a&: Op, b&: PrevValue);
6246 }
6247
6248 Type *ExtSrcType = CostCtx.Types.inferScalarType(
6249 V: ExtendedOp->CastRecipes[0]->getOperand(N: 0));
6250 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6251 if (!PHISize.hasKnownScalarFactor(RHS: ExtSrcSize))
6252 return std::nullopt;
6253
6254 VPPartialReductionChain Chain(
6255 {.ReductionBinOp: UpdateR, .ExtendedOp: *ExtendedOp,
6256 .ScaleFactor: static_cast<unsigned>(PHISize.getKnownScalarFactor(RHS: ExtSrcSize)), .RK: RK});
6257 if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6258 return std::nullopt;
6259
6260 Chains.push_back(Elt: Chain);
6261 CurrentValue = PrevValue;
6262 }
6263
6264 // The chains were collected by traversing backwards from the exit value.
6265 // Reverse the chains so they are in program order.
6266 std::reverse(first: Chains.begin(), last: Chains.end());
6267 return Chains;
6268}
6269} // namespace
6270
6271void VPlanTransforms::createPartialReductions(VPlan &Plan,
6272 VPCostContext &CostCtx,
6273 VFRange &Range) {
6274 // Find all possible valid partial reductions, grouping chains by their PHI.
6275 // This grouping allows invalidating the whole chain, if any link is not a
6276 // valid partial reduction.
6277 MapVector<VPReductionPHIRecipe *, SmallVector<VPPartialReductionChain>>
6278 ChainsByPhi;
6279 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6280 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6281 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
6282 if (!RedPhiR)
6283 continue;
6284
6285 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6286 ChainsByPhi.try_emplace(Key: RedPhiR, Args: std::move(*Chains));
6287 }
6288
6289 if (ChainsByPhi.empty())
6290 return;
6291
6292 // Build set of partial reduction operations for extend user validation and
6293 // a map of reduction bin ops to their scale factors for scale validation.
6294 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6295 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6296 for (const auto &[_, Chains] : ChainsByPhi)
6297 for (const VPPartialReductionChain &Chain : Chains) {
6298 PartialReductionOps.insert(Ptr: Chain.ExtendedOp.BinOp);
6299 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6300 }
6301
6302 // A partial reduction is invalid if any of its extends are used by
6303 // something that isn't another partial reduction. This is because the
6304 // extends are intended to be lowered along with the reduction itself.
6305 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6306 return !Ext || all_of(Range: Ext->users(), P: [&](VPUser *U) {
6307 return PartialReductionOps.contains(Ptr: cast<VPRecipeBase>(Val: U));
6308 });
6309 };
6310
6311 // Validate chains: check that extends are only used by partial reductions,
6312 // and that reduction bin ops are only used by other partial reductions with
6313 // matching scale factors, are outside the loop region or the select
6314 // introduced by tail-folding. Otherwise we would create users of scaled
6315 // reductions where the types of the other operands don't match.
6316 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6317 for (const VPPartialReductionChain &Chain : Chains) {
6318 if (!all_of(Range: Chain.ExtendedOp.CastRecipes, P: ExtendUsersValid)) {
6319 Chains.clear();
6320 break;
6321 }
6322 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6323 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: U))
6324 return PhiR == RedPhiR;
6325 auto *R = cast<VPSingleDefRecipe>(Val: U);
6326 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(Val: R, Default: 0) ||
6327 match(R, P: m_ComputeReductionResult(
6328 Op0: m_Specific(VPV: Chain.ReductionBinOp))) ||
6329 match(R, P: m_Select(Op0: m_VPValue(), Op1: m_Specific(VPV: Chain.ReductionBinOp),
6330 Op2: m_Specific(VPV: RedPhiR)));
6331 };
6332 if (!all_of(Range: Chain.ReductionBinOp->users(), P: UseIsValid)) {
6333 Chains.clear();
6334 break;
6335 }
6336
6337 // Check if the compute-reduction-result is used by a sunk store.
6338 // TODO: Also form partial reductions in those cases.
6339 if (auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR)) {
6340 if (any_of(Range: RdxResult->users(), P: [](VPUser *U) {
6341 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
6342 return RepR && isa<StoreInst>(Val: RepR->getUnderlyingInstr());
6343 })) {
6344 Chains.clear();
6345 break;
6346 }
6347 }
6348 }
6349 }
6350
6351 for (auto &[Phi, Chains] : ChainsByPhi)
6352 for (const VPPartialReductionChain &Chain : Chains)
6353 transformToPartialReduction(Chain, TypeInfo&: CostCtx.Types, Plan, RdxPhi: Phi);
6354}
6355