1//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This is the LLVM vectorization plan. It represents a candidate for
11/// vectorization, allowing to plan and optimize how to vectorize a given loop
12/// before generating LLVM-IR.
13/// The vectorizer uses vectorization plans to estimate the costs of potential
14/// candidates and if profitable to execute the desired plan, generating vector
15/// LLVM-IR code.
16///
17//===----------------------------------------------------------------------===//
18
19#include "VPlan.h"
20#include "LoopVectorizationPlanner.h"
21#include "VPlanCFG.h"
22#include "VPlanDominatorTree.h"
23#include "VPlanHelpers.h"
24#include "VPlanPatternMatch.h"
25#include "VPlanTransforms.h"
26#include "VPlanUtils.h"
27#include "llvm/ADT/PostOrderIterator.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SmallVector.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/Twine.h"
32#include "llvm/Analysis/DomTreeUpdater.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/CFG.h"
36#include "llvm/IR/IRBuilder.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Type.h"
40#include "llvm/IR/Value.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/GraphWriter.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Transforms/Utils/BasicBlockUtils.h"
47#include "llvm/Transforms/Utils/LoopVersioning.h"
48#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
49#include <cassert>
50#include <string>
51
52using namespace llvm;
53using namespace llvm::VPlanPatternMatch;
54
55namespace llvm {
56extern cl::opt<bool> ProfcheckDisableMetadataFixes;
57} // namespace llvm
58
59/// @{
60/// Metadata attribute names
61const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
62const char LLVMLoopVectorizeFollowupVectorized[] =
63 "llvm.loop.vectorize.followup_vectorized";
64const char LLVMLoopVectorizeFollowupEpilogue[] =
65 "llvm.loop.vectorize.followup_epilogue";
66/// @}
67
// Declared extern here; defined in another translation unit (presumably
// LoopVectorize.cpp -- verify against the definition site).
extern cl::opt<unsigned> ForceTargetInstructionCost;

// Declared extern here; defined elsewhere.
extern cl::opt<unsigned> NumberOfStoresToPredicate;

// Controls whether VPlan dumps use DOT graph syntax instead of plain text.
static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));
75
76#define DEBUG_TYPE "loop-vectorize"
77
78#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
79raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
80 const VPBasicBlock *Parent = R.getParent();
81 VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr);
82 R.print(OS, "", SlotTracker);
83 return OS;
84}
85#endif
86
87Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
88 const ElementCount &VF) const {
89 switch (LaneKind) {
90 case VPLane::Kind::ScalableLast:
91 // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
92 return Builder.CreateSub(LHS: getRuntimeVF(B&: Builder, Ty: Builder.getInt32Ty(), VF),
93 RHS: Builder.getInt32(C: VF.getKnownMinValue() - Lane));
94 case VPLane::Kind::First:
95 return Builder.getInt32(C: Lane);
96 }
97 llvm_unreachable("Unknown lane kind");
98}
99
100#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
101void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
102 if (const VPRecipeBase *R = getDefiningRecipe())
103 R->print(OS, "", SlotTracker);
104 else
105 printAsOperand(OS, SlotTracker);
106}
107
108void VPValue::dump() const {
109 const VPRecipeBase *Instr = getDefiningRecipe();
110 VPSlotTracker SlotTracker(
111 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
112 print(dbgs(), SlotTracker);
113 dbgs() << "\n";
114}
115
116void VPRecipeBase::dump() const {
117 VPSlotTracker SlotTracker(getParent() ? getParent()->getPlan() : nullptr);
118 print(dbgs(), "", SlotTracker);
119 dbgs() << "\n";
120}
121#endif
122
123#if !defined(NDEBUG)
124bool VPRecipeValue::isDefinedBy(const VPDef *D) const { return Def == D; }
125#endif
126
// Return the recipe defining this value, or nullptr if this value has no
// defining recipe (i.e. it is not a VPRecipeValue).
VPRecipeBase *VPValue::getDefiningRecipe() {
  auto *DefValue = dyn_cast<VPRecipeValue>(Val: this);
  return DefValue ? DefValue->Def : nullptr;
}

// Const overload of the above.
const VPRecipeBase *VPValue::getDefiningRecipe() const {
  auto *DefValue = dyn_cast<VPRecipeValue>(Val: this);
  return DefValue ? DefValue->Def : nullptr;
}
136
// Return the underlying IR value of a live-in. The cast asserts that this
// value actually is a VPIRValue.
Value *VPValue::getLiveInIRValue() const {
  return cast<VPIRValue>(Val: this)->getValue();
}

// A live-in's type is the type of its underlying IR value.
Type *VPIRValue::getType() const { return getUnderlyingValue()->getType(); }
142
// Construct a value defined by recipe \p Def, optionally tracking the
// underlying IR value \p UV, and register it with the defining recipe.
VPRecipeValue::VPRecipeValue(VPRecipeBase *Def, Value *UV)
    : VPValue(VPVRecipeValueSC, UV), Def(Def) {
  assert(Def && "VPRecipeValue requires a defining recipe");
  Def->addDefinedValue(V: this);
}

VPRecipeValue::~VPRecipeValue() {
  assert(Users.empty() &&
         "trying to delete a VPRecipeValue with remaining users");
  // Unregister from the defining recipe before this value goes away.
  Def->removeDefinedValue(V: this);
}
154
155// Get the top-most entry block of \p Start. This is the entry block of the
156// containing VPlan. This function is templated to support both const and non-const blocks
157template <typename T> static T *getPlanEntry(T *Start) {
158 T *Next = Start;
159 T *Current = Start;
160 while ((Next = Next->getParent()))
161 Current = Next;
162
163 SmallSetVector<T *, 8> WorkList;
164 WorkList.insert(Current);
165
166 for (unsigned i = 0; i < WorkList.size(); i++) {
167 T *Current = WorkList[i];
168 if (!Current->hasPredecessors())
169 return Current;
170 auto &Predecessors = Current->getPredecessors();
171 WorkList.insert_range(Predecessors);
172 }
173
174 llvm_unreachable("VPlan without any entry node without predecessors");
175}
176
// The plan pointer is only stored on the plan's entry block; locate it first.
VPlan *VPBlockBase::getPlan() { return getPlanEntry(Start: this)->Plan; }

const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(Start: this)->Plan; }
180
181/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
182const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
183 const VPBlockBase *Block = this;
184 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
185 Block = Region->getEntry();
186 return cast<VPBasicBlock>(Val: Block);
187}
188
189VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
190 VPBlockBase *Block = this;
191 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
192 Block = Region->getEntry();
193 return cast<VPBasicBlock>(Val: Block);
194}
195
// Cache the owning plan on this block; only legal on the plan's entry block,
// since getPlan() looks the pointer up there.
void VPBlockBase::setPlan(VPlan *ParentPlan) {
  assert(ParentPlan->getEntry() == this && "Can only set plan on its entry.");
  Plan = ParentPlan;
}
200
201/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
202const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
203 const VPBlockBase *Block = this;
204 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
205 Block = Region->getExiting();
206 return cast<VPBasicBlock>(Val: Block);
207}
208
209VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
210 VPBlockBase *Block = this;
211 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
212 Block = Region->getExiting();
213 return cast<VPBasicBlock>(Val: Block);
214}
215
216VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
217 if (!Successors.empty() || !Parent)
218 return this;
219 assert(Parent->getExiting() == this &&
220 "Block w/o successors not the exiting block of its parent.");
221 return Parent->getEnclosingBlockWithSuccessors();
222}
223
224VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
225 if (!Predecessors.empty() || !Parent)
226 return this;
227 assert(Parent->getEntry() == this &&
228 "Block w/o predecessors not the entry of its parent.");
229 return Parent->getEnclosingBlockWithPredecessors();
230}
231
232VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
233 iterator It = begin();
234 while (It != end() && It->isPhi())
235 It++;
236 return It;
237}
238
// Bundle together everything needed while generating IR from a VPlan: target
// info, chosen VF, analyses to keep updated, the IR builder, and per-plan
// type analysis / dominator tree built from \p Plan.
VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
                                   ElementCount VF, LoopInfo *LI,
                                   DominatorTree *DT, AssumptionCache *AC,
                                   IRBuilderBase &Builder, VPlan *Plan,
                                   Loop *CurrentParentLoop, Type *CanonicalIVTy)
    : TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan),
      CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan) {}
246
// Return the generated scalar IR value for \p Def at lane \p Lane, extracting
// it from a cached vector value if no per-lane scalar is cached.
Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
  // Live-ins and symbolic values are lane-invariant; use the IR value as-is.
  if (isa<VPIRValue, VPSymbolicValue>(Val: Def))
    return Def->getUnderlyingValue();

  // Fast path: a scalar for exactly this lane is already cached.
  if (hasScalarValue(Def, Lane))
    return Data.VPV2Scalars[Def][Lane.mapToCacheIndex(VF)];

  // Single-scalar values are identical across lanes; reuse the lane-0 value.
  if (!Lane.isFirstLane() && vputils::isSingleScalar(VPV: Def) &&
      hasScalarValue(Def, Lane: VPLane::getFirstLane())) {
    return Data.VPV2Scalars[Def][0];
  }

  // Look through BuildVector to avoid redundant extracts.
  // TODO: Remove once replicate regions are unrolled explicitly.
  if (Lane.getKind() == VPLane::Kind::First && match(V: Def, P: m_BuildVector())) {
    auto *BuildVector = cast<VPInstruction>(Val: Def);
    return get(Def: BuildVector->getOperand(N: Lane.getKnownLane()), IsScalar: true);
  }

  // Otherwise a vector value must exist; extract the requested lane from it.
  assert(hasVectorValue(Def));
  auto *VecPart = Data.VPV2Vector[Def];
  if (!VecPart->getType()->isVectorTy()) {
    // A non-vector "vector value" can only be read at lane 0.
    assert(Lane.isFirstLane() && "cannot get lane > 0 for scalar");
    return VecPart;
  }
  // TODO: Cache created scalar values.
  Value *LaneV = Lane.getAsRuntimeExpr(Builder, VF);
  auto *Extract = Builder.CreateExtractElement(Vec: VecPart, Idx: LaneV);
  // set(Def, Extract, Instance);
  return Extract;
}
278
// Return the generated IR value for \p Def: the lane-0 scalar if
// \p NeedsScalar, otherwise the vector value, broadcasting a cached scalar
// into a vector on demand.
Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
  if (NeedsScalar) {
    // Requesting a single scalar is only sound if Def is uniform or only its
    // first lane is used (or exactly one scalar has been cached for it).
    assert((VF.isScalar() || isa<VPIRValue, VPSymbolicValue>(Def) ||
            hasVectorValue(Def) || !vputils::onlyFirstLaneUsed(Def) ||
            (hasScalarValue(Def, VPLane(0)) &&
             Data.VPV2Scalars[Def].size() == 1)) &&
           "Trying to access a single scalar per part but has multiple scalars "
           "per part.");
    return get(Def, Lane: VPLane(0));
  }

  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def))
    return Data.VPV2Vector[Def];

  // Splat a scalar across all lanes (no-op when not vectorizing).
  auto GetBroadcastInstrs = [this](Value *V) {
    if (VF.isScalar())
      return V;
    // Broadcast the scalar into all locations in the vector.
    Value *Shuf = Builder.CreateVectorSplat(EC: VF, V, Name: "broadcast");
    return Shuf;
  };

  if (!hasScalarValue(Def, Lane: {0})) {
    // Nothing cached at all: Def must be a live-in; broadcast its IR value
    // and cache the result as the vector value.
    Value *IRV = Def->getLiveInIRValue();
    Value *B = GetBroadcastInstrs(IRV);
    set(Def, V: B);
    return B;
  }

  Value *ScalarValue = get(Def, Lane: VPLane(0));
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, V: ScalarValue);
    return ScalarValue;
  }

  bool IsSingleScalar = vputils::isSingleScalar(VPV: Def);
  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);

  // We need to construct the vector value for a single-scalar value by
  // broadcasting the scalar to all lanes.
  // TODO: Replace by introducing Broadcast VPInstructions.
  assert(IsSingleScalar && "must be a single-scalar at this point");
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto *LastInst = cast<Instruction>(Val: get(Def, Lane: LastLane));
  auto NewIP = isa<PHINode>(Val: LastInst)
                   ? LastInst->getParent()->getFirstNonPHIIt()
                   : std::next(x: BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);
  Value *VectorValue = GetBroadcastInstrs(ScalarValue);
  set(Def, V: VectorValue);
  // Restore the caller's insertion point.
  Builder.restoreIP(IP: OldIP);
  return VectorValue;
}
338
// Set the builder's current debug location from \p DL, scaling the profile
// discriminator by UF * VF when profiling debug info is emitted without
// FSDiscriminators.
void VPTransformState::setDebugLocFrom(DebugLoc DL) {
  const DILocation *DIL = DL;
  // When a FSDiscriminator is enabled, we don't need to add the multiply
  // factors to the discriminators.
  if (DIL &&
      Builder.GetInsertBlock()
          ->getParent()
          ->shouldEmitDebugInfoForProfiling() &&
      !EnableFSDiscriminator) {
    // FIXME: For scalable vectors, assume vscale=1.
    unsigned UF = Plan->getConcreteUF();
    auto NewDIL =
        DIL->cloneByMultiplyingDuplicationFactor(DF: UF * VF.getKnownMinValue());
    if (NewDIL)
      Builder.SetCurrentDebugLocation(*NewDIL);
    else
      // Cloning can fail; keep going and just log in debug builds.
      LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                        << DIL->getFilename() << " Line: " << DIL->getLine());
  } else
    Builder.SetCurrentDebugLocation(DL);
}
360
// Insert the scalar of \p Def at lane \p Lane into the vectorized value
// \p WideValue and return the updated wide value. Struct-typed wide values
// are handled element-wise: each struct member is itself a vector.
Value *VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,
                                                       Value *WideValue,
                                                       const VPLane &Lane) {
  Value *ScalarInst = get(Def, Lane);
  Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
  if (auto *StructTy = dyn_cast<StructType>(Val: WideValue->getType())) {
    // We must handle each element of a vectorized struct type.
    for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
      Value *ScalarValue = Builder.CreateExtractValue(Agg: ScalarInst, Idxs: I);
      Value *VectorValue = Builder.CreateExtractValue(Agg: WideValue, Idxs: I);
      VectorValue =
          Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneExpr);
      WideValue = Builder.CreateInsertValue(Agg: WideValue, Val: VectorValue, Idxs: I);
    }
  } else {
    WideValue = Builder.CreateInsertElement(Vec: WideValue, NewElt: ScalarInst, Idx: LaneExpr);
  }
  return WideValue;
}
380
381BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
382 auto &CFG = State.CFG;
383 // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
384 // Pred stands for Predessor. Prev stands for Previous - last visited/created.
385 BasicBlock *PrevBB = CFG.PrevBB;
386 BasicBlock *NewBB = BasicBlock::Create(Context&: PrevBB->getContext(), Name: getName(),
387 Parent: PrevBB->getParent(), InsertBefore: CFG.ExitBB);
388 LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
389
390 return NewBB;
391}
392
// Wire the IR block generated for this VPBB into the IR CFG: register it with
// LoopInfo and retarget each already-emitted predecessor's terminator at it.
void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
  auto &CFG = State.CFG;
  BasicBlock *NewBB = CFG.VPBB2IRBB[this];

  // Register NewBB in its loop. In innermost loops its the same for all
  // BB's.
  Loop *ParentLoop = State.CurrentParentLoop;
  // If this block has a sole successor that is an exit block or is an exit
  // block itself then it needs adding to the same parent loop as the exit
  // block.
  VPBlockBase *SuccOrExitVPB = getSingleSuccessor();
  SuccOrExitVPB = SuccOrExitVPB ? SuccOrExitVPB : this;
  if (State.Plan->isExitBlock(VPBB: SuccOrExitVPB)) {
    ParentLoop = State.LI->getLoopFor(
        BB: cast<VPIRBasicBlock>(Val: SuccOrExitVPB)->getIRBasicBlock());
  }

  if (ParentLoop && !State.LI->getLoopFor(BB: NewBB))
    ParentLoop->addBasicBlockToLoop(NewBB, LI&: *State.LI);

  SmallVector<VPBlockBase *> Preds;
  if (VPBlockUtils::isHeader(VPB: this, VPDT: State.VPDT)) {
    // There's no block for the latch yet, connect to the preheader only.
    Preds = {getPredecessors()[0]};
  } else {
    Preds = to_vector(Range&: getPredecessors());
  }

  // Hook up the new basic block to its predecessors.
  for (VPBlockBase *PredVPBlock : Preds) {
    VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
    auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
    assert(CFG.VPBB2IRBB.contains(PredVPBB) &&
           "Predecessor basic-block not found building successor.");
    BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
    auto *PredBBTerminator = PredBB->getTerminator();
    LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');

    if (isa<UnreachableInst>(Val: PredBBTerminator)) {
      // Placeholder terminator: replace it with an unconditional branch to
      // NewBB, preserving its debug location.
      assert(PredVPSuccessors.size() == 1 &&
             "Predecessor ending w/o branch must have single successor.");
      DebugLoc DL = PredBBTerminator->getDebugLoc();
      PredBBTerminator->eraseFromParent();
      auto *Br = UncondBrInst::Create(Target: NewBB, InsertBefore: PredBB);
      Br->setDebugLoc(DL);
    } else if (auto *UBI = dyn_cast<UncondBrInst>(Val: PredBBTerminator)) {
      // Existing unconditional branch: simply point it at NewBB.
      UBI->setSuccessor(NewBB);
    } else {
      // Set each forward successor here when it is created, excluding
      // backedges. A backward successor is set when the branch is created.
      // Branches to VPIRBasicBlocks must have the same successors in VPlan as
      // in the original IR, except when the predecessor is the entry block.
      // This enables including SCEV and memory runtime check blocks in VPlan.
      // TODO: Remove exception by modeling the terminator of entry block using
      // BranchOnCond.
      unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
      auto *TermBr = cast<CondBrInst>(Val: PredBBTerminator);
      assert((!TermBr->getSuccessor(idx) ||
              (isa<VPIRBasicBlock>(this) &&
               (TermBr->getSuccessor(idx) == NewBB ||
                PredVPBlock == getPlan()->getEntry()))) &&
             "Trying to reset an existing successor block.");
      TermBr->setSuccessor(idx, NewSucc: NewBB);
    }
    // Keep the dominator tree in sync with the newly added edge.
    CFG.DTU.applyUpdates(Updates: {{DominatorTree::Insert, PredBB, NewBB}});
  }
}
460
// Execute a VPBB that wraps a pre-existing IR block: emit its recipes into
// that block and hook the block into the generated CFG.
void VPIRBasicBlock::execute(VPTransformState *State) {
  assert(getHierarchicalSuccessors().size() <= 2 &&
         "VPIRBasicBlock can have at most two successors at the moment!");
  // Move completely disconnected blocks to their final position.
  if (IRBB->hasNPredecessors(N: 0) && succ_begin(BB: IRBB) == succ_end(BB: IRBB))
    IRBB->moveAfter(MovePos: State->CFG.PrevBB);
  State->Builder.SetInsertPoint(IRBB->getTerminator());
  State->CFG.PrevBB = IRBB;
  State->CFG.VPBB2IRBB[this] = IRBB;
  executeRecipes(State, BB: IRBB);
  // Create a branch instruction to terminate IRBB if one was not created yet
  // and is needed.
  if (getSingleSuccessor() && isa<UnreachableInst>(Val: IRBB->getTerminator())) {
    // Placeholder branch; its target is nulled here and filled in when the
    // successor block is connected.
    auto *Br = State->Builder.CreateBr(Dest: IRBB);
    Br->setOperand(i_nocapture: 0, Val_nocapture: nullptr);
    IRBB->getTerminator()->eraseFromParent();
  } else {
    assert((getNumSuccessors() == 0 ||
            isa<UncondBrInst, CondBrInst>(IRBB->getTerminator())) &&
           "other blocks must be terminated by a branch");
  }

  connectToPredecessors(State&: *State);
}
485
486VPIRBasicBlock *VPIRBasicBlock::clone() {
487 auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB);
488 for (VPRecipeBase &R : Recipes)
489 NewBlock->appendRecipe(Recipe: R.clone());
490 return NewBlock;
491}
492
// Generate IR for this VPBB: create (or reuse) an IR basic block, maintain
// LoopInfo when entering/leaving a loop, and emit all recipes.
void VPBasicBlock::execute(VPTransformState *State) {
  // A set State->Lane means we are replicating inside a replicate region.
  bool Replica = bool(State->Lane);
  BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.

  if (VPBlockUtils::isHeader(VPB: this, VPDT: State->VPDT)) {
    // Create and register the new vector loop.
    Loop *PrevParentLoop = State->CurrentParentLoop;
    State->CurrentParentLoop = State->LI->AllocateLoop();

    // Insert the new loop into the loop nest and register the new basic blocks
    // before calling any utilities such as SCEV that require valid LoopInfo.
    if (PrevParentLoop)
      PrevParentLoop->addChildLoop(NewChild: State->CurrentParentLoop);
    else
      State->LI->addTopLevelLoop(New: State->CurrentParentLoop);
  }

  auto IsReplicateRegion = [](VPBlockBase *BB) {
    auto *R = dyn_cast_or_null<VPRegionBlock>(Val: BB);
    assert((!R || R->isReplicator()) &&
           "only replicate region blocks should remain");
    return R;
  };
  // 1. Create an IR basic block.
  if ((Replica && this == getParent()->getEntry()) ||
      IsReplicateRegion(getSingleHierarchicalPredecessor())) {
    // Reuse the previous basic block if the current VPBB is either
    // * the entry to a replicate region, or
    // * the exit of a replicate region.
    State->CFG.VPBB2IRBB[this] = NewBB;
  } else {
    NewBB = createEmptyBasicBlock(State&: *State);

    State->Builder.SetInsertPoint(NewBB);
    // Temporarily terminate with unreachable until CFG is rewired.
    UnreachableInst *Terminator = State->Builder.CreateUnreachable();
    State->Builder.SetInsertPoint(Terminator);

    State->CFG.PrevBB = NewBB;
    State->CFG.VPBB2IRBB[this] = NewBB;
    connectToPredecessors(State&: *State);
  }

  // 2. Fill the IR basic block with IR instructions.
  executeRecipes(State, BB: NewBB);

  // If this block is a latch, update CurrentParentLoop.
  if (VPBlockUtils::isLatch(VPB: this, VPDT: State->VPDT))
    State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
}
543
544VPBasicBlock *VPBasicBlock::clone() {
545 auto *NewBlock = getPlan()->createVPBasicBlock(Name: getName());
546 for (VPRecipeBase &R : *this)
547 NewBlock->appendRecipe(Recipe: R.clone());
548 return NewBlock;
549}
550
// Emit all recipes of this VPBB into IR block \p BB, setting the debug
// location before each recipe executes.
void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
  LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB: " << getName()
                    << " in BB: " << BB->getName() << '\n');

  // Remember this block as the last visited VPBB.
  State->CFG.PrevVPBB = this;

  for (VPRecipeBase &Recipe : Recipes) {
    State->setDebugLocFrom(Recipe.getDebugLoc());
    Recipe.execute(State&: *State);
  }

  LLVM_DEBUG(dbgs() << "LV: filled BB: " << *BB);
}
564
// Split this block at \p SplitAt: recipes from SplitAt onwards move into a
// new ".split" block inserted after this one, which also inherits the
// exiting role if this block was its region's exiting block.
VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
  assert((SplitAt == end() || SplitAt->getParent() == this) &&
         "can only split at a position in the same block");

  // Create new empty block after the block to split.
  auto *SplitBlock = getPlan()->createVPBasicBlock(Name: getName() + ".split");
  VPBlockUtils::insertBlockAfter(NewBlock: SplitBlock, BlockPtr: this);

  // If this is the exiting block, make the split the new exiting block.
  auto *ParentRegion = getParent();
  if (ParentRegion && ParentRegion->getExiting() == this)
    ParentRegion->setExiting(SplitBlock);

  // Finally, move the recipes starting at SplitAt to new block.
  for (VPRecipeBase &ToMove :
       make_early_inc_range(Range: make_range(x: SplitAt, y: this->end())))
    ToMove.moveBefore(BB&: *SplitBlock, I: SplitBlock->end());

  return SplitBlock;
}
585
586/// Return the enclosing loop region for region \p P. The templated version is
587/// used to support both const and non-const block arguments.
template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
  if (!P || !P->isReplicator())
    return P;
  // P is a replicate region: step out one level. Multiple loop regions can be
  // nested, but replicate regions can only be nested inside a loop region or
  // must be outside any other region.
  T *Enclosing = P->getParent();
  assert((!Enclosing || !Enclosing->isReplicator()) &&
         "unexpected nested replicate regions");
  return Enclosing;
}
597
// Return the innermost enclosing loop region, skipping a replicate region if
// this block is directly inside one; nullptr when outside any loop region.
VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
  return getEnclosingLoopRegionForRegion(P: getParent());
}

const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
  return getEnclosingLoopRegionForRegion(P: getParent());
}
605
// Returns true if \p VPBB ends with a recipe that acts as a conditional
// terminator (conditional branch, branch-on-two-conds, or switch), asserting
// consistency between the successor count and the terminating recipe kind.
static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
  if (VPBB->empty()) {
    assert(
        VPBB->getNumSuccessors() < 2 &&
        "block with multiple successors doesn't have a recipe as terminator");
    return false;
  }

  // Classify the last recipe; the flags are only consumed by the asserts.
  const VPRecipeBase *R = &VPBB->back();
  [[maybe_unused]] bool IsSwitch =
      isa<VPInstruction>(Val: R) &&
      cast<VPInstruction>(Val: R)->getOpcode() == Instruction::Switch;
  [[maybe_unused]] bool IsBranchOnTwoConds = match(V: R, P: m_BranchOnTwoConds());
  [[maybe_unused]] bool IsCondBranch =
      isa<VPBranchOnMaskRecipe>(Val: R) ||
      match(V: R, P: m_CombineOr(L: m_BranchOnCond(), R: m_BranchOnCount()));
  // Two successors, or the exiting block of a non-replicate region, implies a
  // conditional terminator.
  if (VPBB->getNumSuccessors() == 2 ||
      (VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
    assert((IsCondBranch || IsSwitch || IsBranchOnTwoConds) &&
           "block with multiple successors not terminated by "
           "conditional branch nor switch recipe");

    return true;
  }

  if (VPBB->getNumSuccessors() > 2) {
    assert((IsSwitch || IsBranchOnTwoConds) &&
           "block with more than 2 successors not terminated by a switch or "
           "branch-on-two-conds recipe");
    return true;
  }

  assert(
      !IsCondBranch && !IsBranchOnTwoConds &&
      "block with 0 or 1 successors terminated by conditional branch recipe");
  return false;
}
643
644VPRecipeBase *VPBasicBlock::getTerminator() {
645 if (hasConditionalTerminator(VPBB: this))
646 return &back();
647 return nullptr;
648}
649
650const VPRecipeBase *VPBasicBlock::getTerminator() const {
651 if (hasConditionalTerminator(VPBB: this))
652 return &back();
653 return nullptr;
654}
655
// A block is exiting iff it is the exiting basic block of its parent region.
bool VPBasicBlock::isExiting() const {
  return getParent() && getParent()->getExitingBasicBlock() == this;
}
659
660#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Convenience overload: print with a fresh slot tracker and no indentation.
void VPBlockBase::print(raw_ostream &O) const {
  VPSlotTracker SlotTracker(getPlan());
  print(O, "", SlotTracker);
}
665
666void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
667 if (!hasSuccessors()) {
668 O << Indent << "No successors\n";
669 } else {
670 O << Indent << "Successor(s): ";
671 ListSeparator LS;
672 for (auto *Succ : getSuccessors())
673 O << LS << Succ->getName();
674 O << '\n';
675 }
676}
677
// Print the block header, each recipe indented one level, then successors.
void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
                         VPSlotTracker &SlotTracker) const {
  O << Indent << getName() << ":\n";

  auto RecipeIndent = Indent + "  ";
  for (const VPRecipeBase &Recipe : *this) {
    Recipe.print(O, RecipeIndent, SlotTracker);
    O << '\n';
  }

  printSuccessors(O, Indent);
}
690#endif
691
// Clone the block graph reachable from \p Entry and return the pair of
// (new entry, new exiting); the exiting block is nullptr when \p Entry is not
// inside a region.
std::pair<VPBlockBase *, VPBlockBase *>
VPBlockUtils::cloneFrom(VPBlockBase *Entry) {
  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
  VPBlockBase *Exiting = nullptr;
  bool InRegion = Entry->getParent();
  // First, clone blocks reachable from Entry.
  for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
    VPBlockBase *NewBB = BB->clone();
    Old2NewVPBlocks[BB] = NewBB;
    // A successor-less block inside a region is the region's exiting block.
    if (InRegion && BB->getNumSuccessors() == 0) {
      assert(!Exiting && "Multiple exiting blocks?");
      Exiting = BB;
    }
  }
  assert((!InRegion || Exiting) && "regions must have a single exiting block");

  // Second, update the predecessors & successors of the cloned blocks.
  for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
    VPBlockBase *NewBB = Old2NewVPBlocks[BB];
    SmallVector<VPBlockBase *> NewPreds;
    for (VPBlockBase *Pred : BB->getPredecessors()) {
      NewPreds.push_back(Elt: Old2NewVPBlocks[Pred]);
    }
    NewBB->setPredecessors(NewPreds);
    SmallVector<VPBlockBase *> NewSuccs;
    for (VPBlockBase *Succ : BB->successors()) {
      NewSuccs.push_back(Elt: Old2NewVPBlocks[Succ]);
    }
    NewBB->setSuccessors(NewSuccs);
  }

#if !defined(NDEBUG)
  // Verify that the order of predecessors and successors matches in the cloned
  // version.
  for (const auto &[OldBB, NewBB] :
       zip(vp_depth_first_shallow(Entry),
           vp_depth_first_shallow(Old2NewVPBlocks[Entry]))) {
    for (const auto &[OldPred, NewPred] :
         zip(OldBB->getPredecessors(), NewBB->getPredecessors()))
      assert(NewPred == Old2NewVPBlocks[OldPred] && "Different predecessors");

    for (const auto &[OldSucc, NewSucc] :
         zip(OldBB->successors(), NewBB->successors()))
      assert(NewSucc == Old2NewVPBlocks[OldSucc] && "Different successors");
  }
#endif

  return std::make_pair(x&: Old2NewVPBlocks[Entry],
                        y: Exiting ? Old2NewVPBlocks[Exiting] : nullptr);
}
742
// Deep-clone this region: clone the contained block graph, wrap it in a new
// region of the same kind (replicate or loop), and re-parent the clones.
VPRegionBlock *VPRegionBlock::clone() {
  const auto &[NewEntry, NewExiting] = VPBlockUtils::cloneFrom(Entry: getEntry());
  VPlan &Plan = *getPlan();
  VPRegionBlock *NewRegion =
      isReplicator()
          ? Plan.createReplicateRegion(Entry: NewEntry, Exiting: NewExiting, Name: getName())
          : Plan.createLoopRegion(Name: getName(), Entry: NewEntry, Exiting: NewExiting);

  for (VPBlockBase *Block : vp_depth_first_shallow(G: NewEntry))
    Block->setParent(NewRegion);
  return NewRegion;
}
755
// Execute a replicate region: generate its blocks once per vector lane, with
// State->Lane selecting the current lane for the contained recipes.
void VPRegionBlock::execute(VPTransformState *State) {
  assert(isReplicator() &&
         "Loop regions should have been lowered to plain CFG");
  assert(!State->Lane && "Replicating a Region with non-null instance.");
  assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");

  // Compute the traversal order once and reuse it for every lane.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Entry);
  State->Lane = VPLane(0);
  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
    State->Lane = VPLane(Lane, VPLane::Kind::First);
    // Visit the VPBlocks connected to \p this, starting from it.
    for (VPBlockBase *Block : RPOT) {
      LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
      Block->execute(State);
    }
  }

  // Exit replicating mode.
  State->Lane.reset();
}
777
778InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
779 InstructionCost Cost = 0;
780 for (VPRecipeBase &R : Recipes)
781 Cost += R.cost(VF, Ctx);
782 return Cost;
783}
784
// Return the \p Idx'th CFG predecessor of this block, resolving region entry
// blocks to (preheader, backedge source) and always yielding the
// predecessor's exiting basic block.
const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
  const VPBlockBase *Pred = nullptr;
  if (hasPredecessors()) {
    Pred = getPredecessors()[Idx];
  } else {
    auto *Region = getParent();
    assert(Region && !Region->isReplicator() && Region->getEntry() == this &&
           "must be in the entry block of a non-replicate region");
    assert(Idx < 2 && Region->getNumPredecessors() == 1 &&
           "loop region has a single predecessor (preheader), its entry block "
           "has 2 incoming blocks");

    // Idx == 0 selects the predecessor of the region, Idx == 1 selects the
    // region itself whose exiting block feeds the phi across the backedge.
    Pred = Idx == 0 ? Region->getSinglePredecessor() : Region;
  }
  return Pred->getExitingBasicBlock();
}
803
// Cost a region: for a loop region, the sum of all blocks plus the backedge
// branch; for a replicate region, the cost of its conditionally executed
// "then" block (invalid for scalable VFs).
InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
  if (!isReplicator()) {
    InstructionCost Cost = 0;
    for (VPBlockBase *Block : vp_depth_first_shallow(G: getEntry()))
      Cost += Block->cost(VF, Ctx);
    // Add the cost of the loop backedge branch, honoring the forced override.
    InstructionCost BackedgeCost =
        ForceTargetInstructionCost.getNumOccurrences()
            ? InstructionCost(ForceTargetInstructionCost)
            : Ctx.TTI.getCFInstrCost(Opcode: Instruction::UncondBr, CostKind: Ctx.CostKind);
    LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
                      << ": vector loop backedge\n");
    Cost += BackedgeCost;
    return Cost;
  }

  // Compute the cost of a replicate region. Replicating isn't supported for
  // scalable vectors, return an invalid cost for them.
  // TODO: Discard scalable VPlans with replicate recipes earlier after
  // construction.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  // Compute and return the cost of the conditionally executed recipes.
  assert(VF.isVector() && "Can only compute vector cost at the moment.");
  VPBasicBlock *Then = cast<VPBasicBlock>(Val: getEntry()->getSuccessors()[0]);
  return Then->cost(VF, Ctx);
}
831
832#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the region header (<xVFxUF> for replicate regions, <x1> for loop
// regions), the contained blocks indented, then the successors.
void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
  auto NewIndent = Indent + "  ";
  for (auto *BlockBase : vp_depth_first_shallow(Entry)) {
    O << '\n';
    BlockBase->print(O, NewIndent, SlotTracker);
  }
  O << Indent << "}\n";

  printSuccessors(O, Indent);
}
845#endif
846
// Flatten this loop region into plain CFG: replace a canonical IV recipe with
// a scalar phi, re-parent contained blocks, and rewire preheader -> header
// plus the latch -> header backedge directly.
void VPRegionBlock::dissolveToCFGLoop() {
  auto *Header = cast<VPBasicBlock>(Val: getEntry());
  if (auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(Val: &Header->front())) {
    assert(this == getPlan()->getVectorLoopRegion() &&
           "Canonical IV must be in the entry of the top-level loop region");
    // Lower the canonical IV to an explicit scalar phi with the same start
    // and backedge values.
    auto *ScalarR = VPBuilder(CanIV).createScalarPhi(
        IncomingValues: {CanIV->getStartValue(), CanIV->getBackedgeValue()},
        DL: CanIV->getDebugLoc(), Name: "index");
    CanIV->replaceAllUsesWith(New: ScalarR);
    CanIV->eraseFromParent();
  }

  VPBlockBase *Preheader = getSinglePredecessor();
  auto *ExitingLatch = cast<VPBasicBlock>(Val: getExiting());

  // Detach the region from the surrounding CFG before splicing its contents.
  VPBlockUtils::disconnectBlocks(From: Preheader, To: this);

  // Contained blocks now live directly in this region's parent.
  for (VPBlockBase *VPB : vp_depth_first_shallow(G: Entry))
    VPB->setParent(getParent());

  VPBlockUtils::connectBlocks(From: Preheader, To: Header);
  VPBlockUtils::transferSuccessors(Old: this, New: ExitingLatch);
  // Re-create the backedge explicitly.
  VPBlockUtils::connectBlocks(From: ExitingLatch, To: Header);
}
871
/// Construct a VPlan for loop \p L: the entry wraps L's preheader, the scalar
/// header wraps L's header, and each unique exit block of L gets a wrapper.
VPlan::VPlan(Loop *L) {
  setEntry(createVPIRBasicBlock(IRBB: L->getLoopPreheader()));
  ScalarHeader = createVPIRBasicBlock(IRBB: L->getHeader());

  // Wrap each unique IR exit block in a VPIRBasicBlock.
  SmallVector<BasicBlock *> IRExitBlocks;
  L->getUniqueExitBlocks(ExitBlocks&: IRExitBlocks);
  for (BasicBlock *EB : IRExitBlocks)
    ExitBlocks.push_back(Elt: createVPIRBasicBlock(IRBB: EB));
}
881
/// Destroy the plan. All operands and defined values are first redirected to a
/// dummy value so blocks can be deleted in any order without dangling uses.
VPlan::~VPlan() {
  VPSymbolicValue DummyValue;

  for (auto *VPB : CreatedBlocks) {
    if (auto *VPBB = dyn_cast<VPBasicBlock>(Val: VPB)) {
      // Replace all operands of recipes and all VPValues defined in VPBB with
      // DummyValue so the block can be deleted.
      for (VPRecipeBase &R : *VPBB) {
        for (auto *Def : R.definedValues())
          Def->replaceAllUsesWith(New: &DummyValue);

        for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
          R.setOperand(I, New: &DummyValue);
      }
    }
    delete VPB;
  }
  // Live-ins and the backedge-taken count are owned by the plan as well.
  for (VPValue *VPV : getLiveIns())
    delete VPV;
  delete BackedgeTakenCount;
}
903
904VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const {
905 auto Iter = find_if(Range: getExitBlocks(), P: [IRBB](const VPIRBasicBlock *VPIRBB) {
906 return VPIRBB->getIRBasicBlock() == IRBB;
907 });
908 assert(Iter != getExitBlocks().end() && "no exit block found");
909 return *Iter;
910}
911
912bool VPlan::isExitBlock(VPBlockBase *VPBB) {
913 return is_contained(Range&: ExitBlocks, Element: VPBB);
914}
915
/// No-op pass, used only so RUN_VPLAN_PASS prints the final VPlan.
static void printFinalVPlan(VPlan &) {}
918
/// Generate the code inside the preheader and body of the vectorized loop.
/// Assumes a single pre-header basic-block was created for this. Introduce
/// additional basic-blocks as needed, and fill them all.
void VPlan::execute(VPTransformState *State) {
  // Initialize CFG state.
  State->CFG.PrevVPBB = nullptr;
  State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();

  // Update VPDominatorTree since VPBasicBlock may be removed after State was
  // constructed.
  State->VPDT.recalculate(Func&: *this);

  // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
  BasicBlock *VectorPreHeader = State->CFG.PrevBB;
  cast<UncondBrInst>(Val: VectorPreHeader->getTerminator())->setSuccessor(nullptr);
  State->CFG.DTU.applyUpdates(
      Updates: {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});

  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
                    << ", UF=" << getConcreteUF() << '\n');
  setName("Final VPlan");
  // TODO: RUN_VPLAN_PASS/VPlanTransforms::runPass should automatically dump
  // VPlans after some specific stages when "-debug" is specified, but that
  // hasn't been implemented yet. For now, just do both:
  LLVM_DEBUG(dump());
  RUN_VPLAN_PASS(printFinalVPlan, *this);

  BasicBlock *ScalarPh = State->CFG.ExitBB;
  VPBasicBlock *ScalarPhVPBB = getScalarPreheader();
  if (ScalarPhVPBB->hasPredecessors()) {
    // Disconnect scalar preheader and scalar header, as the dominator tree edge
    // will be updated as part of VPlan execution. This allows keeping the DTU
    // logic generic during VPlan execution.
    State->CFG.DTU.applyUpdates(
        Updates: {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
  }
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Entry);
  // Generate code for the VPlan, in parts of the vector skeleton, loop body and
  // successor blocks including the middle, exit and scalar preheader blocks.
  for (VPBlockBase *Block : RPOT)
    Block->execute(State);

  if (hasEarlyExit()) {
    // Fix up LoopInfo for extra dispatch blocks when vectorizing loops with
    // early exits. For dispatch blocks, we need to find the smallest common
    // loop of all successors. Note: we only need to update loop info for blocks
    // after the middle block, but there is no easy way to get those at this
    // point.
    for (VPBlockBase *VPB : reverse(C&: RPOT)) {
      auto *VPBB = dyn_cast<VPBasicBlock>(Val: VPB);
      if (!VPBB || isa<VPIRBasicBlock>(Val: VPBB))
        continue;
      BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
      Loop *L = State->LI->getLoopFor(BB);
      // Skip blocks whose current loop still contains some successor; only
      // dispatch blocks with all successors outside their loop need moving.
      if (!L || any_of(Range: successors(BB),
                       P: [L](BasicBlock *Succ) { return L->contains(BB: Succ); }))
        continue;
      // Find the innermost loop containing all successors.
      Loop *Target = State->LI->getLoopFor(BB: *succ_begin(BB));
      for (BasicBlock *Succ : drop_begin(RangeOrContainer: successors(BB)))
        Target = State->LI->getSmallestCommonLoop(A: Target,
                                                  B: State->LI->getLoopFor(BB: Succ));
      State->LI->removeBlock(BB);
      if (Target)
        Target->addBasicBlockToLoop(NewBB: BB, LI&: *State->LI);
    }
  }

  // If the original loop is unreachable, delete it and all its blocks.
  if (!ScalarPhVPBB->hasPredecessors()) {
    // DeleteDeadBlocks will remove single-entry phis. Remove them from the exit
    // VPIRBBs in VPlan as well, otherwise we would retain references to deleted
    // IR instructions.
    for (VPIRBasicBlock *EB : getExitBlocks()) {
      for (VPRecipeBase &R : make_early_inc_range(Range: EB->phis())) {
        if (R.getNumOperands() == 1)
          R.eraseFromParent();
      }
    }

    Loop *OrigLoop =
        State->LI->getLoopFor(BB: getScalarHeader()->getIRBasicBlock());
    auto Blocks = OrigLoop->getBlocksVector();
    Blocks.push_back(x: cast<VPIRBasicBlock>(Val: ScalarPhVPBB)->getIRBasicBlock());
    for (auto *BB : Blocks)
      State->LI->removeBlock(BB);
    DeleteDeadBlocks(BBs: Blocks, DTU: &State->CFG.DTU);
    State->LI->erase(L: OrigLoop);
  }

  // Apply all queued dominator tree updates before fixing up header phis.
  State->CFG.DTU.flush();

  VPBasicBlock *Header = vputils::getFirstLoopHeader(Plan&: *this, VPDT&: State->VPDT);
  if (!Header)
    return;

  auto *LatchVPBB = cast<VPBasicBlock>(Val: Header->getPredecessors()[1]);
  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];

  // Fix the latch value of canonical, reduction and first-order recurrences
  // phis in the vector loop.
  for (VPRecipeBase &R : Header->phis()) {
    // Skip phi-like recipes that generate their backedge values themselves.
    if (isa<VPWidenPHIRecipe>(Val: &R))
      continue;

    auto *PhiR = cast<VPSingleDefRecipe>(Val: &R);
    // VPInstructions currently model scalar Phis only.
    bool NeedsScalar = isa<VPInstruction>(Val: PhiR) ||
                       (isa<VPReductionPHIRecipe>(Val: PhiR) &&
                        cast<VPReductionPHIRecipe>(Val: PhiR)->isInLoop());

    Value *Phi = State->get(Def: PhiR, NeedsScalar);
    // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does
    // not.
    Value *Val = State->get(Def: PhiR->getOperand(N: 1), NeedsScalar);
    cast<PHINode>(Val: Phi)->addIncoming(V: Val, BB: VectorLatchBB);
  }
}
1039
/// Compute the cost of this plan for vectorization factor \p VF: the cost of
/// the vector loop region, invalidated if any recipe in the skeleton is
/// invalid.
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
  // For now only return the cost of the vector loop region, ignoring any other
  // blocks, like the preheader or middle blocks, except for checking them for
  // recipes with invalid costs.
  InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);

  // If the cost of the loop region is invalid or any recipe in the skeleton
  // outside loop regions is invalid, return an invalid cost.
  if (!Cost.isValid() || any_of(Range: VPBlockUtils::blocksOnly<VPBasicBlock>(
                                    Range: vp_depth_first_shallow(G: getEntry())),
                                P: [&VF, &Ctx](VPBasicBlock *VPBB) {
                                  return !VPBB->cost(VF, Ctx).isValid();
                                }))
    return InstructionCost::getInvalid();

  return Cost;
}
1057
1058VPRegionBlock *VPlan::getVectorLoopRegion() {
1059 // TODO: Cache if possible.
1060 for (VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1061 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1062 return R->isReplicator() ? nullptr : R;
1063 return nullptr;
1064}
1065
1066const VPRegionBlock *VPlan::getVectorLoopRegion() const {
1067 for (const VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1068 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1069 return R->isReplicator() ? nullptr : R;
1070 return nullptr;
1071}
1072
1073#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the plan's symbolic live-in values (VF, UF, VFxUF, trip counts),
/// skipping those without users.
void VPlan::printLiveIns(raw_ostream &O) const {
  VPSlotTracker SlotTracker(this);

  if (VF.getNumUsers() > 0) {
    O << "\nLive-in ";
    VF.printAsOperand(O, SlotTracker);
    O << " = VF";
  }

  if (UF.getNumUsers() > 0) {
    O << "\nLive-in ";
    UF.printAsOperand(O, SlotTracker);
    O << " = UF";
  }

  if (VFxUF.getNumUsers() > 0) {
    O << "\nLive-in ";
    VFxUF.printAsOperand(O, SlotTracker);
    O << " = VF * UF";
  }

  if (VectorTripCount.getNumUsers() > 0) {
    O << "\nLive-in ";
    VectorTripCount.printAsOperand(O, SlotTracker);
    O << " = vector-trip-count";
  }

  // BackedgeTakenCount may not have been created; also skip it if unused.
  if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
    O << "\nLive-in ";
    BackedgeTakenCount->printAsOperand(O, SlotTracker);
    O << " = backedge-taken count";
  }

  O << "\n";
  if (TripCount) {
    // Only IR values are真 live-ins; symbolic trip counts are defined in-plan.
    if (isa<VPIRValue>(TripCount))
      O << "Live-in ";
    TripCount->printAsOperand(O, SlotTracker);
    O << " = original trip-count";
    O << "\n";
  }
}
1116
1117LLVM_DUMP_METHOD
1118void VPlan::print(raw_ostream &O) const {
1119 VPSlotTracker SlotTracker(this);
1120
1121 O << "VPlan '" << getName() << "' {";
1122
1123 printLiveIns(O);
1124
1125 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<const VPBlockBase *>>
1126 RPOT(getEntry());
1127 for (const VPBlockBase *Block : RPOT) {
1128 O << '\n';
1129 Block->print(O, "", SlotTracker);
1130 }
1131
1132 O << "}\n";
1133}
1134
1135std::string VPlan::getName() const {
1136 std::string Out;
1137 raw_string_ostream RSO(Out);
1138 RSO << Name << " for ";
1139 if (!VFs.empty()) {
1140 RSO << "VF={" << VFs[0];
1141 for (ElementCount VF : drop_begin(VFs))
1142 RSO << "," << VF;
1143 RSO << "},";
1144 }
1145
1146 if (UFs.empty()) {
1147 RSO << "UF>=1";
1148 } else {
1149 RSO << "UF={" << UFs[0];
1150 for (unsigned UF : drop_begin(UFs))
1151 RSO << "," << UF;
1152 RSO << "}";
1153 }
1154
1155 return Out;
1156}
1157
1158LLVM_DUMP_METHOD
1159void VPlan::printDOT(raw_ostream &O) const {
1160 VPlanPrinter Printer(O, *this);
1161 Printer.dump();
1162}
1163
LLVM_DUMP_METHOD
// Convenience wrapper: print the plan to the debug stream.
void VPlan::dump() const { print(dbgs()); }
1166#endif
1167
/// After cloning the blocks reachable from \p Entry into \p NewEntry, rewrite
/// the operands of all cloned recipes to refer to the cloned values recorded
/// in \p Old2NewVPValues.
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
                          DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
  // Update the operands of all cloned recipes starting at NewEntry. This
  // traverses all reachable blocks. This is done in two steps, to handle cycles
  // in PHI recipes.
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
      OldDeepRPOT(Entry);
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
      NewDeepRPOT(NewEntry);
  // First, collect all mappings from old to new VPValues defined by cloned
  // recipes. Both traversals visit blocks in the same order, so old and new
  // blocks/recipes pair up positionally.
  for (const auto &[OldBB, NewBB] :
       zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: OldDeepRPOT),
           u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: NewDeepRPOT))) {
    assert(OldBB->getRecipeList().size() == NewBB->getRecipeList().size() &&
           "blocks must have the same number of recipes");
    for (const auto &[OldR, NewR] : zip(t&: *OldBB, u&: *NewBB)) {
      assert(OldR.getNumOperands() == NewR.getNumOperands() &&
             "recipes must have the same number of operands");
      assert(OldR.getNumDefinedValues() == NewR.getNumDefinedValues() &&
             "recipes must define the same number of operands");
      for (const auto &[OldV, NewV] :
           zip(t: OldR.definedValues(), u: NewR.definedValues()))
        Old2NewVPValues[OldV] = NewV;
    }
  }

  // Update all operands to use cloned VPValues.
  for (VPBasicBlock *NewBB :
       VPBlockUtils::blocksOnly<VPBasicBlock>(Range: NewDeepRPOT)) {
    for (VPRecipeBase &NewR : *NewBB)
      for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
        VPValue *NewOp = Old2NewVPValues.lookup(Val: NewR.getOperand(N: I));
        NewR.setOperand(I, New: NewOp);
      }
  }
}
1205
/// Clone this plan: clone all reachable blocks and live-ins, remap operands to
/// the cloned values, and transfer ownership of the cloned blocks to the new
/// plan. Returns the new plan; the caller owns it.
VPlan *VPlan::duplicate() {
  // Remember how many blocks existed; cloneFrom appends its clones to
  // CreatedBlocks, and everything past this index belongs to the new plan.
  unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
  // Clone blocks.
  const auto &[NewEntry, __] = VPBlockUtils::cloneFrom(Entry);

  BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
  VPIRBasicBlock *NewScalarHeader = nullptr;
  if (getScalarHeader()->hasPredecessors()) {
    // The scalar header was reachable and thus cloned; find its clone by its
    // wrapped IR block.
    NewScalarHeader = cast<VPIRBasicBlock>(Val: *find_if(
        Range: vp_depth_first_shallow(G: NewEntry), P: [ScalarHeaderIRBB](VPBlockBase *VPB) {
          auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Val: VPB);
          return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
        }));
  } else {
    NewScalarHeader = createVPIRBasicBlock(IRBB: ScalarHeaderIRBB);
  }
  // Create VPlan, clone live-ins and remap operands in the cloned blocks.
  auto *NewPlan = new VPlan(cast<VPBasicBlock>(Val: NewEntry), NewScalarHeader);
  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
  for (VPIRValue *OldLiveIn : getLiveIns())
    Old2NewVPValues[OldLiveIn] = NewPlan->getOrAddLiveIn(V: OldLiveIn);
  // Map the plan-owned symbolic values to their counterparts in the new plan.
  Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
  Old2NewVPValues[&VF] = &NewPlan->VF;
  Old2NewVPValues[&UF] = &NewPlan->UF;
  Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
  if (BackedgeTakenCount) {
    NewPlan->BackedgeTakenCount = new VPSymbolicValue();
    Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
  }
  if (auto *TripCountIRV = dyn_cast_or_null<VPIRValue>(Val: TripCount))
    Old2NewVPValues[TripCountIRV] = NewPlan->getOrAddLiveIn(V: TripCountIRV);
  // else NewTripCount will be created and inserted into Old2NewVPValues when
  // TripCount is cloned. In any case NewPlan->TripCount is updated below.

  remapOperands(Entry, NewEntry, Old2NewVPValues);

  // Initialize remaining fields of cloned VPlan.
  NewPlan->VFs = VFs;
  NewPlan->UFs = UFs;
  // TODO: Adjust names.
  NewPlan->Name = Name;
  if (TripCount) {
    assert(Old2NewVPValues.contains(TripCount) &&
           "TripCount must have been added to Old2NewVPValues");
    NewPlan->TripCount = Old2NewVPValues[TripCount];
  }

  // Transfer all cloned blocks (the second half of all current blocks) from
  // current to new VPlan.
  unsigned NumBlocksAfterCloning = CreatedBlocks.size();
  for (unsigned I :
       seq<unsigned>(Begin: NumBlocksBeforeCloning, End: NumBlocksAfterCloning))
    NewPlan->CreatedBlocks.push_back(Elt: this->CreatedBlocks[I]);
  CreatedBlocks.truncate(N: NumBlocksBeforeCloning);

  // Update ExitBlocks of the new plan. Exit blocks are the successor-less
  // VPIRBasicBlocks, excluding the scalar header.
  for (VPBlockBase *VPB : NewPlan->CreatedBlocks) {
    if (VPB->getNumSuccessors() == 0 && isa<VPIRBasicBlock>(Val: VPB) &&
        VPB != NewScalarHeader)
      NewPlan->ExitBlocks.push_back(Elt: cast<VPIRBasicBlock>(Val: VPB));
  }

  return NewPlan;
}
1270
/// Create a VPIRBasicBlock wrapping \p IRBB with no recipes; the plan owns it.
VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
  auto *VPIRBB = new VPIRBasicBlock(IRBB);
  CreatedBlocks.push_back(Elt: VPIRBB);
  return VPIRBB;
}
1276
1277VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
1278 auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
1279 for (Instruction &I :
1280 make_range(x: IRBB->begin(), y: IRBB->getTerminator()->getIterator()))
1281 VPIRBB->appendRecipe(Recipe: VPIRInstruction::create(I));
1282 return VPIRBB;
1283}
1284
1285#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1286
1287Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
1288 return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
1289 Twine(getOrCreateBID(Block));
1290}
1291
1292Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
1293 const std::string &Name = Block->getName();
1294 if (!Name.empty())
1295 return Name;
1296 return "VPB" + Twine(getOrCreateBID(Block));
1297}
1298
/// Emit the whole plan as a dot digraph: graph header with the plan name and
/// live-ins in the label, then every top-level block.
void VPlanPrinter::dump() {
  Depth = 1;
  bumpIndent(0);
  OS << "digraph VPlan {\n";
  OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
  if (!Plan.getName().empty())
    OS << "\\n" << DOT::EscapeString(Plan.getName());

  {
    // Print live-ins. Render to a string first so each line can be escaped
    // and terminated with a dot line break ("\\n") individually.
    std::string Str;
    raw_string_ostream SS(Str);
    Plan.printLiveIns(SS);
    SmallVector<StringRef, 0> Lines;
    StringRef(Str).rtrim('\n').split(Lines, "\n");
    for (auto Line : Lines)
      OS << DOT::EscapeString(Line.str()) << "\\n";
  }

  OS << "\"]\n";
  OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
  OS << "edge [fontname=Courier, fontsize=30]\n";
  OS << "compound=true\n";

  // Dump top-level blocks; regions recurse into their contents.
  for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
    dumpBlock(Block);

  OS << "}\n";
}
1328
1329void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
1330 if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
1331 dumpBasicBlock(BasicBlock);
1332 else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1333 dumpRegion(Region);
1334 else
1335 llvm_unreachable("Unsupported kind of VPBlock.");
1336}
1337
/// Emit one dot edge from \p From to \p To with the given \p Label; \p Hidden
/// edges get "splines=none" so dot lays them out without drawing a route.
void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
                            bool Hidden, const Twine &Label) {
  // Due to "dot" we print an edge between two regions as an edge between the
  // exiting basic block and the entry basic of the respective regions.
  const VPBlockBase *Tail = From->getExitingBasicBlock();
  const VPBlockBase *Head = To->getEntryBasicBlock();
  OS << Indent << getUID(Tail) << " -> " << getUID(Head);
  OS << " [ label=\"" << Label << '\"';
  // ltail/lhead make dot clip the edge at the cluster boundary instead of the
  // inner node.
  if (Tail != From)
    OS << " ltail=" << getUID(From);
  if (Head != To)
    OS << " lhead=" << getUID(To);
  if (Hidden)
    OS << "; splines=none";
  OS << "]\n";
}
1354
1355void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
1356 auto &Successors = Block->getSuccessors();
1357 if (Successors.size() == 1)
1358 drawEdge(Block, Successors.front(), false, "");
1359 else if (Successors.size() == 2) {
1360 drawEdge(Block, Successors.front(), false, "T");
1361 drawEdge(Block, Successors.back(), false, "F");
1362 } else {
1363 unsigned SuccessorNumber = 0;
1364 for (auto *Successor : Successors)
1365 drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
1366 }
1367}
1368
/// Emit a dot node for \p BasicBlock whose label is the block's plain-text
/// dump, escaped and joined line-by-line.
void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
  // Implement dot-formatted dump by performing plain-text dump into the
  // temporary storage followed by some post-processing.
  OS << Indent << getUID(BasicBlock) << " [label =\n";
  bumpIndent(1);
  std::string Str;
  raw_string_ostream SS(Str);
  // Use no indentation as we need to wrap the lines into quotes ourselves.
  BasicBlock->print(SS, "", SlotTracker);

  // We need to process each line of the output separately, so split
  // single-string plain-text dump.
  SmallVector<StringRef, 0> Lines;
  StringRef(Str).rtrim('\n').split(Lines, "\n");

  // Each line becomes a quoted, escaped string ending in "\l" (dot's
  // left-justified line break), concatenated with "+".
  auto EmitLine = [&](StringRef Line, StringRef Suffix) {
    OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix;
  };

  // Don't need the "+" after the last line.
  for (auto Line : make_range(Lines.begin(), Lines.end() - 1))
    EmitLine(Line, " +\n");
  EmitLine(Lines.back(), "\n");

  bumpIndent(-1);
  OS << Indent << "]\n";

  dumpEdges(BasicBlock);
}
1398
/// Emit \p Region as a dot subgraph (cluster) containing its blocks, then its
/// outgoing edges.
void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
  OS << Indent << "subgraph " << getUID(Region) << " {\n";
  bumpIndent(1);
  OS << Indent << "fontname=Courier\n"
     << Indent << "label=\""
     << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
     << DOT::EscapeString(Region->getName()) << "\"\n";
  // Dump the blocks of the region.
  assert(Region->getEntry() && "Region contains no inner blocks.");
  for (const VPBlockBase *Block : vp_depth_first_shallow(Region->getEntry()))
    dumpBlock(Block);
  bumpIndent(-1);
  OS << Indent << "}\n";
  dumpEdges(Region);
}
1414
1415#endif
1416
/// Returns true if \p VPV is defined by a recipe and either its plan has no
/// vector loop region, or the defining recipe is inside a loop region.
static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
  const VPRecipeBase *DefR = VPV->getDefiningRecipe();
  return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
                  DefR->getParent()->getEnclosingLoopRegion());
}
1424
// Negation of isDefinedInsideLoopRegions; see its comment for the semantics.
bool VPValue::isDefinedOutsideLoopRegions() const {
  return !isDefinedInsideLoopRegions(VPV: this);
}
/// Replace every use of this value with \p New. Symbolic values are marked
/// materialized afterwards, as they no longer have any users.
void VPValue::replaceAllUsesWith(VPValue *New) {
  replaceUsesWithIf(New, ShouldReplace: [](VPUser &, unsigned) { return true; });
  if (auto *SV = dyn_cast<VPSymbolicValue>(Val: this))
    SV->markMaterialized();
}
1433
/// Replace uses of this value with \p New for which \p ShouldReplace returns
/// true. The user list shrinks as operands are rewritten, so the loop only
/// advances when the current user kept at least one use of this value.
void VPValue::replaceUsesWithIf(
    VPValue *New,
    llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) {
  assertNotMaterialized();
  // Note that this early exit is required for correctness; the implementation
  // below relies on the number of users for this VPValue to decrease, which
  // isn't the case if this == New.
  if (this == New)
    return;

  for (unsigned J = 0; J < getNumUsers();) {
    VPUser *User = Users[J];
    bool RemovedUser = false;
    // Rewrite every matching operand slot of this user.
    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
      if (User->getOperand(N: I) != this || !ShouldReplace(*User, I))
        continue;

      RemovedUser = true;
      User->setOperand(I, New);
    }
    // If a user got removed after updating the current user, the next user to
    // update will be moved to the current position, so we only need to
    // increment the index if the number of users did not change.
    if (!RemovedUser)
      J++;
  }
}
1461
1462void VPUser::replaceUsesOfWith(VPValue *From, VPValue *To) {
1463 for (unsigned Idx = 0; Idx != getNumOperands(); ++Idx) {
1464 if (getOperand(N: Idx) == From)
1465 setOperand(I: Idx, New: To);
1466 }
1467}
1468
1469#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the name assigned (or created on demand) by the slot tracker.
void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
  OS << Tracker.getOrCreateName(this);
}
1473
1474void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
1475 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1476 Op->printAsOperand(O, SlotTracker);
1477 });
1478}
1479#endif
1480
/// Assign a printable name to \p V. Values without an underlying IR value or
/// an explicit VPInstruction name get a numbered slot "vp<%N>"; otherwise the
/// underlying/explicit name is used, versioned on collision.
void VPSlotTracker::assignName(const VPValue *V) {
  assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
  auto *UV = V->getUnderlyingValue();
  auto *VPI = dyn_cast_or_null<VPInstruction>(Val: V);
  if (!UV && !(VPI && !VPI->getName().empty())) {
    // No usable name source: hand out the next numeric slot.
    VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str();
    NextSlot++;
    return;
  }

  // Use the name of the underlying Value, wrapped in "ir<>", and versioned by
  // appending ".Number" to the name if there are multiple uses.
  std::string Name;
  if (UV)
    Name = getName(V: UV);
  else
    Name = VPI->getName();

  assert(!Name.empty() && "Name cannot be empty.");
  StringRef Prefix = UV ? "ir<" : "vp<%";
  std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();

  // First assign the base name for V.
  const auto &[A, _] = VPValue2Name.try_emplace(Key: V, Args&: BaseName);
  // Integer or FP constants with different types will result in the same string
  // due to stripping types.
  if (isa<VPIRValue>(Val: V) && isa<ConstantInt, ConstantFP>(Val: UV))
    return;

  // If it is already used by C > 0 other VPValues, increase the version counter
  // C and use it for V.
  const auto &[C, UseInserted] = BaseName2Version.try_emplace(Key: BaseName, Args: 0);
  if (!UseInserted) {
    C->second++;
    A->second = (BaseName + Twine(".") + Twine(C->second)).str();
  }
}
1518
/// Assign names to all values of \p Plan: the symbolic live-ins first (only
/// those with users, except VectorTripCount which is always named), then every
/// value defined by a recipe, in reverse post-order over all blocks.
void VPSlotTracker::assignNames(const VPlan &Plan) {
  if (Plan.VF.getNumUsers() > 0)
    assignName(V: &Plan.VF);
  if (Plan.UF.getNumUsers() > 0)
    assignName(V: &Plan.UF);
  if (Plan.VFxUF.getNumUsers() > 0)
    assignName(V: &Plan.VFxUF);
  assignName(V: &Plan.VectorTripCount);
  if (Plan.BackedgeTakenCount)
    assignName(V: Plan.BackedgeTakenCount);
  for (VPValue *LI : Plan.getLiveIns())
    assignName(V: LI);

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
      RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
  for (const VPBasicBlock *VPBB :
       VPBlockUtils::blocksOnly<const VPBasicBlock>(Range: RPOT))
    assignNames(VPBB);
}
1538
1539void VPSlotTracker::assignNames(const VPBasicBlock *VPBB) {
1540 for (const VPRecipeBase &Recipe : *VPBB)
1541 for (VPValue *Def : Recipe.definedValues())
1542 assignName(V: Def);
1543}
1544
/// Return the printable operand name of IR value \p V. Named values and
/// non-instructions print directly; unnamed instructions need a lazily
/// created ModuleSlotTracker to get stable "%N" slot numbers.
std::string VPSlotTracker::getName(const Value *V) {
  std::string Name;
  raw_string_ostream S(Name);
  if (V->hasName() || !isa<Instruction>(Val: V)) {
    V->printAsOperand(O&: S, PrintType: false);
    return Name;
  }

  if (!MST) {
    // Lazily create the ModuleSlotTracker when we first hit an unnamed
    // instruction.
    auto *I = cast<Instruction>(Val: V);
    // This check is required to support unit tests with incomplete IR.
    if (I->getParent()) {
      MST = std::make_unique<ModuleSlotTracker>(args: I->getModule());
      MST->incorporateFunction(F: *I->getFunction());
    } else {
      MST = std::make_unique<ModuleSlotTracker>(args: nullptr);
    }
  }
  V->printAsOperand(O&: S, PrintType: false, MST&: *MST);
  return Name;
}
1568
/// Return the name previously assigned to \p V, or construct an ad-hoc name
/// for values not reachable from the tracked VPlan (e.g. when printing from a
/// debugger).
std::string VPSlotTracker::getOrCreateName(const VPValue *V) const {
  std::string Name = VPValue2Name.lookup(Val: V);
  if (!Name.empty())
    return Name;

  // If no name was assigned, no VPlan was provided when creating the slot
  // tracker or it is not reachable from the provided VPlan. This can happen,
  // e.g. when trying to print a recipe that has not been inserted into a VPlan
  // in a debugger.
  // TODO: Update VPSlotTracker constructor to assign names to recipes &
  // VPValues not associated with a VPlan, instead of constructing names ad-hoc
  // here.
  const VPRecipeBase *DefR = V->getDefiningRecipe();
  (void)DefR;
  assert((!DefR || !DefR->getParent() || !DefR->getParent()->getPlan()) &&
         "VPValue defined by a recipe in a VPlan?");

  // Use the underlying value's name, if there is one.
  if (auto *UV = V->getUnderlyingValue()) {
    std::string Name;
    raw_string_ostream S(Name);
    UV->printAsOperand(O&: S, PrintType: false);
    return (Twine("ir<") + Name + ">").str();
  }

  return "<badref>";
}
1596
1597bool LoopVectorizationPlanner::getDecisionAndClampRange(
1598 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
1599 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
1600 bool PredicateAtRangeStart = Predicate(Range.Start);
1601
1602 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
1603 if (Predicate(TmpVF) != PredicateAtRangeStart) {
1604 Range.End = TmpVF;
1605 break;
1606 }
1607
1608 return PredicateAtRangeStart;
1609}
1610
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlan(Range&: SubRange)) {
      VPlanTransforms::optimize(Plan&: *Plan);
      // Update the name of the latch of the top-level vector loop region
      // after optimizations which includes block folding.
      Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch");
      VPlans.push_back(Elt: std::move(Plan));
    }
    // Continue from wherever the decisions clamped the sub-range.
    VF = SubRange.End;
  }
}
1631
1632VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
1633 assert(count_if(VPlans,
1634 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
1635 1 &&
1636 "Multiple VPlans for VF.");
1637
1638 for (const VPlanPtr &Plan : VPlans) {
1639 if (Plan->hasVF(VF))
1640 return *Plan.get();
1641 }
1642 llvm_unreachable("No plan found!");
1643}
1644
/// Add "llvm.loop.unroll.runtime.disable" to \p L's loop metadata, unless the
/// loop already carries an unroll-disable hint. Existing metadata (except a
/// pre-existing runtime-disable entry) is preserved.
static void addRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(Elt: nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
      auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I));
      if (MD) {
        const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
        if (!S)
          continue;
        // Drop any existing runtime-disable entry; it is re-added below.
        if (S->getString().starts_with(Prefix: "llvm.loop.unroll.runtime.disable"))
          continue;
        IsUnrollMetadata =
            S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
      }
      MDs.push_back(Elt: LoopID->getOperand(I));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
    MDs.push_back(Elt: DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
1682
// Update loop metadata (followup IDs, unroll-disable, already-vectorized
// hints) and redistribute profile-derived trip counts between the new vector
// loop and the scalar remainder loop after vectorization.
void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo(
    Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan,
    bool VectorizingEpilogue, MDNode *OrigLoopID,
    std::optional<unsigned> OrigAverageTripCount,
    unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF,
    bool DisableRuntimeUnroll) {
  // Update the metadata of the scalar loop. Skip the update when vectorizing
  // the epilogue loop to ensure it is updated only once. Also skip the update
  // when the scalar loop became unreachable.
  if (Plan.getScalarPreheader()->hasPredecessors() && !VectorizingEpilogue) {
    std::optional<MDNode *> RemainderLoopID =
        makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
                                        LLVMLoopVectorizeFollowupEpilogue});
    if (RemainderLoopID) {
      // User-specified followup metadata takes precedence over anything this
      // pass would attach to the remainder loop.
      OrigLoop->setLoopID(*RemainderLoopID);
    } else {
      if (DisableRuntimeUnroll)
        addRuntimeUnrollDisableMetaData(L: OrigLoop);

      // Mark the scalar remainder loop as already vectorized so it is not
      // vectorized again.
      LoopVectorizeHints Hints(OrigLoop, true, *ORE);
      Hints.setAlreadyVectorized();
    }
  }

  // Nothing further to do when no vector loop was generated.
  if (!VectorLoop)
    return;

  if (std::optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(
          OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
                        LLVMLoopVectorizeFollowupVectorized})) {
    VectorLoop->setLoopID(*VectorizedLoopID);
  } else {
    // Keep all loop hints from the original loop on the vector loop (we'll
    // replace the vectorizer-specific hints below).
    if (OrigLoopID)
      VectorLoop->setLoopID(OrigLoopID);

    if (!VectorizingEpilogue) {
      LoopVectorizeHints Hints(VectorLoop, true, *ORE);
      Hints.setAlreadyVectorized();
    }
  }
  // Disable runtime unrolling of the vector loop unless the target's
  // unrolling preferences ask for it; epilogue vector loops are never
  // runtime-unrolled.
  TargetTransformInfo::UnrollingPreferences UP;
  TTI.getUnrollingPreferences(L: VectorLoop, *PSE.getSE(), UP, ORE);
  if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
    addRuntimeUnrollDisableMetaData(L: VectorLoop);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // becomes the scalar remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // use the value of vscale used for tuning.
  unsigned AverageVectorTripCount = 0;
  unsigned RemainderAverageTripCount = 0;
  auto EC = VectorLoop->getLoopPreheader()->getParent()->getEntryCount();
  auto IsProfiled = EC && EC->getCount();
  if (!OrigAverageTripCount) {
    // No profile-derived trip count for the original loop; fall back to a
    // compile-time-known constant trip count from SCEV, if any.
    if (!IsProfiled)
      return;
    auto &SE = *PSE.getSE();
    AverageVectorTripCount = SE.getSmallConstantTripCount(L: VectorLoop);
    if (ProfcheckDisableMetadataFixes || !AverageVectorTripCount)
      return;
    if (Plan.getScalarPreheader()->hasPredecessors())
      RemainderAverageTripCount =
          SE.getSmallConstantTripCount(L: OrigLoop) % EstimatedVFxUF;
    // Setting to 1 should be sufficient to generate the correct branch weights.
    OrigLoopInvocationWeight = 1;
  } else {
    // Calculate number of iterations in unrolled loop.
    AverageVectorTripCount = *OrigAverageTripCount / EstimatedVFxUF;
    // Calculate number of iterations for remainder loop.
    RemainderAverageTripCount = *OrigAverageTripCount % EstimatedVFxUF;
  }
  if (HeaderVPBB) {
    setLoopEstimatedTripCount(L: VectorLoop, EstimatedTripCount: AverageVectorTripCount,
                              EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
  }

  // The scalar remainder loop only gets weights when it is still reachable.
  if (Plan.getScalarPreheader()->hasPredecessors()) {
    setLoopEstimatedTripCount(L: OrigLoop, EstimatedTripCount: RemainderAverageTripCount,
                              EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
  }
}
1775
1776#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1777void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
1778 if (VPlans.empty()) {
1779 O << "LV: No VPlans built.\n";
1780 return;
1781 }
1782 for (const auto &Plan : VPlans)
1783 if (PrintVPlansInDotFormat)
1784 Plan->printDOT(O);
1785 else
1786 Plan->print(O);
1787}
1788#endif
1789
1790bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
1791 TTI::PartialReductionExtendKind ExtKind) {
1792 APInt TruncatedVal = C->trunc(width: NarrowType->getScalarSizeInBits());
1793 unsigned WideSize = C->getBitWidth();
1794 APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
1795 ? TruncatedVal.sext(width: WideSize)
1796 : TruncatedVal.zext(width: WideSize);
1797 return ExtendedVal == *C;
1798}
1799
1800TargetTransformInfo::OperandValueInfo
1801VPCostContext::getOperandInfo(VPValue *V) const {
1802 if (auto *IRV = dyn_cast<VPIRValue>(Val: V))
1803 return TTI::getOperandInfo(V: IRV->getValue());
1804
1805 return {};
1806}
1807
1808InstructionCost VPCostContext::getScalarizationOverhead(
1809 Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
1810 TTI::VectorInstrContext VIC, bool AlwaysIncludeReplicatingR) {
1811 if (VF.isScalar())
1812 return 0;
1813
1814 assert(!VF.isScalable() &&
1815 "Scalarization overhead not supported for scalable vectors");
1816
1817 InstructionCost ScalarizationCost = 0;
1818 // Compute the cost of scalarizing the result if needed.
1819 if (!ResultTy->isVoidTy()) {
1820 for (Type *VectorTy :
1821 to_vector(Range: getContainedTypes(Ty: toVectorizedTy(Ty: ResultTy, EC: VF)))) {
1822 ScalarizationCost += TTI.getScalarizationOverhead(
1823 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
1824 /*Insert=*/true, /*Extract=*/false, CostKind,
1825 /*ForPoisonSrc=*/true, VL: {}, VIC);
1826 }
1827 }
1828 // Compute the cost of scalarizing the operands, skipping ones that do not
1829 // require extraction/scalarization and do not incur any overhead.
1830 SmallPtrSet<const VPValue *, 4> UniqueOperands;
1831 SmallVector<Type *> Tys;
1832 for (auto *Op : Operands) {
1833 if (isa<VPIRValue>(Val: Op) ||
1834 (!AlwaysIncludeReplicatingR &&
1835 isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Val: Op)) ||
1836 (isa<VPReplicateRecipe>(Val: Op) &&
1837 cast<VPReplicateRecipe>(Val: Op)->getOpcode() == Instruction::Load) ||
1838 !UniqueOperands.insert(Ptr: Op).second)
1839 continue;
1840 Tys.push_back(Elt: toVectorizedTy(Ty: Types.inferScalarType(V: Op), EC: VF));
1841 }
1842 return ScalarizationCost +
1843 TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC);
1844}
1845
// Decide whether the emulated-masked-memref cost hack applies to the
// predicated memory access \p R at \p VF. Loads always take the hack path;
// for stores the decision depends on how many predicated, non-scatterable
// stores the whole VPlan contains (counted once and cached).
bool VPCostContext::useEmulatedMaskMemRefHack(const VPReplicateRecipe *R,
                                              ElementCount VF) {
  const Instruction *UI = R->getUnderlyingInstr();
  if (isa<LoadInst>(Val: UI))
    return true;
  assert(isa<StoreInst>(UI) && "R must either be a load or store");

  // NumPredStores is a std::optional cache; empty means not yet computed.
  if (!NumPredStores) {
    // Count the number of predicated stores in the VPlan, caching the result.
    // Only stores where scatter is not legal are counted, matching the legacy
    // cost model behavior.
    const VPlan &Plan = *R->getParent()->getPlan();
    NumPredStores = 0;
    // Walk the region blocks directly inside the vector loop region; the
    // assert below relies on these all being replicate regions.
    for (const VPRegionBlock *VPRB :
         VPBlockUtils::blocksOnly<const VPRegionBlock>(
             Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
      assert(VPRB->isReplicator() && "must only contain replicate regions");
      for (const VPBasicBlock *VPBB :
           VPBlockUtils::blocksOnly<const VPBasicBlock>(
               Range: vp_depth_first_shallow(G: VPRB->getEntry()))) {
        for (const VPRecipeBase &Recipe : *VPBB) {
          // Only replicate recipes wrapping a store are of interest.
          auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &Recipe);
          if (!RepR)
            continue;
          if (!isa<StoreInst>(Val: RepR->getUnderlyingInstr()))
            continue;
          // Check if scatter is legal for this store. If so, don't count it.
          Type *Ty = Types.inferScalarType(V: RepR->getOperand(N: 0));
          auto *VTy = VectorType::get(ElementType: Ty, EC: VF);
          const Align Alignment =
              getLoadStoreAlignment(I: RepR->getUnderlyingInstr());
          if (!TTI.isLegalMaskedScatter(DataType: VTy, Alignment))
            ++(*NumPredStores);
        }
      }
    }
  }
  // Apply the hack only when the cached count exceeds the threshold option.
  return *NumPredStores > NumberOfStoresToPredicate;
}
1885