1//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This is the LLVM vectorization plan. It represents a candidate for
11/// vectorization, allowing to plan and optimize how to vectorize a given loop
12/// before generating LLVM-IR.
13/// The vectorizer uses vectorization plans to estimate the costs of potential
14/// candidates and if profitable to execute the desired plan, generating vector
15/// LLVM-IR code.
16///
17//===----------------------------------------------------------------------===//
18
19#include "VPlan.h"
20#include "LoopVectorizationPlanner.h"
21#include "VPlanCFG.h"
22#include "VPlanDominatorTree.h"
23#include "VPlanHelpers.h"
24#include "VPlanPatternMatch.h"
25#include "VPlanTransforms.h"
26#include "VPlanUtils.h"
27#include "llvm/ADT/PostOrderIterator.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SmallVector.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/Twine.h"
32#include "llvm/Analysis/DomTreeUpdater.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/IR/BasicBlock.h"
36#include "llvm/IR/CFG.h"
37#include "llvm/IR/IRBuilder.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/Type.h"
41#include "llvm/IR/Value.h"
42#include "llvm/Support/Casting.h"
43#include "llvm/Support/CommandLine.h"
44#include "llvm/Support/Debug.h"
45#include "llvm/Support/GraphWriter.h"
46#include "llvm/Support/raw_ostream.h"
47#include "llvm/Transforms/Utils/BasicBlockUtils.h"
48#include "llvm/Transforms/Utils/LoopVersioning.h"
49#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
50#include <cassert>
51#include <string>
52
53using namespace llvm;
54using namespace llvm::VPlanPatternMatch;
55
56namespace llvm {
57extern cl::opt<bool> ProfcheckDisableMetadataFixes;
58} // namespace llvm
59
60/// @{
61/// Metadata attribute names
62const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
63const char LLVMLoopVectorizeFollowupVectorized[] =
64 "llvm.loop.vectorize.followup_vectorized";
65const char LLVMLoopVectorizeFollowupEpilogue[] =
66 "llvm.loop.vectorize.followup_epilogue";
67/// @}
68
69extern cl::opt<unsigned> ForceTargetInstructionCost;
70
71extern cl::opt<unsigned> NumberOfStoresToPredicate;
72
73static cl::opt<bool> PrintVPlansInDotFormat(
74 "vplan-print-in-dot-format", cl::Hidden,
75 cl::desc("Use dot format instead of plain text when dumping VPlans"));
76
77#define DEBUG_TYPE "loop-vectorize"
78
79#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
80raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
81 const VPBasicBlock *Parent = R.getParent();
82 VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr);
83 R.print(OS, "", SlotTracker);
84 return OS;
85}
86#endif
87
88Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
89 const ElementCount &VF) const {
90 switch (LaneKind) {
91 case VPLane::Kind::ScalableLast:
92 // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
93 return Builder.CreateSub(LHS: getRuntimeVF(B&: Builder, Ty: Builder.getInt32Ty(), VF),
94 RHS: Builder.getInt32(C: VF.getKnownMinValue() - Lane));
95 case VPLane::Kind::First:
96 return Builder.getInt64(C: Lane);
97 }
98 llvm_unreachable("Unknown lane kind");
99}
100
101#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
102void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
103 if (const VPRecipeBase *R = getDefiningRecipe())
104 R->print(OS, "", SlotTracker);
105 else
106 printAsOperand(OS, SlotTracker);
107}
108
109void VPValue::dump() const {
110 const VPRecipeBase *Instr = getDefiningRecipe();
111 VPSlotTracker SlotTracker(
112 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
113 print(dbgs(), SlotTracker);
114 dbgs() << "\n";
115}
116
117void VPRecipeBase::dump() const {
118 VPSlotTracker SlotTracker(getParent() ? getParent()->getPlan() : nullptr);
119 print(dbgs(), "", SlotTracker);
120 dbgs() << "\n";
121}
122#endif
123
124#if !defined(NDEBUG)
125bool VPRecipeValue::isDefinedBy(const VPDef *D) const {
126 return getDefiningRecipe() == D;
127}
128#endif
129
130VPRecipeBase *VPValue::getDefiningRecipe() {
131 auto *RecipeValue = dyn_cast<VPRecipeValue>(Val: this);
132 if (!RecipeValue)
133 return nullptr;
134 if (auto *MultiDef = dyn_cast<VPMultiDefValue>(Val: RecipeValue))
135 return MultiDef->getDef();
136 return static_cast<VPSingleDefRecipe *>(RecipeValue);
137}
138
139const VPRecipeBase *VPValue::getDefiningRecipe() const {
140 return const_cast<VPValue *>(this)->getDefiningRecipe();
141}
142
143Value *VPValue::getLiveInIRValue() const {
144 return cast<VPIRValue>(Val: this)->getValue();
145}
146
147Type *VPIRValue::getType() const { return getUnderlyingValue()->getType(); }
148
149Type *VPValue::getScalarType() const {
150 switch (getVPValueID()) {
151 case VPVIRValueSC:
152 return cast<VPIRValue>(Val: this)->getType();
153 case VPRegionValueSC:
154 return cast<VPRegionValue>(Val: this)->getType();
155 case VPVSymbolicSC:
156 return cast<VPSymbolicValue>(Val: this)->getType();
157 case VPVMultiDefValueSC:
158 case VPVSingleDefValueSC:
159 return cast<VPRecipeValue>(Val: this)->getScalarType();
160 }
161 llvm_unreachable("Unhandled VPValue subclass");
162}
163
164VPRecipeValue::~VPRecipeValue() {
165 assert(Users.empty() &&
166 "trying to delete a VPRecipeValue with remaining users");
167}
168
169VPSingleDefValue::VPSingleDefValue(VPSingleDefRecipe *Def, Value *UV, Type *Ty)
170 : VPRecipeValue(VPVSingleDefValueSC, UV, Ty) {
171 assert(Def && "VPSingleDefValue requires a defining recipe");
172 Def->addDefinedValue(V: this);
173}
174
175VPSingleDefValue::~VPSingleDefValue() {
176 getDefiningRecipe()->removeDefinedValue(V: this);
177}
178
179VPMultiDefValue::VPMultiDefValue(VPRecipeBase *Def, Value *UV, Type *Ty)
180 : VPRecipeValue(VPVMultiDefValueSC, UV, Ty), Def(Def) {
181 assert(Def && "VPMultiDefValue requires a defining recipe");
182 Def->addDefinedValue(V: this);
183}
184
185VPMultiDefValue::~VPMultiDefValue() {
186 getDefiningRecipe()->removeDefinedValue(V: this);
187}
188
189// Get the top-most entry block of \p Start. This is the entry block of the
190// containing VPlan. This function is templated to support both const and non-const blocks
191template <typename T> static T *getPlanEntry(T *Start) {
192 T *Next = Start;
193 T *Current = Start;
194 while ((Next = Next->getParent()))
195 Current = Next;
196
197 SmallSetVector<T *, 8> WorkList;
198 WorkList.insert(Current);
199
200 for (unsigned i = 0; i < WorkList.size(); i++) {
201 T *Current = WorkList[i];
202 if (!Current->hasPredecessors())
203 return Current;
204 auto &Predecessors = Current->getPredecessors();
205 WorkList.insert_range(Predecessors);
206 }
207
208 llvm_unreachable("VPlan without any entry node without predecessors");
209}
210
211VPlan *VPBlockBase::getPlan() { return getPlanEntry(Start: this)->Plan; }
212
213const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(Start: this)->Plan; }
214
215/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
216const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
217 const VPBlockBase *Block = this;
218 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
219 Block = Region->getEntry();
220 return cast<VPBasicBlock>(Val: Block);
221}
222
223VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
224 VPBlockBase *Block = this;
225 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
226 Block = Region->getEntry();
227 return cast<VPBasicBlock>(Val: Block);
228}
229
230void VPBlockBase::setPlan(VPlan *ParentPlan) {
231 assert(ParentPlan->getEntry() == this && "Can only set plan on its entry.");
232 Plan = ParentPlan;
233}
234
235/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
236const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
237 const VPBlockBase *Block = this;
238 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
239 Block = Region->getExiting();
240 return cast<VPBasicBlock>(Val: Block);
241}
242
243VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
244 VPBlockBase *Block = this;
245 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
246 Block = Region->getExiting();
247 return cast<VPBasicBlock>(Val: Block);
248}
249
250VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
251 if (!Successors.empty() || !Parent)
252 return this;
253 assert(Parent->getExiting() == this &&
254 "Block w/o successors not the exiting block of its parent.");
255 return Parent->getEnclosingBlockWithSuccessors();
256}
257
258VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
259 if (!Predecessors.empty() || !Parent)
260 return this;
261 assert(Parent->getEntry() == this &&
262 "Block w/o predecessors not the entry of its parent.");
263 return Parent->getEnclosingBlockWithPredecessors();
264}
265
266VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
267 iterator It = begin();
268 while (It != end() && It->isPhi())
269 It++;
270 return It;
271}
272
273VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
274 ElementCount VF, LoopInfo *LI,
275 DominatorTree *DT, AssumptionCache *AC,
276 IRBuilderBase &Builder, VPlan *Plan,
277 Loop *CurrentParentLoop)
278 : TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan),
279 CurrentParentLoop(CurrentParentLoop), VPDT(*Plan) {}
280
281Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
282 if (isa<VPIRValue, VPSymbolicValue>(Val: Def))
283 return Def->getUnderlyingValue();
284
285 if (hasScalarValue(Def, Lane))
286 return Data.VPV2Scalars[Def][Lane.mapToCacheIndex(VF)];
287
288 if (!Lane.isFirstLane() && vputils::isSingleScalar(VPV: Def) &&
289 hasScalarValue(Def, Lane: VPLane::getFirstLane())) {
290 return Data.VPV2Scalars[Def][0];
291 }
292
293 // Look through BuildVector to avoid redundant extracts.
294 // TODO: Remove once replicate regions are unrolled explicitly.
295 if (Lane.getKind() == VPLane::Kind::First && match(V: Def, P: m_BuildVector())) {
296 auto *BuildVector = cast<VPInstruction>(Val: Def);
297 return get(Def: BuildVector->getOperand(N: Lane.getKnownLane()), IsScalar: true);
298 }
299
300 assert(hasVectorValue(Def));
301 auto *VecPart = Data.VPV2Vector[Def];
302 if (!VecPart->getType()->isVectorTy()) {
303 assert(Lane.isFirstLane() && "cannot get lane > 0 for scalar");
304 return VecPart;
305 }
306 // TODO: Cache created scalar values.
307 Value *LaneV = Lane.getAsRuntimeExpr(Builder, VF);
308 auto *Extract = Builder.CreateExtractElement(Vec: VecPart, Idx: LaneV);
309 // set(Def, Extract, Instance);
310 return Extract;
311}
312
313Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
314 if (NeedsScalar) {
315 assert((VF.isScalar() || isa<VPIRValue, VPSymbolicValue>(Def) ||
316 hasVectorValue(Def) || !vputils::onlyFirstLaneUsed(Def) ||
317 (hasScalarValue(Def, VPLane(0)) &&
318 Data.VPV2Scalars[Def].size() == 1)) &&
319 "Trying to access a single scalar per part but has multiple scalars "
320 "per part.");
321 return get(Def, Lane: VPLane(0));
322 }
323
324 // If Values have been set for this Def return the one relevant for \p Part.
325 if (hasVectorValue(Def))
326 return Data.VPV2Vector[Def];
327
328 auto GetBroadcastInstrs = [this](Value *V) {
329 if (VF.isScalar())
330 return V;
331 // Broadcast the scalar into all locations in the vector.
332 Value *Shuf = Builder.CreateVectorSplat(EC: VF, V, Name: "broadcast");
333 return Shuf;
334 };
335
336 Value *ScalarValue = get(Def, Lane: VPLane(0));
337 VPLane LastLane = VPLane::getLastLaneForVF(VF);
338 IRBuilderBase::InsertPointGuard Guard(Builder);
339 if (auto *LastInst = dyn_cast<Instruction>(Val: get(Def, Lane: LastLane)))
340 // Set the insert point after the last scalarized instruction or after the
341 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
342 // will directly follow the scalar definitions.
343 Builder.SetInsertPoint(isa<PHINode>(Val: LastInst)
344 ? LastInst->getParent()->getFirstNonPHIIt()
345 : std::next(x: BasicBlock::iterator(LastInst)));
346 Value *VectorValue = GetBroadcastInstrs(ScalarValue);
347 set(Def, V: VectorValue);
348 return VectorValue;
349}
350
351void VPTransformState::setDebugLocFrom(DebugLoc DL) {
352 const DILocation *DIL = DL;
353 // When a FSDiscriminator is enabled, we don't need to add the multiply
354 // factors to the discriminators.
355 if (DIL &&
356 Builder.GetInsertBlock()
357 ->getParent()
358 ->shouldEmitDebugInfoForProfiling() &&
359 !EnableFSDiscriminator) {
360 // FIXME: For scalable vectors, assume vscale=1.
361 unsigned UF = Plan->getConcreteUF();
362 auto NewDIL =
363 DIL->cloneByMultiplyingDuplicationFactor(DF: UF * VF.getKnownMinValue());
364 if (NewDIL)
365 Builder.SetCurrentDebugLocation(*NewDIL);
366 else
367 LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
368 << DIL->getFilename() << " Line: " << DIL->getLine());
369 } else
370 Builder.SetCurrentDebugLocation(DL);
371}
372
373Value *VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,
374 Value *WideValue,
375 const VPLane &Lane) {
376 Value *ScalarInst = get(Def, Lane);
377 Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
378 if (auto *StructTy = dyn_cast<StructType>(Val: WideValue->getType())) {
379 // We must handle each element of a vectorized struct type.
380 for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
381 Value *ScalarValue = Builder.CreateExtractValue(Agg: ScalarInst, Idxs: I);
382 Value *VectorValue = Builder.CreateExtractValue(Agg: WideValue, Idxs: I);
383 VectorValue =
384 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneExpr);
385 WideValue = Builder.CreateInsertValue(Agg: WideValue, Val: VectorValue, Idxs: I);
386 }
387 } else {
388 WideValue = Builder.CreateInsertElement(Vec: WideValue, NewElt: ScalarInst, Idx: LaneExpr);
389 }
390 return WideValue;
391}
392
393BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
394 auto &CFG = State.CFG;
395 // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
396 // Pred stands for Predessor. Prev stands for Previous - last visited/created.
397 BasicBlock *PrevBB = CFG.PrevBB;
398 BasicBlock *NewBB = BasicBlock::Create(Context&: PrevBB->getContext(), Name: getName(),
399 Parent: PrevBB->getParent(), InsertBefore: CFG.ExitBB);
400 LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
401
402 return NewBB;
403}
404
405void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
406 auto &CFG = State.CFG;
407 BasicBlock *NewBB = CFG.VPBB2IRBB[this];
408
409 // Register NewBB in its loop. In innermost loops its the same for all
410 // BB's.
411 Loop *ParentLoop = State.CurrentParentLoop;
412 // If this block has a sole successor that is an exit block or is an exit
413 // block itself then it needs adding to the same parent loop as the exit
414 // block.
415 VPBlockBase *SuccOrExitVPB = getSingleSuccessor();
416 SuccOrExitVPB = SuccOrExitVPB ? SuccOrExitVPB : this;
417 if (State.Plan->isExitBlock(VPBB: SuccOrExitVPB)) {
418 ParentLoop = State.LI->getLoopFor(
419 BB: cast<VPIRBasicBlock>(Val: SuccOrExitVPB)->getIRBasicBlock());
420 }
421
422 if (ParentLoop && !State.LI->getLoopFor(BB: NewBB))
423 ParentLoop->addBasicBlockToLoop(NewBB, LI&: *State.LI);
424
425 SmallVector<VPBlockBase *> Preds;
426 if (VPBlockUtils::isHeader(VPB: this, VPDT: State.VPDT)) {
427 // There's no block for the latch yet, connect to the preheader only.
428 Preds = {getPredecessors()[0]};
429 } else {
430 Preds = to_vector(Range&: getPredecessors());
431 }
432
433 // Hook up the new basic block to its predecessors.
434 for (VPBlockBase *PredVPBlock : Preds) {
435 VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
436 auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
437 assert(CFG.VPBB2IRBB.contains(PredVPBB) &&
438 "Predecessor basic-block not found building successor.");
439 BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
440 auto *PredBBTerminator = PredBB->getTerminator();
441 LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');
442
443 if (isa<UnreachableInst>(Val: PredBBTerminator)) {
444 assert(PredVPSuccessors.size() == 1 &&
445 "Predecessor ending w/o branch must have single successor.");
446 DebugLoc DL = PredBBTerminator->getDebugLoc();
447 PredBBTerminator->eraseFromParent();
448 auto *Br = UncondBrInst::Create(Target: NewBB, InsertBefore: PredBB);
449 Br->setDebugLoc(DL);
450 } else if (auto *UBI = dyn_cast<UncondBrInst>(Val: PredBBTerminator)) {
451 UBI->setSuccessor(NewBB);
452 } else {
453 // Set each forward successor here when it is created, excluding
454 // backedges. A backward successor is set when the branch is created.
455 // Branches to VPIRBasicBlocks must have the same successors in VPlan as
456 // in the original IR, except when the predecessor is the entry block.
457 // This enables including SCEV and memory runtime check blocks in VPlan.
458 // TODO: Remove exception by modeling the terminator of entry block using
459 // BranchOnCond.
460 unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
461 auto *TermBr = cast<CondBrInst>(Val: PredBBTerminator);
462 assert((!TermBr->getSuccessor(idx) ||
463 (isa<VPIRBasicBlock>(this) &&
464 (TermBr->getSuccessor(idx) == NewBB ||
465 PredVPBlock == getPlan()->getEntry()))) &&
466 "Trying to reset an existing successor block.");
467 TermBr->setSuccessor(idx, NewSucc: NewBB);
468 }
469 CFG.DTU.applyUpdates(Updates: {{DominatorTree::Insert, PredBB, NewBB}});
470 }
471}
472
473void VPIRBasicBlock::execute(VPTransformState *State) {
474 assert(getHierarchicalSuccessors().size() <= 2 &&
475 "VPIRBasicBlock can have at most two successors at the moment!");
476 // Move completely disconnected blocks to their final position.
477 if (IRBB->hasNPredecessors(N: 0) && succ_begin(BB: IRBB) == succ_end(BB: IRBB))
478 IRBB->moveAfter(MovePos: State->CFG.PrevBB);
479 State->Builder.SetInsertPoint(IRBB->getTerminator());
480 State->CFG.PrevBB = IRBB;
481 State->CFG.VPBB2IRBB[this] = IRBB;
482 executeRecipes(State, BB: IRBB);
483 // Create a branch instruction to terminate IRBB if one was not created yet
484 // and is needed.
485 if (getSingleSuccessor() && isa<UnreachableInst>(Val: IRBB->getTerminator())) {
486 auto *Br = State->Builder.CreateBr(Dest: IRBB);
487 Br->setOperand(i_nocapture: 0, Val_nocapture: nullptr);
488 IRBB->getTerminator()->eraseFromParent();
489 } else {
490 assert((getNumSuccessors() == 0 ||
491 isa<UncondBrInst, CondBrInst>(IRBB->getTerminator())) &&
492 "other blocks must be terminated by a branch");
493 }
494
495 connectToPredecessors(State&: *State);
496}
497
498VPIRBasicBlock *VPIRBasicBlock::clone() {
499 auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB);
500 for (VPRecipeBase &R : Recipes)
501 NewBlock->appendRecipe(Recipe: R.clone());
502 return NewBlock;
503}
504
505void VPBasicBlock::execute(VPTransformState *State) {
506 if (VPBlockUtils::isHeader(VPB: this, VPDT: State->VPDT)) {
507 // Create and register the new vector loop.
508 Loop *PrevParentLoop = State->CurrentParentLoop;
509 State->CurrentParentLoop = State->LI->AllocateLoop();
510
511 // Insert the new loop into the loop nest and register the new basic blocks
512 // before calling any utilities such as SCEV that require valid LoopInfo.
513 if (PrevParentLoop)
514 PrevParentLoop->addChildLoop(NewChild: State->CurrentParentLoop);
515 else
516 State->LI->addTopLevelLoop(New: State->CurrentParentLoop);
517 }
518
519 // 1. Create an IR basic block.
520 BasicBlock *NewBB = createEmptyBasicBlock(State&: *State);
521
522 State->Builder.SetInsertPoint(NewBB);
523 // Temporarily terminate with unreachable until CFG is rewired.
524 UnreachableInst *Terminator = State->Builder.CreateUnreachable();
525 State->Builder.SetInsertPoint(Terminator);
526
527 State->CFG.PrevBB = NewBB;
528 State->CFG.VPBB2IRBB[this] = NewBB;
529 connectToPredecessors(State&: *State);
530
531 // 2. Fill the IR basic block with IR instructions.
532 executeRecipes(State, BB: NewBB);
533
534 // If this block is a latch, update CurrentParentLoop.
535 if (VPBlockUtils::isLatch(VPB: this, VPDT: State->VPDT))
536 State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
537}
538
539VPBasicBlock *VPBasicBlock::clone() {
540 auto *NewBlock = getPlan()->createVPBasicBlock(Name: getName());
541 for (VPRecipeBase &R : *this)
542 NewBlock->appendRecipe(Recipe: R.clone());
543 return NewBlock;
544}
545
546void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
547 LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB: " << getName()
548 << " in BB: " << BB->getName() << '\n');
549
550 State->CFG.PrevVPBB = this;
551
552 for (VPRecipeBase &Recipe : Recipes) {
553 State->setDebugLocFrom(Recipe.getDebugLoc());
554 Recipe.execute(State&: *State);
555 }
556
557 LLVM_DEBUG(dbgs() << "LV: filled BB: " << *BB);
558}
559
560VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
561 assert((SplitAt == end() || SplitAt->getParent() == this) &&
562 "can only split at a position in the same block");
563
564 // Create new empty block after the block to split.
565 auto *SplitBlock = getPlan()->createVPBasicBlock(Name: getName() + ".split");
566 VPBlockUtils::insertBlockAfter(NewBlock: SplitBlock, BlockPtr: this);
567
568 // If this is the exiting block, make the split the new exiting block.
569 auto *ParentRegion = getParent();
570 if (ParentRegion && ParentRegion->getExiting() == this)
571 ParentRegion->setExiting(SplitBlock);
572
573 // Finally, move the recipes starting at SplitAt to new block.
574 for (VPRecipeBase &ToMove :
575 make_early_inc_range(Range: make_range(x: SplitAt, y: this->end())))
576 ToMove.moveBefore(BB&: *SplitBlock, I: SplitBlock->end());
577
578 return SplitBlock;
579}
580
581/// Return the enclosing loop region for region \p P. The templated version is
582/// used to support both const and non-const block arguments.
583template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
584 if (P && P->isReplicator()) {
585 P = P->getParent();
586 // Multiple loop regions can be nested, but replicate regions can only be
587 // nested inside a loop region or must be outside any other region.
588 assert((!P || !P->isReplicator()) && "unexpected nested replicate regions");
589 }
590 return P;
591}
592
593VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
594 return getEnclosingLoopRegionForRegion(P: getParent());
595}
596
597const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
598 return getEnclosingLoopRegionForRegion(P: getParent());
599}
600
601static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
602 if (VPBB->empty()) {
603 assert(
604 VPBB->getNumSuccessors() < 2 &&
605 "block with multiple successors doesn't have a recipe as terminator");
606 return false;
607 }
608
609 const VPRecipeBase *R = &VPBB->back();
610 [[maybe_unused]] bool IsSwitch =
611 isa<VPInstruction>(Val: R) &&
612 cast<VPInstruction>(Val: R)->getOpcode() == Instruction::Switch;
613 [[maybe_unused]] bool IsBranchOnTwoConds = match(V: R, P: m_BranchOnTwoConds());
614 [[maybe_unused]] bool IsCondBranch =
615 isa<VPBranchOnMaskRecipe>(Val: R) ||
616 match(V: R, P: m_CombineOr(Ps: m_BranchOnCond(), Ps: m_BranchOnCount()));
617 if (VPBB->getNumSuccessors() == 2 ||
618 (VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
619 assert((IsCondBranch || IsSwitch || IsBranchOnTwoConds) &&
620 "block with multiple successors not terminated by "
621 "conditional branch nor switch recipe");
622
623 return true;
624 }
625
626 if (VPBB->getNumSuccessors() > 2) {
627 assert((IsSwitch || IsBranchOnTwoConds) &&
628 "block with more than 2 successors not terminated by a switch or "
629 "branch-on-two-conds recipe");
630 return true;
631 }
632
633 assert(
634 !IsCondBranch && !IsBranchOnTwoConds &&
635 "block with 0 or 1 successors terminated by conditional branch recipe");
636 return false;
637}
638
639VPRecipeBase *VPBasicBlock::getTerminator() {
640 if (hasConditionalTerminator(VPBB: this))
641 return &back();
642 return nullptr;
643}
644
645const VPRecipeBase *VPBasicBlock::getTerminator() const {
646 if (hasConditionalTerminator(VPBB: this))
647 return &back();
648 return nullptr;
649}
650
651bool VPBasicBlock::isExiting() const {
652 return getParent() && getParent()->getExitingBasicBlock() == this;
653}
654
655#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
656void VPBlockBase::print(raw_ostream &O) const {
657 VPSlotTracker SlotTracker(getPlan());
658 print(O, "", SlotTracker);
659}
660
661void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
662 if (!hasSuccessors()) {
663 O << Indent << "No successors\n";
664 } else {
665 O << Indent << "Successor(s): ";
666 ListSeparator LS;
667 for (auto *Succ : getSuccessors())
668 O << LS << Succ->getName();
669 O << '\n';
670 }
671}
672
673void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
674 VPSlotTracker &SlotTracker) const {
675 O << Indent << getName() << ":\n";
676
677 auto RecipeIndent = Indent + " ";
678 for (const VPRecipeBase &Recipe : *this) {
679 Recipe.print(O, RecipeIndent, SlotTracker);
680 O << '\n';
681 }
682
683 printSuccessors(O, Indent);
684}
685#endif
686
687std::pair<VPBlockBase *, VPBlockBase *>
688VPBlockUtils::cloneFrom(VPBlockBase *Entry) {
689 DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
690 VPBlockBase *Exiting = nullptr;
691 bool InRegion = Entry->getParent();
692 // First, clone blocks reachable from Entry.
693 for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
694 VPBlockBase *NewBB = BB->clone();
695 Old2NewVPBlocks[BB] = NewBB;
696 if (InRegion && BB->getNumSuccessors() == 0) {
697 assert(!Exiting && "Multiple exiting blocks?");
698 Exiting = BB;
699 }
700 }
701 assert((!InRegion || Exiting) && "regions must have a single exiting block");
702
703 // Second, update the predecessors & successors of the cloned blocks.
704 for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
705 VPBlockBase *NewBB = Old2NewVPBlocks[BB];
706 SmallVector<VPBlockBase *> NewPreds;
707 for (VPBlockBase *Pred : BB->getPredecessors()) {
708 NewPreds.push_back(Elt: Old2NewVPBlocks[Pred]);
709 }
710 NewBB->setPredecessors(NewPreds);
711 SmallVector<VPBlockBase *> NewSuccs;
712 for (VPBlockBase *Succ : BB->successors()) {
713 NewSuccs.push_back(Elt: Old2NewVPBlocks[Succ]);
714 }
715 NewBB->setSuccessors(NewSuccs);
716 }
717
718#if !defined(NDEBUG)
719 // Verify that the order of predecessors and successors matches in the cloned
720 // version.
721 for (const auto &[OldBB, NewBB] :
722 zip(vp_depth_first_shallow(Entry),
723 vp_depth_first_shallow(Old2NewVPBlocks[Entry]))) {
724 for (const auto &[OldPred, NewPred] :
725 zip(OldBB->getPredecessors(), NewBB->getPredecessors()))
726 assert(NewPred == Old2NewVPBlocks[OldPred] && "Different predecessors");
727
728 for (const auto &[OldSucc, NewSucc] :
729 zip(OldBB->successors(), NewBB->successors()))
730 assert(NewSucc == Old2NewVPBlocks[OldSucc] && "Different successors");
731 }
732#endif
733
734 return std::make_pair(x&: Old2NewVPBlocks[Entry],
735 y: Exiting ? Old2NewVPBlocks[Exiting] : nullptr);
736}
737
738VPRegionBlock *VPRegionBlock::clone() {
739 const auto &[NewEntry, NewExiting] = VPBlockUtils::cloneFrom(Entry: getEntry());
740 VPlan &Plan = *getPlan();
741 VPRegionValue *CanIV = getCanonicalIV();
742 VPRegionBlock *NewRegion =
743 CanIV ? Plan.createLoopRegion(CanIVTy: CanIV->getType(), DL: CanIV->getDebugLoc(),
744 Name: getName(), Entry: NewEntry, Exiting: NewExiting)
745 : Plan.createReplicateRegion(Entry: NewEntry, Exiting: NewExiting, Name: getName());
746
747 for (VPBlockBase *Block : vp_depth_first_shallow(G: NewEntry))
748 Block->setParent(NewRegion);
749 return NewRegion;
750}
751
752void VPRegionBlock::execute(VPTransformState *State) {
753 llvm_unreachable("regions must get dissolved before ::execute");
754}
755
756InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
757 InstructionCost Cost = 0;
758 for (VPRecipeBase &R : Recipes)
759 Cost += R.cost(VF, Ctx);
760 return Cost;
761}
762
763const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
764 const VPBlockBase *Pred = nullptr;
765 if (hasPredecessors()) {
766 Pred = getPredecessors()[Idx];
767 } else {
768 auto *Region = getParent();
769 assert(Region && !Region->isReplicator() && Region->getEntry() == this &&
770 "must be in the entry block of a non-replicate region");
771 assert(Idx < 2 && Region->getNumPredecessors() == 1 &&
772 "loop region has a single predecessor (preheader), its entry block "
773 "has 2 incoming blocks");
774
775 // Idx == 0 selects the predecessor of the region, Idx == 1 selects the
776 // region itself whose exiting block feeds the phi across the backedge.
777 Pred = Idx == 0 ? Region->getSinglePredecessor() : Region;
778 }
779 return Pred->getExitingBasicBlock();
780}
781
782InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
783 if (!isReplicator()) {
784 // Neglect the cost of canonical IV, matching the legacy cost model.
785 InstructionCost Cost = 0;
786 for (VPBlockBase *Block : vp_depth_first_shallow(G: getEntry()))
787 Cost += Block->cost(VF, Ctx);
788 InstructionCost BackedgeCost =
789 ForceTargetInstructionCost.getNumOccurrences()
790 ? InstructionCost(ForceTargetInstructionCost)
791 : Ctx.TTI.getCFInstrCost(Opcode: Instruction::UncondBr, CostKind: Ctx.CostKind);
792 LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
793 << ": vector loop backedge\n");
794 Cost += BackedgeCost;
795 return Cost;
796 }
797
798 // Compute the cost of a replicate region. Replicating isn't supported for
799 // scalable vectors, return an invalid cost for them.
800 // TODO: Discard scalable VPlans with replicate recipes earlier after
801 // construction.
802 if (VF.isScalable())
803 return InstructionCost::getInvalid();
804
805 // Compute and return the cost of the conditionally executed recipes.
806 assert(VF.isVector() && "Can only compute vector cost at the moment.");
807 VPBasicBlock *Then = cast<VPBasicBlock>(Val: getEntry()->getSuccessors()[0]);
808 return Then->cost(VF, Ctx);
809}
810
811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
812void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
813 VPSlotTracker &SlotTracker) const {
814 O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
815 auto NewIndent = Indent + " ";
816 if (auto *CanIV = getCanonicalIV()) {
817 O << '\n';
818 CanIV->print(O, SlotTracker);
819 O << " = CANONICAL-IV\n";
820 }
821 for (auto *BlockBase : vp_depth_first_shallow(Entry)) {
822 O << '\n';
823 BlockBase->print(O, NewIndent, SlotTracker);
824 }
825 O << Indent << "}\n";
826
827 printSuccessors(O, Indent);
828}
829#endif
830
831void VPRegionBlock::dissolveToCFGLoop() {
832 auto *Header = cast<VPBasicBlock>(Val: getEntry());
833 auto *ExitingLatch = cast<VPBasicBlock>(Val: getExiting());
834 auto *CanIV = getCanonicalIV();
835 if (!CanIV->user_empty()) {
836 VPlan &Plan = *getPlan();
837 auto *Zero = Plan.getZero(Ty: CanIV->getType());
838 DebugLoc DL = CanIV->getDebugLoc();
839 VPInstruction *CanIVInc = getOrCreateCanonicalIVIncrement();
840 VPBuilder HeaderBuilder(Header, Header->begin());
841 auto *ScalarR =
842 HeaderBuilder.createScalarPhi(IncomingValues: {Zero, CanIVInc}, DL, Name: "index");
843 CanIV->replaceAllUsesWith(New: ScalarR);
844 }
845
846 VPBlockBase *Preheader = getSinglePredecessor();
847 VPBlockUtils::disconnectBlocks(From: Preheader, To: this);
848
849 for (VPBlockBase *VPB : vp_depth_first_shallow(G: Entry))
850 VPB->setParent(getParent());
851
852 VPBlockUtils::connectBlocks(From: Preheader, To: Header);
853 VPBlockUtils::transferSuccessors(Old: this, New: ExitingLatch);
854 VPBlockUtils::connectBlocks(From: ExitingLatch, To: Header);
855}
856
857VPInstruction *VPRegionBlock::getOrCreateCanonicalIVIncrement() {
858 // TODO: Represent the increment as VPRegionValue as well.
859 VPRegionValue *CanIV = getCanonicalIV();
860 assert(CanIV && "Expected a canonical IV");
861
862 if (auto *Inc = vputils::findCanonicalIVIncrement(Plan&: *getPlan()))
863 return Inc;
864
865 assert(!getPlan()->getVFxUF().isMaterialized() &&
866 "VFxUF can be used only before it is materialized.");
867 auto *ExitingLatch = cast<VPBasicBlock>(Val: getExiting());
868 return VPBuilder(ExitingLatch->getTerminator())
869 .createOverflowingOp(Opcode: Instruction::Add, Operands: {CanIV, &getPlan()->getVFxUF()},
870 WrapFlags: {hasCanonicalIVNUW(), /* HasNSW */ false},
871 DL: CanIV->getDebugLoc(), Name: "index.next");
872}
873
874VPlan::VPlan(Loop *L, Type *IdxTy)
875 : VectorTripCount(IdxTy), VF(IdxTy), UF(IdxTy), VFxUF(IdxTy) {
876 setEntry(createVPIRBasicBlock(IRBB: L->getLoopPreheader()));
877 ScalarHeader = createVPIRBasicBlock(IRBB: L->getHeader());
878
879 SmallVector<BasicBlock *> IRExitBlocks;
880 L->getUniqueExitBlocks(ExitBlocks&: IRExitBlocks);
881 for (BasicBlock *EB : IRExitBlocks)
882 ExitBlocks.push_back(Elt: createVPIRBasicBlock(IRBB: EB));
883}
884
885VPlan::~VPlan() {
886 VPSymbolicValue DummyValue(nullptr);
887
888 // Redirect all recipe operands to DummyValue before deleting blocks.
889 for (VPBasicBlock *VPBB :
890 VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: CreatedBlocks))
891 for (VPRecipeBase &R : *VPBB)
892 for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
893 R.setOperand(I, New: &DummyValue);
894
895 for (auto *VPB : CreatedBlocks)
896 delete VPB;
897 for (VPValue *VPV : getLiveIns())
898 delete VPV;
899 delete BackedgeTakenCount;
900}
901
902bool VPlan::isExitBlock(VPBlockBase *VPBB) {
903 return is_contained(Range&: ExitBlocks, Element: VPBB);
904}
905
906/// To make RUN_VPLAN_PASS print final VPlan.
907static void printFinalVPlan(VPlan &) {}
908
909/// Generate the code inside the preheader and body of the vectorized loop.
910/// Assumes a single pre-header basic-block was created for this. Introduce
911/// additional basic-blocks as needed, and fill them all.
912void VPlan::execute(VPTransformState *State) {
913 assert(none_of(vp_depth_first_shallow(getEntry()), IsaPred<VPRegionBlock>) &&
914 "all region blocks must be dissolved before ::execute");
915
916 // Initialize CFG state.
917 State->CFG.PrevVPBB = nullptr;
918 State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
919
920 // Update VPDominatorTree since VPBasicBlock may be removed after State was
921 // constructed.
922 State->VPDT.recalculate(Func&: *this);
923
924 // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
925 BasicBlock *VectorPreHeader = State->CFG.PrevBB;
926 cast<UncondBrInst>(Val: VectorPreHeader->getTerminator())->setSuccessor(nullptr);
927 State->CFG.DTU.applyUpdates(
928 Updates: {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
929
930 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
931 << ", UF=" << getConcreteUF() << '\n');
932 setName("Final VPlan");
933 // TODO: RUN_VPLAN_PASS/VPlanTransforms::runPass should automatically dump
934 // VPlans after some specific stages when "-debug" is specified, but that
935 // hasn't been implemented yet. For now, just do both:
936 LLVM_DEBUG(dump());
937 RUN_VPLAN_PASS(printFinalVPlan, *this);
938
939 BasicBlock *ScalarPh = State->CFG.ExitBB;
940 VPBasicBlock *ScalarPhVPBB = getScalarPreheader();
941 if (ScalarPhVPBB) {
942 // Disconnect scalar preheader and scalar header, as the dominator tree edge
943 // will be updated as part of VPlan execution. This allows keeping the DTU
944 // logic generic during VPlan execution.
945 State->CFG.DTU.applyUpdates(
946 Updates: {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
947 }
948 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
949 Entry);
950 // Generate code for the VPlan, in parts of the vector skeleton, loop body and
951 // successor blocks including the middle, exit and scalar preheader blocks.
952 for (VPBlockBase *Block : RPOT)
953 Block->execute(State);
954
955 if (hasEarlyExit()) {
956 // Fix up LoopInfo for extra dispatch blocks when vectorizing loops with
957 // early exits. For dispatch blocks, we need to find the smallest common
958 // loop of all successors that are in a loop. Note: we only need to update
959 // loop info for blocks after the middle block, but there is no easy way to
960 // get those at this point.
961 for (VPBlockBase *VPB : reverse(C&: RPOT)) {
962 auto *VPBB = dyn_cast<VPBasicBlock>(Val: VPB);
963 if (!VPBB || isa<VPIRBasicBlock>(Val: VPBB))
964 continue;
965 BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
966 Loop *L = State->LI->getLoopFor(BB);
967 if (!L || any_of(Range: successors(BB),
968 P: [L](BasicBlock *Succ) { return L->contains(BB: Succ); }))
969 continue;
970 // Find the innermost loop containing all successors that are in a loop.
971 // Successors not in any loop don't constrain the target loop.
972 Loop *Target = nullptr;
973 for (BasicBlock *Succ : successors(BB)) {
974 Loop *SuccLoop = State->LI->getLoopFor(BB: Succ);
975 if (!SuccLoop)
976 continue;
977 if (!Target)
978 Target = SuccLoop;
979 else
980 Target = State->LI->getSmallestCommonLoop(A: Target, B: SuccLoop);
981 }
982 State->LI->removeBlock(BB);
983 if (Target)
984 Target->addBasicBlockToLoop(NewBB: BB, LI&: *State->LI);
985 }
986 }
987
988 // If the original loop is unreachable, delete it and all its blocks.
989 if (!ScalarPhVPBB) {
990 // DeleteDeadBlocks will remove single-entry phis. Remove them from the exit
991 // VPIRBBs in VPlan as well, otherwise we would retain references to deleted
992 // IR instructions.
993 for (VPIRBasicBlock *EB : getExitBlocks()) {
994 for (VPRecipeBase &R : make_early_inc_range(Range: EB->phis())) {
995 if (R.getNumOperands() == 1)
996 R.eraseFromParent();
997 }
998 }
999
1000 Loop *OrigLoop =
1001 State->LI->getLoopFor(BB: getScalarHeader()->getIRBasicBlock());
1002 auto Blocks = OrigLoop->getBlocksVector();
1003 Blocks.push_back(x: ScalarPh);
1004 while (!OrigLoop->isInnermost())
1005 State->LI->erase(L: *OrigLoop->begin());
1006 State->LI->erase(L: OrigLoop);
1007 for (auto *BB : Blocks)
1008 State->LI->removeBlock(BB);
1009 DeleteDeadBlocks(BBs: Blocks, DTU: &State->CFG.DTU);
1010 }
1011
1012 State->CFG.DTU.flush();
1013
1014 // Fix the latch (backedge) value of all header phis in all loop headers.
1015 for (VPBlockBase *VPB : vp_depth_first_shallow(G: getEntry())) {
1016 if (!VPBlockUtils::isHeader(VPB, VPDT: State->VPDT))
1017 continue;
1018 auto *Header = cast<VPBasicBlock>(Val: VPB);
1019 auto *LatchVPBB = cast<VPBasicBlock>(Val: Header->getPredecessors()[1]);
1020 BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
1021
1022 for (VPRecipeBase &R : Header->phis()) {
1023 auto *PhiR = cast<VPSingleDefRecipe>(Val: &R);
1024 bool NeedsScalar =
1025 isa<VPPhi>(Val: PhiR) || (isa<VPReductionPHIRecipe>(Val: PhiR) &&
1026 cast<VPReductionPHIRecipe>(Val: PhiR)->isInLoop());
1027
1028 Value *Phi = State->get(Def: PhiR, NeedsScalar);
1029 Value *Val = State->get(Def: PhiR->getOperand(N: 1), NeedsScalar);
1030 cast<PHINode>(Val: Phi)->addIncoming(V: Val, BB: VectorLatchBB);
1031 }
1032 }
1033}
1034
1035InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
1036 // For now only return the cost of the vector loop region, ignoring any other
1037 // blocks, like the preheader or middle blocks, expect for checking them for
1038 // recipes with invalid costs.
1039 InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
1040
1041 // If the cost of the loop region is invalid or any recipe in the skeleton
1042 // outside loop regions are invalid return an invalid cost.
1043 if (!Cost.isValid() || any_of(Range: VPBlockUtils::blocksOnly<VPBasicBlock>(
1044 Range: vp_depth_first_shallow(G: getEntry())),
1045 P: [&VF, &Ctx](VPBasicBlock *VPBB) {
1046 return !VPBB->cost(VF, Ctx).isValid();
1047 }))
1048 return InstructionCost::getInvalid();
1049
1050 return Cost;
1051}
1052
1053VPRegionBlock *VPlan::getVectorLoopRegion() {
1054 // TODO: Cache if possible.
1055 for (VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1056 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1057 return R->isReplicator() ? nullptr : R;
1058 return nullptr;
1059}
1060
1061const VPRegionBlock *VPlan::getVectorLoopRegion() const {
1062 for (const VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1063 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1064 return R->isReplicator() ? nullptr : R;
1065 return nullptr;
1066}
1067
1068bool VPlan::isOuterLoop() const {
1069 const VPRegionBlock *LoopRegion = getVectorLoopRegion();
1070 assert(LoopRegion && "expected a vector loop region");
1071 return any_of(Range: VPBlockUtils::blocksOnly<const VPRegionBlock>(
1072 Range: vp_depth_first_shallow(G: LoopRegion->getEntry())),
1073 P: [](const VPRegionBlock *R) { return !R->isReplicator(); });
1074}
1075
1076#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1077void VPlan::printLiveIns(raw_ostream &O) const {
1078 VPSlotTracker SlotTracker(this);
1079
1080 if (!VF.user_empty()) {
1081 O << "\nLive-in ";
1082 VF.printAsOperand(O, SlotTracker);
1083 O << " = VF";
1084 }
1085
1086 if (!UF.user_empty()) {
1087 O << "\nLive-in ";
1088 UF.printAsOperand(O, SlotTracker);
1089 O << " = UF";
1090 }
1091
1092 if (!VFxUF.user_empty()) {
1093 O << "\nLive-in ";
1094 VFxUF.printAsOperand(O, SlotTracker);
1095 O << " = VF * UF";
1096 }
1097
1098 if (!VectorTripCount.user_empty()) {
1099 O << "\nLive-in ";
1100 VectorTripCount.printAsOperand(O, SlotTracker);
1101 O << " = vector-trip-count";
1102 }
1103
1104 if (BackedgeTakenCount && !BackedgeTakenCount->user_empty()) {
1105 O << "\nLive-in ";
1106 BackedgeTakenCount->printAsOperand(O, SlotTracker);
1107 O << " = backedge-taken count";
1108 }
1109
1110 O << "\n";
1111 if (TripCount && !TripCount->user_empty()) {
1112 if (isa<VPIRValue>(TripCount))
1113 O << "Live-in ";
1114 TripCount->printAsOperand(O, SlotTracker);
1115 O << " = original trip-count";
1116 O << "\n";
1117 }
1118}
1119
1120LLVM_DUMP_METHOD
1121void VPlan::print(raw_ostream &O) const {
1122 VPSlotTracker SlotTracker(this);
1123
1124 O << "VPlan '" << getName() << "' {";
1125
1126 printLiveIns(O);
1127
1128 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<const VPBlockBase *>>
1129 RPOT(getEntry());
1130 for (const VPBlockBase *Block : RPOT) {
1131 O << '\n';
1132 Block->print(O, "", SlotTracker);
1133 }
1134
1135 O << "}\n";
1136}
1137
1138std::string VPlan::getName() const {
1139 std::string Out;
1140 raw_string_ostream RSO(Out);
1141 RSO << Name << " for ";
1142 if (!VFs.empty()) {
1143 RSO << "VF={" << VFs[0];
1144 for (ElementCount VF : drop_begin(VFs))
1145 RSO << "," << VF;
1146 RSO << "},";
1147 }
1148
1149 if (UFs.empty()) {
1150 RSO << "UF>=1";
1151 } else {
1152 RSO << "UF={" << UFs[0];
1153 for (unsigned UF : drop_begin(UFs))
1154 RSO << "," << UF;
1155 RSO << "}";
1156 }
1157
1158 return Out;
1159}
1160
1161LLVM_DUMP_METHOD
1162void VPlan::printDOT(raw_ostream &O) const {
1163 VPlanPrinter Printer(O, *this);
1164 Printer.dump();
1165}
1166
1167LLVM_DUMP_METHOD
1168void VPlan::dump() const { print(dbgs()); }
1169#endif
1170
1171static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
1172 DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
1173 // Update the operands of all cloned recipes starting at NewEntry. This
1174 // traverses all reachable blocks. This is done in two steps, to handle cycles
1175 // in PHI recipes.
1176 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
1177 OldDeepRPOT(Entry);
1178 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
1179 NewDeepRPOT(NewEntry);
1180 // First, collect all mappings from old to new VPValues defined by cloned
1181 // recipes.
1182 for (const auto &[OldBB, NewBB] :
1183 zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: OldDeepRPOT),
1184 u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: NewDeepRPOT))) {
1185 assert(OldBB->getRecipeList().size() == NewBB->getRecipeList().size() &&
1186 "blocks must have the same number of recipes");
1187 for (const auto &[OldR, NewR] : zip(t&: *OldBB, u&: *NewBB)) {
1188 assert(OldR.getNumOperands() == NewR.getNumOperands() &&
1189 "recipes must have the same number of operands");
1190 assert(OldR.getNumDefinedValues() == NewR.getNumDefinedValues() &&
1191 "recipes must define the same number of operands");
1192 for (const auto &[OldV, NewV] :
1193 zip(t: OldR.definedValues(), u: NewR.definedValues()))
1194 Old2NewVPValues[OldV] = NewV;
1195 }
1196 }
1197
1198 // Update all operands to use cloned VPValues.
1199 for (VPBasicBlock *NewBB :
1200 VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: NewDeepRPOT)) {
1201 for (VPRecipeBase &NewR : *NewBB)
1202 for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
1203 VPValue *NewOp = Old2NewVPValues.lookup(Val: NewR.getOperand(N: I));
1204 NewR.setOperand(I, New: NewOp);
1205 }
1206 }
1207}
1208
1209VPlan *VPlan::duplicate() {
1210 unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
1211 // Clone blocks.
1212 const auto &[NewEntry, __] = VPBlockUtils::cloneFrom(Entry);
1213
1214 BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
1215 VPIRBasicBlock *NewScalarHeader = nullptr;
1216 if (getScalarHeader()->hasPredecessors()) {
1217 NewScalarHeader = cast<VPIRBasicBlock>(Val: *find_if(
1218 Range: vp_depth_first_shallow(G: NewEntry), P: [ScalarHeaderIRBB](VPBlockBase *VPB) {
1219 auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Val: VPB);
1220 return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
1221 }));
1222 } else {
1223 NewScalarHeader = createVPIRBasicBlock(IRBB: ScalarHeaderIRBB);
1224 }
1225 // Create VPlan, clone live-ins and remap operands in the cloned blocks.
1226 auto *NewPlan =
1227 new VPlan(cast<VPBasicBlock>(Val: NewEntry), NewScalarHeader, getIndexType());
1228 DenseMap<VPValue *, VPValue *> Old2NewVPValues;
1229 for (VPIRValue *OldLiveIn : getLiveIns())
1230 Old2NewVPValues[OldLiveIn] = NewPlan->getOrAddLiveIn(V: OldLiveIn);
1231
1232 if (auto *TripCountIRV = dyn_cast_or_null<VPIRValue>(Val: TripCount))
1233 Old2NewVPValues[TripCountIRV] = NewPlan->getOrAddLiveIn(V: TripCountIRV);
1234 // else NewTripCount will be created and inserted into Old2NewVPValues when
1235 // TripCount is cloned. In any case NewPlan->TripCount is updated below.
1236
1237 if (auto *LoopRegion = getVectorLoopRegion()) {
1238 auto *OldCanIV = LoopRegion->getCanonicalIV();
1239 auto *NewCanIV = NewPlan->getVectorLoopRegion()->getCanonicalIV();
1240 assert(OldCanIV && NewCanIV &&
1241 "Loop regions of both plans must have canonical IVs.");
1242 Old2NewVPValues[OldCanIV] = NewCanIV;
1243 }
1244
1245 assert(none_of(Old2NewVPValues.keys(), IsaPred<VPSymbolicValue>) &&
1246 "All VPSymbolicValues must be handled below");
1247
1248 if (BackedgeTakenCount)
1249 NewPlan->BackedgeTakenCount =
1250 new VPSymbolicValue(BackedgeTakenCount->getType());
1251
1252 // Map and propagate materialized state for symbolic values.
1253 for (auto [OldSV, NewSV] :
1254 {std::pair{&VectorTripCount, &NewPlan->VectorTripCount},
1255 {&VF, &NewPlan->VF},
1256 {&UF, &NewPlan->UF},
1257 {&VFxUF, &NewPlan->VFxUF},
1258 {BackedgeTakenCount, NewPlan->BackedgeTakenCount}}) {
1259 if (!OldSV)
1260 continue;
1261 Old2NewVPValues[OldSV] = NewSV;
1262 if (OldSV->isMaterialized())
1263 NewSV->markMaterialized();
1264 }
1265
1266 remapOperands(Entry, NewEntry, Old2NewVPValues);
1267
1268 // Initialize remaining fields of cloned VPlan.
1269 NewPlan->VFs = VFs;
1270 NewPlan->UFs = UFs;
1271 // TODO: Adjust names.
1272 NewPlan->Name = Name;
1273 if (TripCount) {
1274 assert(Old2NewVPValues.contains(TripCount) &&
1275 "TripCount must have been added to Old2NewVPValues");
1276 NewPlan->TripCount = Old2NewVPValues[TripCount];
1277 }
1278
1279 // Transfer all cloned blocks (the second half of all current blocks) from
1280 // current to new VPlan.
1281 unsigned NumBlocksAfterCloning = CreatedBlocks.size();
1282 for (unsigned I :
1283 seq<unsigned>(Begin: NumBlocksBeforeCloning, End: NumBlocksAfterCloning))
1284 NewPlan->CreatedBlocks.push_back(Elt: this->CreatedBlocks[I]);
1285 CreatedBlocks.truncate(N: NumBlocksBeforeCloning);
1286
1287 // Update ExitBlocks of the new plan.
1288 for (VPBlockBase *VPB : NewPlan->CreatedBlocks) {
1289 if (VPB->getNumSuccessors() == 0 && isa<VPIRBasicBlock>(Val: VPB) &&
1290 VPB != NewScalarHeader)
1291 NewPlan->ExitBlocks.push_back(Elt: cast<VPIRBasicBlock>(Val: VPB));
1292 }
1293
1294 return NewPlan;
1295}
1296
1297VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
1298 auto *VPIRBB = new VPIRBasicBlock(IRBB);
1299 CreatedBlocks.push_back(Elt: VPIRBB);
1300 return VPIRBB;
1301}
1302
1303VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
1304 auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
1305 for (Instruction &I :
1306 make_range(x: IRBB->begin(), y: IRBB->getTerminator()->getIterator()))
1307 VPIRBB->appendRecipe(Recipe: VPIRInstruction::create(I));
1308 return VPIRBB;
1309}
1310
1311#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1312
1313Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
1314 return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
1315 Twine(getOrCreateBID(Block));
1316}
1317
1318void VPlanPrinter::dump() {
1319 Depth = 1;
1320 bumpIndent(0);
1321 OS << "digraph VPlan {\n";
1322 OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
1323 if (!Plan.getName().empty())
1324 OS << "\\n" << DOT::EscapeString(Plan.getName());
1325
1326 {
1327 // Print live-ins.
1328 std::string Str;
1329 raw_string_ostream SS(Str);
1330 Plan.printLiveIns(SS);
1331 SmallVector<StringRef, 0> Lines;
1332 StringRef(Str).rtrim('\n').split(Lines, "\n");
1333 for (auto Line : Lines)
1334 OS << DOT::EscapeString(Line.str()) << "\\n";
1335 }
1336
1337 OS << "\"]\n";
1338 OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
1339 OS << "edge [fontname=Courier, fontsize=30]\n";
1340 OS << "compound=true\n";
1341
1342 for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
1343 dumpBlock(Block);
1344
1345 OS << "}\n";
1346}
1347
1348void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
1349 if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
1350 dumpBasicBlock(BasicBlock);
1351 else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1352 dumpRegion(Region);
1353 else
1354 llvm_unreachable("Unsupported kind of VPBlock.");
1355}
1356
1357void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
1358 bool Hidden, const Twine &Label) {
1359 // Due to "dot" we print an edge between two regions as an edge between the
1360 // exiting basic block and the entry basic of the respective regions.
1361 const VPBlockBase *Tail = From->getExitingBasicBlock();
1362 const VPBlockBase *Head = To->getEntryBasicBlock();
1363 OS << Indent << getUID(Tail) << " -> " << getUID(Head);
1364 OS << " [ label=\"" << Label << '\"';
1365 if (Tail != From)
1366 OS << " ltail=" << getUID(From);
1367 if (Head != To)
1368 OS << " lhead=" << getUID(To);
1369 if (Hidden)
1370 OS << "; splines=none";
1371 OS << "]\n";
1372}
1373
1374void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
1375 auto &Successors = Block->getSuccessors();
1376 if (Successors.size() == 1)
1377 drawEdge(Block, Successors.front(), false, "");
1378 else if (Successors.size() == 2) {
1379 drawEdge(Block, Successors.front(), false, "T");
1380 drawEdge(Block, Successors.back(), false, "F");
1381 } else {
1382 unsigned SuccessorNumber = 0;
1383 for (auto *Successor : Successors)
1384 drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
1385 }
1386}
1387
1388void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
1389 // Implement dot-formatted dump by performing plain-text dump into the
1390 // temporary storage followed by some post-processing.
1391 OS << Indent << getUID(BasicBlock) << " [label =\n";
1392 bumpIndent(1);
1393 std::string Str;
1394 raw_string_ostream SS(Str);
1395 // Use no indentation as we need to wrap the lines into quotes ourselves.
1396 BasicBlock->print(SS, "", SlotTracker);
1397
1398 // We need to process each line of the output separately, so split
1399 // single-string plain-text dump.
1400 SmallVector<StringRef, 0> Lines;
1401 StringRef(Str).rtrim('\n').split(Lines, "\n");
1402
1403 auto EmitLine = [&](StringRef Line, StringRef Suffix) {
1404 OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix;
1405 };
1406
1407 // Don't need the "+" after the last line.
1408 for (auto Line : make_range(Lines.begin(), Lines.end() - 1))
1409 EmitLine(Line, " +\n");
1410 EmitLine(Lines.back(), "\n");
1411
1412 bumpIndent(-1);
1413 OS << Indent << "]\n";
1414
1415 dumpEdges(BasicBlock);
1416}
1417
1418void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
1419 OS << Indent << "subgraph " << getUID(Region) << " {\n";
1420 bumpIndent(1);
1421 OS << Indent << "fontname=Courier\n"
1422 << Indent << "label=\""
1423 << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
1424 << DOT::EscapeString(Region->getName()) << "\"\n";
1425
1426 if (auto *CanIV = Region->getCanonicalIV()) {
1427 OS << Indent << "\"";
1428 std::string Op;
1429 raw_string_ostream S(Op);
1430 CanIV->printAsOperand(S, SlotTracker);
1431 OS << DOT::EscapeString(Op);
1432 OS << " = CANONICAL-IV\"\n";
1433 }
1434
1435 // Dump the blocks of the region.
1436 assert(Region->getEntry() && "Region contains no inner blocks.");
1437 for (const VPBlockBase *Block : vp_depth_first_shallow(Region->getEntry()))
1438 dumpBlock(Block);
1439 bumpIndent(-1);
1440 OS << Indent << "}\n";
1441 dumpEdges(Region);
1442}
1443
1444#endif
1445
1446/// Returns true if there is a vector loop region and \p VPV is defined in a
1447/// loop region.
1448static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
1449 if (isa<VPRegionValue>(Val: VPV))
1450 return true;
1451 const VPRecipeBase *DefR = VPV->getDefiningRecipe();
1452 return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
1453 DefR->getParent()->getEnclosingLoopRegion());
1454}
1455
1456bool VPValue::isDefinedOutsideLoopRegions() const {
1457 return !isDefinedInsideLoopRegions(VPV: this);
1458}
1459void VPValue::replaceAllUsesWith(VPValue *New) {
1460 replaceUsesWithIf(New, ShouldReplace: [](VPUser &, unsigned) { return true; });
1461 if (auto *SV = dyn_cast<VPSymbolicValue>(Val: this))
1462 SV->markMaterialized();
1463}
1464
1465void VPValue::replaceUsesWithIf(
1466 VPValue *New,
1467 llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) {
1468 assertNotMaterialized();
1469 // Note that this early exit is required for correctness; the implementation
1470 // below relies on the number of users for this VPValue to decrease, which
1471 // isn't the case if this == New.
1472 if (this == New)
1473 return;
1474
1475 for (unsigned J = 0; J < getNumUsers();) {
1476 VPUser *User = Users[J];
1477 bool RemovedUser = false;
1478 for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
1479 if (User->getOperand(N: I) != this || !ShouldReplace(*User, I))
1480 continue;
1481
1482 RemovedUser = true;
1483 User->setOperand(I, New);
1484 }
1485 // If a user got removed after updating the current user, the next user to
1486 // update will be moved to the current position, so we only need to
1487 // increment the index if the number of users did not change.
1488 if (!RemovedUser)
1489 J++;
1490 }
1491}
1492
1493void VPUser::replaceUsesOfWith(VPValue *From, VPValue *To) {
1494 for (unsigned Idx = 0; Idx != getNumOperands(); ++Idx) {
1495 if (getOperand(N: Idx) == From)
1496 setOperand(I: Idx, New: To);
1497 }
1498}
1499
1500#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1501void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
1502 OS << Tracker.getOrCreateName(this);
1503}
1504
1505void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
1506 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1507 Op->printAsOperand(O, SlotTracker);
1508 });
1509}
1510#endif
1511
1512void VPSlotTracker::assignName(const VPValue *V) {
1513 assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
1514 auto *UV = V->getUnderlyingValue();
1515 auto *VPI = dyn_cast_or_null<VPInstruction>(Val: V);
1516 if (!UV && !(VPI && !VPI->getName().empty())) {
1517 VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str();
1518 NextSlot++;
1519 return;
1520 }
1521
1522 // Use the name of the underlying Value, wrapped in "ir<>", and versioned by
1523 // appending ".Number" to the name if there are multiple uses.
1524 std::string Name;
1525 if (UV)
1526 Name = getName(V: UV);
1527 else
1528 Name = VPI->getName();
1529
1530 assert(!Name.empty() && "Name cannot be empty.");
1531 StringRef Prefix = UV ? "ir<" : "vp<%";
1532 std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();
1533
1534 // First assign the base name for V.
1535 const auto &[A, _] = VPValue2Name.try_emplace(Key: V, Args&: BaseName);
1536 // Integer or FP constants with different types will result in the same string
1537 // due to stripping types.
1538 if (isa<VPIRValue>(Val: V) && isa<ConstantInt, ConstantFP>(Val: UV))
1539 return;
1540
1541 // If it is already used by C > 0 other VPValues, increase the version counter
1542 // C and use it for V.
1543 const auto &[C, UseInserted] = BaseName2Version.try_emplace(Key: BaseName, Args: 0);
1544 if (!UseInserted) {
1545 C->second++;
1546 A->second = (BaseName + Twine(".") + Twine(C->second)).str();
1547 }
1548}
1549
1550void VPSlotTracker::assignNames(const VPlan &Plan) {
1551 if (!Plan.VF.user_empty())
1552 assignName(V: &Plan.VF);
1553 if (!Plan.UF.user_empty())
1554 assignName(V: &Plan.UF);
1555 if (!Plan.VFxUF.user_empty())
1556 assignName(V: &Plan.VFxUF);
1557 assignName(V: &Plan.VectorTripCount);
1558 if (Plan.BackedgeTakenCount)
1559 assignName(V: Plan.BackedgeTakenCount);
1560 for (VPValue *LI : Plan.getLiveIns())
1561 assignName(V: LI);
1562
1563 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
1564 RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
1565 for (const VPBlockBase *VPB : RPOT) {
1566 if (auto *VPBB = dyn_cast<VPBasicBlock>(Val: VPB))
1567 assignNames(VPBB);
1568 else if (auto *CanIV = cast<VPRegionBlock>(Val: VPB)->getCanonicalIV())
1569 assignName(V: CanIV);
1570 }
1571}
1572
1573void VPSlotTracker::assignNames(const VPBasicBlock *VPBB) {
1574 for (const VPRecipeBase &Recipe : *VPBB)
1575 for (VPValue *Def : Recipe.definedValues())
1576 assignName(V: Def);
1577}
1578
1579std::string VPSlotTracker::getName(const Value *V) {
1580 std::string Name;
1581 raw_string_ostream S(Name);
1582 if (V->hasName() || !isa<Instruction>(Val: V)) {
1583 V->printAsOperand(O&: S, PrintType: false);
1584 return Name;
1585 }
1586
1587 if (!MST) {
1588 // Lazily create the ModuleSlotTracker when we first hit an unnamed
1589 // instruction.
1590 auto *I = cast<Instruction>(Val: V);
1591 // This check is required to support unit tests with incomplete IR.
1592 if (I->getParent()) {
1593 MST = std::make_unique<ModuleSlotTracker>(args: I->getModule());
1594 MST->incorporateFunction(F: *I->getFunction());
1595 } else {
1596 MST = std::make_unique<ModuleSlotTracker>(args: nullptr);
1597 }
1598 }
1599 V->printAsOperand(O&: S, PrintType: false, MST&: *MST);
1600 return Name;
1601}
1602
1603std::string VPSlotTracker::getOrCreateName(const VPValue *V) const {
1604 std::string Name = VPValue2Name.lookup(Val: V);
1605 if (!Name.empty())
1606 return Name;
1607
1608 // If no name was assigned, no VPlan was provided when creating the slot
1609 // tracker or it is not reachable from the provided VPlan. This can happen,
1610 // e.g. when trying to print a recipe that has not been inserted into a VPlan
1611 // in a debugger.
1612 // TODO: Update VPSlotTracker constructor to assign names to recipes &
1613 // VPValues not associated with a VPlan, instead of constructing names ad-hoc
1614 // here.
1615 const VPRecipeBase *DefR = V->getDefiningRecipe();
1616 (void)DefR;
1617 assert((!DefR || !DefR->getParent() || !DefR->getParent()->getPlan()) &&
1618 "VPValue defined by a recipe in a VPlan?");
1619
1620 // Use the underlying value's name, if there is one.
1621 if (auto *UV = V->getUnderlyingValue()) {
1622 std::string Name;
1623 raw_string_ostream S(Name);
1624 UV->printAsOperand(O&: S, PrintType: false);
1625 return (Twine("ir<") + Name + ">").str();
1626 }
1627
1628 return "<badref>";
1629}
1630
1631VPInstruction *VPBuilder::createAnyOfReduction(VPValue *ChainOp,
1632 VPValue *TrueVal,
1633 VPValue *FalseVal, DebugLoc DL) {
1634 assert(ChainOp->getScalarType()->isIntegerTy(1) &&
1635 "ChainOp must be i1 for AnyOf reduction");
1636 VPIRFlags Flags(RecurKind::Or, /*IsOrdered=*/false, /*IsInLoop=*/false,
1637 FastMathFlags());
1638 auto *OrReduce =
1639 createNaryOp(Opcode: VPInstruction::ComputeReductionResult, Operands: {ChainOp}, Flags, DL);
1640 auto *Freeze = createNaryOp(Opcode: Instruction::Freeze, Operands: {OrReduce}, DL);
1641 return createSelect(Cond: Freeze, TrueVal, FalseVal, DL, Name: "rdx.select");
1642}
1643
1644bool LoopVectorizationPlanner::getDecisionAndClampRange(
1645 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
1646 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
1647 bool PredicateAtRangeStart = Predicate(Range.Start);
1648
1649 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
1650 if (Predicate(TmpVF) != PredicateAtRangeStart) {
1651 Range.End = TmpVF;
1652 break;
1653 }
1654
1655 return PredicateAtRangeStart;
1656}
1657
1658VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
1659 assert(count_if(VPlans,
1660 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
1661 1 &&
1662 "Multiple VPlans for VF.");
1663
1664 for (const VPlanPtr &Plan : VPlans) {
1665 if (Plan->hasVF(VF))
1666 return *Plan.get();
1667 }
1668 llvm_unreachable("No plan found!");
1669}
1670
1671static void addRuntimeUnrollDisableMetaData(Loop *L) {
1672 SmallVector<Metadata *, 4> MDs;
1673 // Reserve first location for self reference to the LoopID metadata node.
1674 MDs.push_back(Elt: nullptr);
1675 bool IsUnrollMetadata = false;
1676 MDNode *LoopID = L->getLoopID();
1677 if (LoopID) {
1678 // First find existing loop unrolling disable metadata.
1679 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
1680 auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I));
1681 if (MD) {
1682 const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
1683 if (!S)
1684 continue;
1685 if (S->getString().starts_with(Prefix: "llvm.loop.unroll.runtime.disable"))
1686 continue;
1687 IsUnrollMetadata =
1688 S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
1689 }
1690 MDs.push_back(Elt: LoopID->getOperand(I));
1691 }
1692 }
1693
1694 if (!IsUnrollMetadata) {
1695 // Add runtime unroll disable metadata.
1696 LLVMContext &Context = L->getHeader()->getContext();
1697 SmallVector<Metadata *, 1> DisableOperands;
1698 DisableOperands.push_back(
1699 Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
1700 MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
1701 MDs.push_back(Elt: DisableNode);
1702 MDNode *NewLoopID = MDNode::get(Context, MDs);
1703 // Set operand 0 to refer to the loop id itself.
1704 NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
1705 L->setLoopID(NewLoopID);
1706 }
1707}
1708
1709void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo(
1710 Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan,
1711 bool VectorizingEpilogue, MDNode *OrigLoopID,
1712 std::optional<unsigned> OrigAverageTripCount,
1713 unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF,
1714 bool DisableRuntimeUnroll) {
1715 // Update the metadata of the scalar loop. Skip the update when vectorizing
1716 // the epilogue loop to ensure it is updated only once. Also skip the update
1717 // when the scalar loop became unreachable.
1718 auto *ScalarPH = Plan.getScalarPreheader();
1719 if (ScalarPH && !VectorizingEpilogue) {
1720 std::optional<MDNode *> RemainderLoopID =
1721 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
1722 LLVMLoopVectorizeFollowupEpilogue});
1723 if (RemainderLoopID) {
1724 OrigLoop->setLoopID(*RemainderLoopID);
1725 } else {
1726 if (DisableRuntimeUnroll)
1727 addRuntimeUnrollDisableMetaData(L: OrigLoop);
1728
1729 LoopVectorizeHints Hints(OrigLoop, /*InterleaveOnlyWhenForced*/ false,
1730 *ORE);
1731 Hints.setAlreadyVectorized();
1732 }
1733 }
1734 // Tag the scalar remainder so downstream passes (e.g. the unroller and
1735 // WarnMissedTransforms) can produce more informative remarks. Only emit
1736 // when remarks are enabled.
1737 if (ORE->enabled() && ScalarPH && ScalarPH->hasPredecessors())
1738 OrigLoop->addIntLoopAttribute(Name: "llvm.loop.vectorize.epilogue", Value: 1);
1739
1740 if (!VectorLoop)
1741 return;
1742
1743 if (std::optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(
1744 OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
1745 LLVMLoopVectorizeFollowupVectorized})) {
1746 VectorLoop->setLoopID(*VectorizedLoopID);
1747 } else {
1748 // Keep all loop hints from the original loop on the vector loop (we'll
1749 // replace the vectorizer-specific hints below).
1750 if (OrigLoopID)
1751 VectorLoop->setLoopID(OrigLoopID);
1752
1753 if (!VectorizingEpilogue) {
1754 LoopVectorizeHints Hints(VectorLoop, /*InterleaveOnlyWhenForced*/ false,
1755 *ORE);
1756 Hints.setAlreadyVectorized();
1757 }
1758 }
1759 // Tag the vector loop body so downstream passes can identify it. Only
1760 // emit when remarks are enabled.
1761 if (ORE->enabled())
1762 VectorLoop->addIntLoopAttribute(Name: "llvm.loop.vectorize.body", Value: 1);
1763 TargetTransformInfo::UnrollingPreferences UP;
1764 TTI.getUnrollingPreferences(L: VectorLoop, *PSE.getSE(), UP, ORE);
1765 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
1766 addRuntimeUnrollDisableMetaData(L: VectorLoop);
1767
1768 // Set/update profile weights for the vector and remainder loops as original
1769 // loop iterations are now distributed among them. Note that original loop
1770 // becomes the scalar remainder loop after vectorization.
1771 //
1772 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
1773 // end up getting slightly roughened result but that should be OK since
1774 // profile is not inherently precise anyway. Note also possible bypass of
1775 // vector code caused by legality checks is ignored, assigning all the weight
1776 // to the vector loop, optimistically.
1777 //
1778 // For scalable vectorization we can't know at compile time how many
1779 // iterations of the loop are handled in one vector iteration, so instead
1780 // use the value of vscale used for tuning.
1781 unsigned AverageVectorTripCount = 0;
1782 unsigned RemainderAverageTripCount = 0;
1783 auto EC = VectorLoop->getLoopPreheader()->getParent()->getEntryCount();
1784 auto IsProfiled = EC && *EC != 0;
1785 if (!OrigAverageTripCount) {
1786 if (!IsProfiled)
1787 return;
1788 auto &SE = *PSE.getSE();
1789 AverageVectorTripCount = SE.getSmallConstantTripCount(L: VectorLoop);
1790 if (ProfcheckDisableMetadataFixes || !AverageVectorTripCount)
1791 return;
1792 if (ScalarPH)
1793 RemainderAverageTripCount =
1794 SE.getSmallConstantTripCount(L: OrigLoop) % EstimatedVFxUF;
1795 // Setting to 1 should be sufficient to generate the correct branch weights.
1796 OrigLoopInvocationWeight = 1;
1797 } else {
1798 // Calculate number of iterations in unrolled loop.
1799 AverageVectorTripCount = *OrigAverageTripCount / EstimatedVFxUF;
1800 // Calculate number of iterations for remainder loop.
1801 RemainderAverageTripCount = *OrigAverageTripCount % EstimatedVFxUF;
1802 }
1803 if (HeaderVPBB) {
1804 setLoopEstimatedTripCount(L: VectorLoop, EstimatedTripCount: AverageVectorTripCount,
1805 EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
1806 }
1807
1808 if (ScalarPH) {
1809 setLoopEstimatedTripCount(L: OrigLoop, EstimatedTripCount: RemainderAverageTripCount,
1810 EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
1811 }
1812}
1813
1814#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1815void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
1816 if (VPlans.empty()) {
1817 O << "LV: No VPlans built.\n";
1818 return;
1819 }
1820 for (const auto &Plan : VPlans)
1821 if (PrintVPlansInDotFormat)
1822 Plan->printDOT(O);
1823 else
1824 Plan->print(O);
1825}
1826#endif
1827
1828bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
1829 TTI::PartialReductionExtendKind ExtKind) {
1830 APInt TruncatedVal = C->trunc(width: NarrowType->getScalarSizeInBits());
1831 unsigned WideSize = C->getBitWidth();
1832 APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
1833 ? TruncatedVal.sext(width: WideSize)
1834 : TruncatedVal.zext(width: WideSize);
1835 return ExtendedVal == *C;
1836}
1837
1838TargetTransformInfo::OperandValueInfo
1839VPCostContext::getOperandInfo(VPValue *V) const {
1840 if (auto *IRV = dyn_cast<VPIRValue>(Val: V))
1841 return TTI::getOperandInfo(V: IRV->getValue());
1842
1843 return {};
1844}
1845
1846InstructionCost VPCostContext::getScalarizationOverhead(
1847 Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
1848 TTI::VectorInstrContext VIC, bool AlwaysIncludeReplicatingR) {
1849 if (VF.isScalar())
1850 return 0;
1851
1852 assert(!VF.isScalable() &&
1853 "Scalarization overhead not supported for scalable vectors");
1854
1855 InstructionCost ScalarizationCost = 0;
1856 // Compute the cost of scalarizing the result if needed.
1857 if (!ResultTy->isVoidTy()) {
1858 for (Type *VectorTy :
1859 to_vector(Range: getContainedTypes(Ty: toVectorizedTy(Ty: ResultTy, EC: VF)))) {
1860 ScalarizationCost += TTI.getScalarizationOverhead(
1861 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
1862 /*Insert=*/true, /*Extract=*/false, CostKind,
1863 /*ForPoisonSrc=*/true, VL: {}, VIC);
1864 }
1865 }
1866 // Compute the cost of scalarizing the operands, skipping ones that do not
1867 // require extraction/scalarization and do not incur any overhead.
1868 SmallPtrSet<const VPValue *, 4> UniqueOperands;
1869 SmallVector<Type *> Tys;
1870 for (auto *Op : Operands) {
1871 if (isa<VPIRValue>(Val: Op) ||
1872 (!AlwaysIncludeReplicatingR &&
1873 isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Val: Op)) ||
1874 (isa<VPReplicateRecipe>(Val: Op) &&
1875 cast<VPReplicateRecipe>(Val: Op)->getOpcode() == Instruction::Load) ||
1876 !UniqueOperands.insert(Ptr: Op).second)
1877 continue;
1878 Tys.push_back(Elt: toVectorizedTy(Ty: Op->getScalarType(), EC: VF));
1879 }
1880 return ScalarizationCost +
1881 TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC);
1882}
1883
1884bool VPCostContext::useEmulatedMaskMemRefHack(const VPReplicateRecipe *R,
1885 ElementCount VF) {
1886 const Instruction *UI = R->getUnderlyingInstr();
1887 if (isa<LoadInst>(Val: UI))
1888 return true;
1889 assert(isa<StoreInst>(UI) && "R must either be a load or store");
1890
1891 if (!NumPredStores) {
1892 // Count the number of predicated stores in the VPlan, caching the result.
1893 // Only stores where scatter is not legal are counted, matching the legacy
1894 // cost model behavior.
1895 const VPlan &Plan = *R->getParent()->getPlan();
1896 NumPredStores = 0;
1897 for (const VPRegionBlock *VPRB :
1898 VPBlockUtils::blocksOnly<const VPRegionBlock>(
1899 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1900 assert(VPRB->isReplicator() && "must only contain replicate regions");
1901 for (const VPBasicBlock *VPBB :
1902 VPBlockUtils::blocksOnly<const VPBasicBlock>(
1903 Range: vp_depth_first_shallow(G: VPRB->getEntry()))) {
1904 for (const VPRecipeBase &Recipe : *VPBB) {
1905 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &Recipe);
1906 if (!RepR)
1907 continue;
1908 if (!isa<StoreInst>(Val: RepR->getUnderlyingInstr()))
1909 continue;
1910 // Check if scatter is legal for this store. If so, don't count it.
1911 Type *Ty = RepR->getOperand(N: 0)->getScalarType();
1912 auto *VTy = VectorType::get(ElementType: Ty, EC: VF);
1913 const Align Alignment =
1914 getLoadStoreAlignment(I: RepR->getUnderlyingInstr());
1915 if (!TTI.isLegalMaskedScatter(DataType: VTy, Alignment))
1916 ++(*NumPredStores);
1917 }
1918 }
1919 }
1920 }
1921 return *NumPredStores > NumberOfStoresToPredicate;
1922}
1923
1924bool VPCostContext::isFreeScalarIntrinsic(Intrinsic::ID ID) {
1925 return is_contained(Set: {Intrinsic::assume, Intrinsic::lifetime_end,
1926 Intrinsic::lifetime_start, Intrinsic::sideeffect,
1927 Intrinsic::pseudoprobe,
1928 Intrinsic::experimental_noalias_scope_decl},
1929 Element: ID);
1930}
1931