1//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This is the LLVM vectorization plan. It represents a candidate for
11/// vectorization, allowing to plan and optimize how to vectorize a given loop
12/// before generating LLVM-IR.
13/// The vectorizer uses vectorization plans to estimate the costs of potential
14/// candidates and if profitable to execute the desired plan, generating vector
15/// LLVM-IR code.
16///
17//===----------------------------------------------------------------------===//
18
19#include "VPlan.h"
20#include "LoopVectorizationPlanner.h"
21#include "VPlanCFG.h"
22#include "VPlanDominatorTree.h"
23#include "VPlanHelpers.h"
24#include "VPlanPatternMatch.h"
25#include "VPlanTransforms.h"
26#include "VPlanUtils.h"
27#include "llvm/ADT/PostOrderIterator.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SmallVector.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/Twine.h"
32#include "llvm/Analysis/DomTreeUpdater.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/CFG.h"
36#include "llvm/IR/IRBuilder.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Type.h"
40#include "llvm/IR/Value.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/GraphWriter.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Transforms/Utils/BasicBlockUtils.h"
47#include "llvm/Transforms/Utils/LoopVersioning.h"
48#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
49#include <cassert>
50#include <string>
51
52using namespace llvm;
53using namespace llvm::VPlanPatternMatch;
54
namespace llvm {
// Declared here, defined elsewhere; consulted when deciding whether to apply
// profile-metadata fixes.
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
} // namespace llvm

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

// Defined elsewhere; read by VPRegionBlock::cost below to override per
// instruction costs.
extern cl::opt<unsigned> ForceTargetInstructionCost;

// Defined elsewhere; declared for use by VPlan-related code.
extern cl::opt<unsigned> NumberOfStoresToPredicate;

// When set, dump() emits VPlans as DOT graphs instead of plain text.
static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

#define DEBUG_TYPE "loop-vectorize"
77
78#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
79raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
80 const VPBasicBlock *Parent = R.getParent();
81 VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr);
82 R.print(OS, "", SlotTracker);
83 return OS;
84}
85#endif
86
87Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
88 const ElementCount &VF) const {
89 switch (LaneKind) {
90 case VPLane::Kind::ScalableLast:
91 // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
92 return Builder.CreateSub(LHS: getRuntimeVF(B&: Builder, Ty: Builder.getInt32Ty(), VF),
93 RHS: Builder.getInt32(C: VF.getKnownMinValue() - Lane));
94 case VPLane::Kind::First:
95 return Builder.getInt32(C: Lane);
96 }
97 llvm_unreachable("Unknown lane kind");
98}
99
100#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
101void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
102 if (const VPRecipeBase *R = getDefiningRecipe())
103 R->print(OS, "", SlotTracker);
104 else
105 printAsOperand(OS, SlotTracker);
106}
107
108void VPValue::dump() const {
109 const VPRecipeBase *Instr = getDefiningRecipe();
110 VPSlotTracker SlotTracker(
111 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
112 print(dbgs(), SlotTracker);
113 dbgs() << "\n";
114}
115
116void VPRecipeBase::dump() const {
117 VPSlotTracker SlotTracker(getParent() ? getParent()->getPlan() : nullptr);
118 print(dbgs(), "", SlotTracker);
119 dbgs() << "\n";
120}
121#endif
122
123#if !defined(NDEBUG)
124bool VPRecipeValue::isDefinedBy(const VPDef *D) const { return Def == D; }
125#endif
126
127VPRecipeBase *VPValue::getDefiningRecipe() {
128 auto *DefValue = dyn_cast<VPRecipeValue>(Val: this);
129 return DefValue ? DefValue->Def : nullptr;
130}
131
132const VPRecipeBase *VPValue::getDefiningRecipe() const {
133 auto *DefValue = dyn_cast<VPRecipeValue>(Val: this);
134 return DefValue ? DefValue->Def : nullptr;
135}
136
137Value *VPValue::getLiveInIRValue() const {
138 return cast<VPIRValue>(Val: this)->getValue();
139}
140
141Type *VPIRValue::getType() const { return getUnderlyingValue()->getType(); }
142
143VPRecipeValue::VPRecipeValue(VPRecipeBase *Def, Value *UV)
144 : VPValue(VPVRecipeValueSC, UV), Def(Def) {
145 assert(Def && "VPRecipeValue requires a defining recipe");
146 Def->addDefinedValue(V: this);
147}
148
149VPRecipeValue::~VPRecipeValue() {
150 assert(Users.empty() &&
151 "trying to delete a VPRecipeValue with remaining users");
152 Def->removeDefinedValue(V: this);
153}
154
155// Get the top-most entry block of \p Start. This is the entry block of the
156// containing VPlan. This function is templated to support both const and non-const blocks
157template <typename T> static T *getPlanEntry(T *Start) {
158 T *Next = Start;
159 T *Current = Start;
160 while ((Next = Next->getParent()))
161 Current = Next;
162
163 SmallSetVector<T *, 8> WorkList;
164 WorkList.insert(Current);
165
166 for (unsigned i = 0; i < WorkList.size(); i++) {
167 T *Current = WorkList[i];
168 if (!Current->hasPredecessors())
169 return Current;
170 auto &Predecessors = Current->getPredecessors();
171 WorkList.insert_range(Predecessors);
172 }
173
174 llvm_unreachable("VPlan without any entry node without predecessors");
175}
176
177VPlan *VPBlockBase::getPlan() { return getPlanEntry(Start: this)->Plan; }
178
179const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(Start: this)->Plan; }
180
181/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
182const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
183 const VPBlockBase *Block = this;
184 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
185 Block = Region->getEntry();
186 return cast<VPBasicBlock>(Val: Block);
187}
188
189VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
190 VPBlockBase *Block = this;
191 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
192 Block = Region->getEntry();
193 return cast<VPBasicBlock>(Val: Block);
194}
195
196void VPBlockBase::setPlan(VPlan *ParentPlan) {
197 assert(ParentPlan->getEntry() == this && "Can only set plan on its entry.");
198 Plan = ParentPlan;
199}
200
201/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
202const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
203 const VPBlockBase *Block = this;
204 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
205 Block = Region->getExiting();
206 return cast<VPBasicBlock>(Val: Block);
207}
208
209VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
210 VPBlockBase *Block = this;
211 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
212 Block = Region->getExiting();
213 return cast<VPBasicBlock>(Val: Block);
214}
215
216VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
217 if (!Successors.empty() || !Parent)
218 return this;
219 assert(Parent->getExiting() == this &&
220 "Block w/o successors not the exiting block of its parent.");
221 return Parent->getEnclosingBlockWithSuccessors();
222}
223
224VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
225 if (!Predecessors.empty() || !Parent)
226 return this;
227 assert(Parent->getEntry() == this &&
228 "Block w/o predecessors not the entry of its parent.");
229 return Parent->getEnclosingBlockWithPredecessors();
230}
231
232VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
233 iterator It = begin();
234 while (It != end() && It->isPhi())
235 It++;
236 return It;
237}
238
/// Construct the per-execution state used while materializing a VPlan into
/// LLVM IR. Type and dominator analyses are derived from \p Plan.
/// NOTE(review): \p CanonicalIVTy is not used by this initializer list; it
/// appears to be retained for interface compatibility — confirm with callers.
VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
                                   ElementCount VF, LoopInfo *LI,
                                   DominatorTree *DT, AssumptionCache *AC,
                                   IRBuilderBase &Builder, VPlan *Plan,
                                   Loop *CurrentParentLoop, Type *CanonicalIVTy)
    : TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan),
      CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan) {}
246
/// Return the generated IR value for \p Def at lane \p Lane. Cached scalar
/// values are preferred; otherwise the lane is extracted from the cached
/// vector value.
Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
  // Live-ins and symbolic values are lane-invariant; hand back the IR value.
  if (isa<VPIRValue, VPSymbolicValue>(Val: Def))
    return Def->getUnderlyingValue();

  // Fast path: a scalar was already generated for exactly this lane.
  if (hasScalarValue(Def, Lane))
    return Data.VPV2Scalars[Def][Lane.mapToCacheIndex(VF)];

  // Single-scalar values have the same value in every lane; reuse lane 0.
  if (!Lane.isFirstLane() && vputils::isSingleScalar(VPV: Def) &&
      hasScalarValue(Def, Lane: VPLane::getFirstLane())) {
    return Data.VPV2Scalars[Def][0];
  }

  // Look through BuildVector to avoid redundant extracts.
  // TODO: Remove once replicate regions are unrolled explicitly.
  if (Lane.getKind() == VPLane::Kind::First && match(V: Def, P: m_BuildVector())) {
    auto *BuildVector = cast<VPInstruction>(Val: Def);
    return get(Def: BuildVector->getOperand(N: Lane.getKnownLane()), IsScalar: true);
  }

  assert(hasVectorValue(Def));
  auto *VecPart = Data.VPV2Vector[Def];
  if (!VecPart->getType()->isVectorTy()) {
    // A non-vector cached value only has a lane-0 interpretation.
    assert(Lane.isFirstLane() && "cannot get lane > 0 for scalar");
    return VecPart;
  }
  // Fall back to extracting the requested lane from the vector value.
  // TODO: Cache created scalar values.
  Value *LaneV = Lane.getAsRuntimeExpr(Builder, VF);
  auto *Extract = Builder.CreateExtractElement(Vec: VecPart, Idx: LaneV);
  // set(Def, Extract, Instance);
  return Extract;
}
278
/// Return the generated IR value for \p Def. With \p NeedsScalar set, the
/// lane-0 scalar is returned; otherwise a vector value is returned, created
/// on demand by broadcasting a cached scalar if necessary.
Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
  if (NeedsScalar) {
    assert((VF.isScalar() || isa<VPIRValue, VPSymbolicValue>(Def) ||
            hasVectorValue(Def) || !vputils::onlyFirstLaneUsed(Def) ||
            (hasScalarValue(Def, VPLane(0)) &&
             Data.VPV2Scalars[Def].size() == 1)) &&
           "Trying to access a single scalar per part but has multiple scalars "
           "per part.");
    return get(Def, Lane: VPLane(0));
  }

  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def))
    return Data.VPV2Vector[Def];

  // Splat a scalar into all vector lanes (no-op for scalar VF).
  auto GetBroadcastInstrs = [this](Value *V) {
    if (VF.isScalar())
      return V;
    // Broadcast the scalar into all locations in the vector.
    Value *Shuf = Builder.CreateVectorSplat(EC: VF, V, Name: "broadcast");
    return Shuf;
  };

  // No scalar cached either: this must be a live-in, broadcast it and cache
  // the resulting vector.
  if (!hasScalarValue(Def, Lane: {0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = GetBroadcastInstrs(IRV);
    set(Def, V: B);
    return B;
  }

  Value *ScalarValue = get(Def, Lane: VPLane(0));
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, V: ScalarValue);
    return ScalarValue;
  }

  bool IsSingleScalar = vputils::isSingleScalar(VPV: Def);
  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);

  // We need to construct the vector value for a single-scalar value by
  // broadcasting the scalar to all lanes.
  // TODO: Replace by introducing Broadcast VPInstructions.
  assert(IsSingleScalar && "must be a single-scalar at this point");
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto *LastInst = cast<Instruction>(Val: get(Def, Lane: LastLane));
  auto NewIP = isa<PHINode>(Val: LastInst)
                   ? LastInst->getParent()->getFirstNonPHIIt()
                   : std::next(x: BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);
  Value *VectorValue = GetBroadcastInstrs(ScalarValue);
  set(Def, V: VectorValue);
  // Restore the caller's insertion point; the broadcast was emitted next to
  // the scalar definition, not at the current position.
  Builder.restoreIP(IP: OldIP);
  return VectorValue;
}
338
339void VPTransformState::setDebugLocFrom(DebugLoc DL) {
340 const DILocation *DIL = DL;
341 // When a FSDiscriminator is enabled, we don't need to add the multiply
342 // factors to the discriminators.
343 if (DIL &&
344 Builder.GetInsertBlock()
345 ->getParent()
346 ->shouldEmitDebugInfoForProfiling() &&
347 !EnableFSDiscriminator) {
348 // FIXME: For scalable vectors, assume vscale=1.
349 unsigned UF = Plan->getConcreteUF();
350 auto NewDIL =
351 DIL->cloneByMultiplyingDuplicationFactor(DF: UF * VF.getKnownMinValue());
352 if (NewDIL)
353 Builder.SetCurrentDebugLocation(*NewDIL);
354 else
355 LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
356 << DIL->getFilename() << " Line: " << DIL->getLine());
357 } else
358 Builder.SetCurrentDebugLocation(DL);
359}
360
361Value *VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,
362 Value *WideValue,
363 const VPLane &Lane) {
364 Value *ScalarInst = get(Def, Lane);
365 Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
366 if (auto *StructTy = dyn_cast<StructType>(Val: WideValue->getType())) {
367 // We must handle each element of a vectorized struct type.
368 for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
369 Value *ScalarValue = Builder.CreateExtractValue(Agg: ScalarInst, Idxs: I);
370 Value *VectorValue = Builder.CreateExtractValue(Agg: WideValue, Idxs: I);
371 VectorValue =
372 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneExpr);
373 WideValue = Builder.CreateInsertValue(Agg: WideValue, Val: VectorValue, Idxs: I);
374 }
375 } else {
376 WideValue = Builder.CreateInsertElement(Vec: WideValue, NewElt: ScalarInst, Idx: LaneExpr);
377 }
378 return WideValue;
379}
380
381BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
382 auto &CFG = State.CFG;
383 // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
384 // Pred stands for Predessor. Prev stands for Previous - last visited/created.
385 BasicBlock *PrevBB = CFG.PrevBB;
386 BasicBlock *NewBB = BasicBlock::Create(Context&: PrevBB->getContext(), Name: getName(),
387 Parent: PrevBB->getParent(), InsertBefore: CFG.ExitBB);
388 LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
389
390 return NewBB;
391}
392
/// Wire the already-created IR block for this VPBB into the emitted CFG:
/// register it with LoopInfo and rewrite each predecessor's terminator to
/// branch to it, keeping the dominator tree updated.
void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
  auto &CFG = State.CFG;
  BasicBlock *NewBB = CFG.VPBB2IRBB[this];

  // Register NewBB in its loop. In innermost loops it's the same for all
  // BB's.
  Loop *ParentLoop = State.CurrentParentLoop;
  // If this block has a sole successor that is an exit block or is an exit
  // block itself then it needs adding to the same parent loop as the exit
  // block.
  VPBlockBase *SuccOrExitVPB = getSingleSuccessor();
  SuccOrExitVPB = SuccOrExitVPB ? SuccOrExitVPB : this;
  if (State.Plan->isExitBlock(VPBB: SuccOrExitVPB)) {
    ParentLoop = State.LI->getLoopFor(
        BB: cast<VPIRBasicBlock>(Val: SuccOrExitVPB)->getIRBasicBlock());
  }

  if (ParentLoop && !State.LI->getLoopFor(BB: NewBB))
    ParentLoop->addBasicBlockToLoop(NewBB, LI&: *State.LI);

  SmallVector<VPBlockBase *> Preds;
  if (VPBlockUtils::isHeader(VPB: this, VPDT: State.VPDT)) {
    // There's no block for the latch yet, connect to the preheader only.
    Preds = {getPredecessors()[0]};
  } else {
    Preds = to_vector(Range&: getPredecessors());
  }

  // Hook up the new basic block to its predecessors.
  for (VPBlockBase *PredVPBlock : Preds) {
    VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
    auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
    assert(CFG.VPBB2IRBB.contains(PredVPBB) &&
           "Predecessor basic-block not found building successor.");
    BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
    auto *PredBBTerminator = PredBB->getTerminator();
    LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');

    if (isa<UnreachableInst>(Val: PredBBTerminator)) {
      // Case 1: predecessor still carries its placeholder `unreachable`;
      // replace it with an unconditional branch to NewBB.
      assert(PredVPSuccessors.size() == 1 &&
             "Predecessor ending w/o branch must have single successor.");
      DebugLoc DL = PredBBTerminator->getDebugLoc();
      PredBBTerminator->eraseFromParent();
      auto *Br = UncondBrInst::Create(IfTrue: NewBB, InsertBefore: PredBB);
      Br->setDebugLoc(DL);
    } else if (auto *UBI = dyn_cast<UncondBrInst>(Val: PredBBTerminator)) {
      // Case 2: an unconditional branch already exists; point it at NewBB.
      UBI->setSuccessor(NewBB);
    } else {
      // Case 3: conditional branch; fill in the successor slot matching this
      // block's position among the predecessor's successors.
      // Set each forward successor here when it is created, excluding
      // backedges. A backward successor is set when the branch is created.
      // Branches to VPIRBasicBlocks must have the same successors in VPlan as
      // in the original IR, except when the predecessor is the entry block.
      // This enables including SCEV and memory runtime check blocks in VPlan.
      // TODO: Remove exception by modeling the terminator of entry block using
      // BranchOnCond.
      unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
      auto *TermBr = cast<CondBrInst>(Val: PredBBTerminator);
      assert((!TermBr->getSuccessor(idx) ||
              (isa<VPIRBasicBlock>(this) &&
               (TermBr->getSuccessor(idx) == NewBB ||
                PredVPBlock == getPlan()->getEntry()))) &&
             "Trying to reset an existing successor block.");
      TermBr->setSuccessor(idx, NewSucc: NewBB);
    }
    // Keep the dominator tree in sync with the newly drawn edge.
    CFG.DTU.applyUpdates(Updates: {{DominatorTree::Insert, PredBB, NewBB}});
  }
}
460
/// Emit this wrapper block: reuse the pre-existing IR basic block, run the
/// recipes into it, and hook it up to its predecessors.
void VPIRBasicBlock::execute(VPTransformState *State) {
  assert(getHierarchicalSuccessors().size() <= 2 &&
         "VPIRBasicBlock can have at most two successors at the moment!");
  // Move completely disconnected blocks to their final position.
  if (IRBB->hasNPredecessors(N: 0) && succ_begin(BB: IRBB) == succ_end(BB: IRBB))
    IRBB->moveAfter(MovePos: State->CFG.PrevBB);
  State->Builder.SetInsertPoint(IRBB->getTerminator());
  State->CFG.PrevBB = IRBB;
  State->CFG.VPBB2IRBB[this] = IRBB;
  executeRecipes(State, BB: IRBB);
  // Create a branch instruction to terminate IRBB if one was not created yet
  // and is needed.
  if (getSingleSuccessor() && isa<UnreachableInst>(Val: IRBB->getTerminator())) {
    // The branch target is reset to null here; it is filled in later when
    // the successor block is connected (see connectToPredecessors).
    auto *Br = State->Builder.CreateBr(Dest: IRBB);
    Br->setOperand(i_nocapture: 0, Val_nocapture: nullptr);
    IRBB->getTerminator()->eraseFromParent();
  } else {
    assert((getNumSuccessors() == 0 ||
            isa<UncondBrInst, CondBrInst>(IRBB->getTerminator())) &&
           "other blocks must be terminated by a branch");
  }

  connectToPredecessors(State&: *State);
}
485
486VPIRBasicBlock *VPIRBasicBlock::clone() {
487 auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB);
488 for (VPRecipeBase &R : Recipes)
489 NewBlock->appendRecipe(Recipe: R.clone());
490 return NewBlock;
491}
492
/// Emit this block: create (or reuse) an IR basic block, maintain LoopInfo
/// when entering a header or leaving a latch, and generate all recipes.
void VPBasicBlock::execute(VPTransformState *State) {
  // A set Lane means we are replicating a region for one specific lane.
  bool Replica = bool(State->Lane);
  BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.

  if (VPBlockUtils::isHeader(VPB: this, VPDT: State->VPDT)) {
    // Create and register the new vector loop.
    Loop *PrevParentLoop = State->CurrentParentLoop;
    State->CurrentParentLoop = State->LI->AllocateLoop();

    // Insert the new loop into the loop nest and register the new basic blocks
    // before calling any utilities such as SCEV that require valid LoopInfo.
    if (PrevParentLoop)
      PrevParentLoop->addChildLoop(NewChild: State->CurrentParentLoop);
    else
      State->LI->addTopLevelLoop(New: State->CurrentParentLoop);
  }

  auto IsReplicateRegion = [](VPBlockBase *BB) {
    auto *R = dyn_cast_or_null<VPRegionBlock>(Val: BB);
    assert((!R || R->isReplicator()) &&
           "only replicate region blocks should remain");
    return R;
  };
  // 1. Create an IR basic block.
  if ((Replica && this == getParent()->getEntry()) ||
      IsReplicateRegion(getSingleHierarchicalPredecessor())) {
    // Reuse the previous basic block if the current VPBB is either
    // * the entry to a replicate region, or
    // * the exit of a replicate region.
    State->CFG.VPBB2IRBB[this] = NewBB;
  } else {
    NewBB = createEmptyBasicBlock(State&: *State);

    State->Builder.SetInsertPoint(NewBB);
    // Temporarily terminate with unreachable until CFG is rewired.
    UnreachableInst *Terminator = State->Builder.CreateUnreachable();
    State->Builder.SetInsertPoint(Terminator);

    State->CFG.PrevBB = NewBB;
    State->CFG.VPBB2IRBB[this] = NewBB;
    connectToPredecessors(State&: *State);
  }

  // 2. Fill the IR basic block with IR instructions.
  executeRecipes(State, BB: NewBB);

  // If this block is a latch, update CurrentParentLoop.
  if (VPBlockUtils::isLatch(VPB: this, VPDT: State->VPDT))
    State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
}
543
544VPBasicBlock *VPBasicBlock::clone() {
545 auto *NewBlock = getPlan()->createVPBasicBlock(Name: getName());
546 for (VPRecipeBase &R : *this)
547 NewBlock->appendRecipe(Recipe: R.clone());
548 return NewBlock;
549}
550
551void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
552 LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB: " << getName()
553 << " in BB: " << BB->getName() << '\n');
554
555 State->CFG.PrevVPBB = this;
556
557 for (VPRecipeBase &Recipe : Recipes) {
558 State->setDebugLocFrom(Recipe.getDebugLoc());
559 Recipe.execute(State&: *State);
560 }
561
562 LLVM_DEBUG(dbgs() << "LV: filled BB: " << *BB);
563}
564
/// Split this block at \p SplitAt, moving all recipes from \p SplitAt to the
/// end into a newly created successor block, which is returned.
VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
  assert((SplitAt == end() || SplitAt->getParent() == this) &&
         "can only split at a position in the same block");

  // Create new empty block after the block to split.
  auto *SplitBlock = getPlan()->createVPBasicBlock(Name: getName() + ".split");
  VPBlockUtils::insertBlockAfter(NewBlock: SplitBlock, BlockPtr: this);

  // If this is the exiting block, make the split the new exiting block.
  auto *ParentRegion = getParent();
  if (ParentRegion && ParentRegion->getExiting() == this)
    ParentRegion->setExiting(SplitBlock);

  // Finally, move the recipes starting at SplitAt to new block.
  // The early-inc range keeps iteration valid while each recipe is unlinked;
  // appending at SplitBlock->end() preserves the original recipe order.
  for (VPRecipeBase &ToMove :
       make_early_inc_range(Range: make_range(x: SplitAt, y: this->end())))
    ToMove.moveBefore(BB&: *SplitBlock, I: SplitBlock->end());

  return SplitBlock;
}
585
/// Return the enclosing loop region for region \p P. The templated version is
/// used to support both const and non-const block arguments.
template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
  // Loop regions (and null) are returned as-is; only replicate regions need
  // to step out to their parent.
  if (!P || !P->isReplicator())
    return P;
  // Multiple loop regions can be nested, but replicate regions can only be
  // nested inside a loop region or must be outside any other region.
  T *LoopRegion = P->getParent();
  assert((!LoopRegion || !LoopRegion->isReplicator()) &&
         "unexpected nested replicate regions");
  return LoopRegion;
}
597
598VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
599 return getEnclosingLoopRegionForRegion(P: getParent());
600}
601
602const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
603 return getEnclosingLoopRegionForRegion(P: getParent());
604}
605
/// Return true if \p VPBB ends in a recipe that acts as a conditional
/// terminator (conditional branch, branch-on-two-conds, or switch), asserting
/// that the block's successor count is consistent with its last recipe.
static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
  if (VPBB->empty()) {
    assert(
        VPBB->getNumSuccessors() < 2 &&
        "block with multiple successors doesn't have a recipe as terminator");
    return false;
  }

  const VPRecipeBase *R = &VPBB->back();
  // Classify the final recipe; the flags are only consulted by the asserts
  // below, hence [[maybe_unused]].
  [[maybe_unused]] bool IsSwitch =
      isa<VPInstruction>(Val: R) &&
      cast<VPInstruction>(Val: R)->getOpcode() == Instruction::Switch;
  [[maybe_unused]] bool IsBranchOnTwoConds = match(V: R, P: m_BranchOnTwoConds());
  [[maybe_unused]] bool IsCondBranch =
      isa<VPBranchOnMaskRecipe>(Val: R) ||
      match(V: R, P: m_CombineOr(L: m_BranchOnCond(), R: m_BranchOnCount()));
  // Two successors, or an exiting block of a non-replicate region (whose
  // branch targets live outside the region), require a conditional terminator.
  if (VPBB->getNumSuccessors() == 2 ||
      (VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
    assert((IsCondBranch || IsSwitch || IsBranchOnTwoConds) &&
           "block with multiple successors not terminated by "
           "conditional branch nor switch recipe");

    return true;
  }

  if (VPBB->getNumSuccessors() > 2) {
    assert((IsSwitch || IsBranchOnTwoConds) &&
           "block with more than 2 successors not terminated by a switch or "
           "branch-on-two-conds recipe");
    return true;
  }

  assert(
      !IsCondBranch && !IsBranchOnTwoConds &&
      "block with 0 or 1 successors terminated by conditional branch recipe");
  return false;
}
643
644VPRecipeBase *VPBasicBlock::getTerminator() {
645 if (hasConditionalTerminator(VPBB: this))
646 return &back();
647 return nullptr;
648}
649
650const VPRecipeBase *VPBasicBlock::getTerminator() const {
651 if (hasConditionalTerminator(VPBB: this))
652 return &back();
653 return nullptr;
654}
655
656bool VPBasicBlock::isExiting() const {
657 return getParent() && getParent()->getExitingBasicBlock() == this;
658}
659
660#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
661void VPBlockBase::print(raw_ostream &O) const {
662 VPSlotTracker SlotTracker(getPlan());
663 print(O, "", SlotTracker);
664}
665
666void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
667 if (!hasSuccessors()) {
668 O << Indent << "No successors\n";
669 } else {
670 O << Indent << "Successor(s): ";
671 ListSeparator LS;
672 for (auto *Succ : getSuccessors())
673 O << LS << Succ->getName();
674 O << '\n';
675 }
676}
677
678void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
679 VPSlotTracker &SlotTracker) const {
680 O << Indent << getName() << ":\n";
681
682 auto RecipeIndent = Indent + " ";
683 for (const VPRecipeBase &Recipe : *this) {
684 Recipe.print(O, RecipeIndent, SlotTracker);
685 O << '\n';
686 }
687
688 printSuccessors(O, Indent);
689}
690#endif
691
692static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry);
693
694// Clone the CFG for all nodes reachable from \p Entry, this includes cloning
695// the blocks and their recipes. Operands of cloned recipes will NOT be updated.
696// Remapping of operands must be done separately. Returns a pair with the new
697// entry and exiting blocks of the cloned region. If \p Entry isn't part of a
698// region, return nullptr for the exiting block.
// Clone the CFG for all nodes reachable from \p Entry, this includes cloning
// the blocks and their recipes. Operands of cloned recipes will NOT be updated.
// Remapping of operands must be done separately. Returns a pair with the new
// entry and exiting blocks of the cloned region. If \p Entry isn't part of a
// region, return nullptr for the exiting block.
static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
  VPBlockBase *Exiting = nullptr;
  bool InRegion = Entry->getParent();
  // First, clone blocks reachable from Entry.
  for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
    VPBlockBase *NewBB = BB->clone();
    Old2NewVPBlocks[BB] = NewBB;
    // Inside a region, the (unique) successor-less block is the exiting one.
    if (InRegion && BB->getNumSuccessors() == 0) {
      assert(!Exiting && "Multiple exiting blocks?");
      Exiting = BB;
    }
  }
  assert((!InRegion || Exiting) && "regions must have a single exiting block");

  // Second, update the predecessors & successors of the cloned blocks.
  for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
    VPBlockBase *NewBB = Old2NewVPBlocks[BB];
    SmallVector<VPBlockBase *> NewPreds;
    for (VPBlockBase *Pred : BB->getPredecessors()) {
      NewPreds.push_back(Elt: Old2NewVPBlocks[Pred]);
    }
    NewBB->setPredecessors(NewPreds);
    SmallVector<VPBlockBase *> NewSuccs;
    for (VPBlockBase *Succ : BB->successors()) {
      NewSuccs.push_back(Elt: Old2NewVPBlocks[Succ]);
    }
    NewBB->setSuccessors(NewSuccs);
  }

#if !defined(NDEBUG)
  // Verify that the order of predecessors and successors matches in the cloned
  // version.
  for (const auto &[OldBB, NewBB] :
       zip(vp_depth_first_shallow(Entry),
           vp_depth_first_shallow(Old2NewVPBlocks[Entry]))) {
    for (const auto &[OldPred, NewPred] :
         zip(OldBB->getPredecessors(), NewBB->getPredecessors()))
      assert(NewPred == Old2NewVPBlocks[OldPred] && "Different predecessors");

    for (const auto &[OldSucc, NewSucc] :
         zip(OldBB->successors(), NewBB->successors()))
      assert(NewSucc == Old2NewVPBlocks[OldSucc] && "Different successors");
  }
#endif

  return std::make_pair(x&: Old2NewVPBlocks[Entry],
                        y: Exiting ? Old2NewVPBlocks[Exiting] : nullptr);
}
748
749VPRegionBlock *VPRegionBlock::clone() {
750 const auto &[NewEntry, NewExiting] = cloneFrom(Entry: getEntry());
751 VPlan &Plan = *getPlan();
752 VPRegionBlock *NewRegion =
753 isReplicator()
754 ? Plan.createReplicateRegion(Entry: NewEntry, Exiting: NewExiting, Name: getName())
755 : Plan.createLoopRegion(Name: getName(), Entry: NewEntry, Exiting: NewExiting);
756
757 for (VPBlockBase *Block : vp_depth_first_shallow(G: NewEntry))
758 Block->setParent(NewRegion);
759 return NewRegion;
760}
761
/// Emit a replicate region by executing its blocks once per lane of the
/// (fixed) vectorization factor, with State->Lane selecting the lane.
void VPRegionBlock::execute(VPTransformState *State) {
  assert(isReplicator() &&
         "Loop regions should have been lowered to plain CFG");
  assert(!State->Lane && "Replicating a Region with non-null instance.");
  assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");

  // Compute the traversal order once; it is reused for every lane.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Entry);
  // Enter replicating mode (a set Lane signals replication to the blocks).
  State->Lane = VPLane(0);
  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
    State->Lane = VPLane(Lane, VPLane::Kind::First);
    // Visit the VPBlocks connected to \p this, starting from it.
    for (VPBlockBase *Block : RPOT) {
      LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
      Block->execute(State);
    }
  }

  // Exit replicating mode.
  State->Lane.reset();
}
783
784InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
785 InstructionCost Cost = 0;
786 for (VPRecipeBase &R : Recipes)
787 Cost += R.cost(VF, Ctx);
788 return Cost;
789}
790
/// Return the \p Idx-th CFG predecessor of this block, looking through
/// enclosing regions: for the entry block of a non-replicate region, index 0
/// is the region's preheader and index 1 the backedge from the region itself.
const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
  const VPBlockBase *Pred = nullptr;
  if (hasPredecessors()) {
    Pred = getPredecessors()[Idx];
  } else {
    auto *Region = getParent();
    assert(Region && !Region->isReplicator() && Region->getEntry() == this &&
           "must be in the entry block of a non-replicate region");
    assert(Idx < 2 && Region->getNumPredecessors() == 1 &&
           "loop region has a single predecessor (preheader), its entry block "
           "has 2 incoming blocks");

    // Idx == 0 selects the predecessor of the region, Idx == 1 selects the
    // region itself whose exiting block feeds the phi across the backedge.
    Pred = Idx == 0 ? Region->getSinglePredecessor() : Region;
  }
  // Regions resolve to their exiting block; plain blocks to themselves.
  return Pred->getExitingBasicBlock();
}
809
/// Compute the cost of this region for \p VF: a loop region sums its blocks
/// plus the backedge branch; a replicate region costs its conditional block
/// (and is invalid for scalable VFs).
InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
  if (!isReplicator()) {
    InstructionCost Cost = 0;
    for (VPBlockBase *Block : vp_depth_first_shallow(G: getEntry()))
      Cost += Block->cost(VF, Ctx);
    // Account for the loop backedge, honoring a forced per-instruction cost
    // if one was given on the command line.
    InstructionCost BackedgeCost =
        ForceTargetInstructionCost.getNumOccurrences()
            ? InstructionCost(ForceTargetInstructionCost)
            : Ctx.TTI.getCFInstrCost(Opcode: Instruction::UncondBr, CostKind: Ctx.CostKind);
    LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
                      << ": vector loop backedge\n");
    Cost += BackedgeCost;
    return Cost;
  }

  // Compute the cost of a replicate region. Replicating isn't supported for
  // scalable vectors, return an invalid cost for them.
  // TODO: Discard scalable VPlans with replicate recipes earlier after
  // construction.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  // Compute and return the cost of the conditionally executed recipes.
  assert(VF.isVector() && "Can only compute vector cost at the moment.");
  VPBasicBlock *Then = cast<VPBasicBlock>(Val: getEntry()->getSuccessors()[0]);
  return Then->cost(VF, Ctx);
}
837
838#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
839void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
840 VPSlotTracker &SlotTracker) const {
841 O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
842 auto NewIndent = Indent + " ";
843 for (auto *BlockBase : vp_depth_first_shallow(Entry)) {
844 O << '\n';
845 BlockBase->print(O, NewIndent, SlotTracker);
846 }
847 O << Indent << "}\n";
848
849 printSuccessors(O, Indent);
850}
851#endif
852
/// Replace this region with an explicit CFG loop: the region's entry becomes
/// the loop header, its exiting block the latch, and the backedge becomes an
/// explicit latch->header edge.
void VPRegionBlock::dissolveToCFGLoop() {
  auto *Header = cast<VPBasicBlock>(Val: getEntry());
  if (auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(Val: &Header->front())) {
    assert(this == getPlan()->getVectorLoopRegion() &&
           "Canonical IV must be in the entry of the top-level loop region");
    // Lower the canonical IV to a plain scalar phi ("index") with the same
    // start and backedge values, since the region-based form goes away.
    auto *ScalarR = VPBuilder(CanIV).createScalarPhi(
        IncomingValues: {CanIV->getStartValue(), CanIV->getBackedgeValue()},
        DL: CanIV->getDebugLoc(), Name: "index");
    CanIV->replaceAllUsesWith(New: ScalarR);
    CanIV->eraseFromParent();
  }

  VPBlockBase *Preheader = getSinglePredecessor();
  auto *ExitingLatch = cast<VPBasicBlock>(Val: getExiting());

  // Detach the region from the CFG before re-wiring its contents.
  VPBlockUtils::disconnectBlocks(From: Preheader, To: this);

  // Hoist all member blocks out of the region into its parent.
  for (VPBlockBase *VPB : vp_depth_first_shallow(G: Entry))
    VPB->setParent(getParent());

  // Recreate the loop edges: preheader->header, move the region's successors
  // onto the latch, then add the latch->header backedge.
  VPBlockUtils::connectBlocks(From: Preheader, To: Header);
  VPBlockUtils::transferSuccessors(Old: this, New: ExitingLatch);
  VPBlockUtils::connectBlocks(From: ExitingLatch, To: Header);
}
877
878VPlan::VPlan(Loop *L) {
879 setEntry(createVPIRBasicBlock(IRBB: L->getLoopPreheader()));
880 ScalarHeader = createVPIRBasicBlock(IRBB: L->getHeader());
881
882 SmallVector<BasicBlock *> IRExitBlocks;
883 L->getUniqueExitBlocks(ExitBlocks&: IRExitBlocks);
884 for (BasicBlock *EB : IRExitBlocks)
885 ExitBlocks.push_back(Elt: createVPIRBasicBlock(IRBB: EB));
886}
887
VPlan::~VPlan() {
  // Recipes across blocks may reference each other's values in arbitrary
  // order; redirect every use to a dummy value first so each block can be
  // deleted independently of the others.
  VPSymbolicValue DummyValue;

  for (auto *VPB : CreatedBlocks) {
    if (auto *VPBB = dyn_cast<VPBasicBlock>(Val: VPB)) {
      // Replace all operands of recipes and all VPValues defined in VPBB with
      // DummyValue so the block can be deleted.
      for (VPRecipeBase &R : *VPBB) {
        for (auto *Def : R.definedValues())
          Def->replaceAllUsesWith(New: &DummyValue);

        for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
          R.setOperand(I, New: &DummyValue);
      }
    }
    delete VPB;
  }
  // Live-in values and the backedge-taken count are owned by the plan too.
  for (VPValue *VPV : getLiveIns())
    delete VPV;
  delete BackedgeTakenCount;
}
909
910VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const {
911 auto Iter = find_if(Range: getExitBlocks(), P: [IRBB](const VPIRBasicBlock *VPIRBB) {
912 return VPIRBB->getIRBasicBlock() == IRBB;
913 });
914 assert(Iter != getExitBlocks().end() && "no exit block found");
915 return *Iter;
916}
917
918bool VPlan::isExitBlock(VPBlockBase *VPBB) {
919 return is_contained(Range&: ExitBlocks, Element: VPBB);
920}
921
/// No-op pass callback; exists solely so RUN_VPLAN_PASS prints the final
/// VPlan when pass printing is enabled.
static void printFinalVPlan(VPlan &) {}
924
/// Generate the code inside the preheader and body of the vectorized loop.
/// Assumes a single pre-header basic-block was created for this. Introduce
/// additional basic-blocks as needed, and fill them all.
void VPlan::execute(VPTransformState *State) {
  // Initialize CFG state.
  State->CFG.PrevVPBB = nullptr;
  State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();

  // Update VPDominatorTree since VPBasicBlock may be removed after State was
  // constructed.
  State->VPDT.recalculate(Func&: *this);

  // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
  BasicBlock *VectorPreHeader = State->CFG.PrevBB;
  cast<UncondBrInst>(Val: VectorPreHeader->getTerminator())->setSuccessor(nullptr);
  State->CFG.DTU.applyUpdates(
      Updates: {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});

  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
                    << ", UF=" << getConcreteUF() << '\n');
  setName("Final VPlan");
  // TODO: RUN_VPLAN_PASS/VPlanTransforms::runPass should automatically dump
  // VPlans after some specific stages when "-debug" is specified, but that
  // hasn't been implemented yet. For now, just do both:
  LLVM_DEBUG(dump());
  RUN_VPLAN_PASS(printFinalVPlan, *this);

  BasicBlock *ScalarPh = State->CFG.ExitBB;
  VPBasicBlock *ScalarPhVPBB = getScalarPreheader();
  if (ScalarPhVPBB->hasPredecessors()) {
    // Disconnect scalar preheader and scalar header, as the dominator tree edge
    // will be updated as part of VPlan execution. This allows keeping the DTU
    // logic generic during VPlan execution.
    State->CFG.DTU.applyUpdates(
        Updates: {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
  }
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Entry);
  // Generate code for the VPlan, in parts of the vector skeleton, loop body and
  // successor blocks including the middle, exit and scalar preheader blocks.
  for (VPBlockBase *Block : RPOT)
    Block->execute(State);

  // If the original loop is unreachable, delete it and all its blocks.
  if (!ScalarPhVPBB->hasPredecessors()) {
    // DeleteDeadBlocks will remove single-entry phis. Remove them from the exit
    // VPIRBBs in VPlan as well, otherwise we would retain references to deleted
    // IR instructions.
    for (VPIRBasicBlock *EB : getExitBlocks()) {
      for (VPRecipeBase &R : make_early_inc_range(Range: EB->phis())) {
        if (R.getNumOperands() == 1)
          R.eraseFromParent();
      }
    }

    // Drop the scalar loop (and its preheader) from LoopInfo and the IR.
    Loop *OrigLoop =
        State->LI->getLoopFor(BB: getScalarHeader()->getIRBasicBlock());
    auto Blocks = OrigLoop->getBlocksVector();
    Blocks.push_back(x: cast<VPIRBasicBlock>(Val: ScalarPhVPBB)->getIRBasicBlock());
    for (auto *BB : Blocks)
      State->LI->removeBlock(BB);
    DeleteDeadBlocks(BBs: Blocks, DTU: &State->CFG.DTU);
    State->LI->erase(L: OrigLoop);
  }

  // Apply all queued dominator tree updates before fixing up header phis.
  State->CFG.DTU.flush();

  VPBasicBlock *Header = vputils::getFirstLoopHeader(Plan&: *this, VPDT&: State->VPDT);
  if (!Header)
    return;

  // Predecessor 1 of the header is the latch feeding the backedge.
  auto *LatchVPBB = cast<VPBasicBlock>(Val: Header->getPredecessors()[1]);
  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];

  // Fix the latch value of canonical, reduction and first-order recurrences
  // phis in the vector loop.
  for (VPRecipeBase &R : Header->phis()) {
    // Skip phi-like recipes that generate their backedge values themselves.
    if (isa<VPWidenPHIRecipe>(Val: &R))
      continue;

    auto *PhiR = cast<VPSingleDefRecipe>(Val: &R);
    // VPInstructions currently model scalar Phis only.
    bool NeedsScalar = isa<VPInstruction>(Val: PhiR) ||
                       (isa<VPReductionPHIRecipe>(Val: PhiR) &&
                        cast<VPReductionPHIRecipe>(Val: PhiR)->isInLoop());

    Value *Phi = State->get(Def: PhiR, NeedsScalar);
    // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does
    // not; for both, operand 1 holds the backedge value.
    Value *Val = State->get(Def: PhiR->getOperand(N: 1), NeedsScalar);
    cast<PHINode>(Val: Phi)->addIncoming(V: Val, BB: VectorLatchBB);
  }
}
1019
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
  // For now only return the cost of the vector loop region, ignoring any other
  // blocks, like the preheader or middle blocks, except for checking them for
  // recipes with invalid costs.
  InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);

  // If the cost of the loop region is invalid or any recipe in the skeleton
  // outside loop regions are invalid return an invalid cost.
  if (!Cost.isValid() || any_of(Range: VPBlockUtils::blocksOnly<VPBasicBlock>(
                                    Range: vp_depth_first_shallow(G: getEntry())),
                                P: [&VF, &Ctx](VPBasicBlock *VPBB) {
                                  return !VPBB->cost(VF, Ctx).isValid();
                                }))
    return InstructionCost::getInvalid();

  return Cost;
}
1037
1038VPRegionBlock *VPlan::getVectorLoopRegion() {
1039 // TODO: Cache if possible.
1040 for (VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1041 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1042 return R->isReplicator() ? nullptr : R;
1043 return nullptr;
1044}
1045
1046const VPRegionBlock *VPlan::getVectorLoopRegion() const {
1047 for (const VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1048 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1049 return R->isReplicator() ? nullptr : R;
1050 return nullptr;
1051}
1052
1053#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1054void VPlan::printLiveIns(raw_ostream &O) const {
1055 VPSlotTracker SlotTracker(this);
1056
1057 if (VF.getNumUsers() > 0) {
1058 O << "\nLive-in ";
1059 VF.printAsOperand(O, SlotTracker);
1060 O << " = VF";
1061 }
1062
1063 if (UF.getNumUsers() > 0) {
1064 O << "\nLive-in ";
1065 UF.printAsOperand(O, SlotTracker);
1066 O << " = UF";
1067 }
1068
1069 if (VFxUF.getNumUsers() > 0) {
1070 O << "\nLive-in ";
1071 VFxUF.printAsOperand(O, SlotTracker);
1072 O << " = VF * UF";
1073 }
1074
1075 if (VectorTripCount.getNumUsers() > 0) {
1076 O << "\nLive-in ";
1077 VectorTripCount.printAsOperand(O, SlotTracker);
1078 O << " = vector-trip-count";
1079 }
1080
1081 if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
1082 O << "\nLive-in ";
1083 BackedgeTakenCount->printAsOperand(O, SlotTracker);
1084 O << " = backedge-taken count";
1085 }
1086
1087 O << "\n";
1088 if (TripCount) {
1089 if (isa<VPIRValue>(TripCount))
1090 O << "Live-in ";
1091 TripCount->printAsOperand(O, SlotTracker);
1092 O << " = original trip-count";
1093 O << "\n";
1094 }
1095}
1096
1097LLVM_DUMP_METHOD
1098void VPlan::print(raw_ostream &O) const {
1099 VPSlotTracker SlotTracker(this);
1100
1101 O << "VPlan '" << getName() << "' {";
1102
1103 printLiveIns(O);
1104
1105 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<const VPBlockBase *>>
1106 RPOT(getEntry());
1107 for (const VPBlockBase *Block : RPOT) {
1108 O << '\n';
1109 Block->print(O, "", SlotTracker);
1110 }
1111
1112 O << "}\n";
1113}
1114
1115std::string VPlan::getName() const {
1116 std::string Out;
1117 raw_string_ostream RSO(Out);
1118 RSO << Name << " for ";
1119 if (!VFs.empty()) {
1120 RSO << "VF={" << VFs[0];
1121 for (ElementCount VF : drop_begin(VFs))
1122 RSO << "," << VF;
1123 RSO << "},";
1124 }
1125
1126 if (UFs.empty()) {
1127 RSO << "UF>=1";
1128 } else {
1129 RSO << "UF={" << UFs[0];
1130 for (unsigned UF : drop_begin(UFs))
1131 RSO << "," << UF;
1132 RSO << "}";
1133 }
1134
1135 return Out;
1136}
1137
1138LLVM_DUMP_METHOD
1139void VPlan::printDOT(raw_ostream &O) const {
1140 VPlanPrinter Printer(O, *this);
1141 Printer.dump();
1142}
1143
/// Debug helper: print this VPlan to the debug stream (dbgs()).
LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
1146#endif
1147
/// Remap the operands of all recipes reachable from \p NewEntry to the
/// corresponding cloned VPValues, pairing blocks/recipes of the old and new
/// graphs positionally via identical traversals.
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
                          DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
  // Update the operands of all cloned recipes starting at NewEntry. This
  // traverses all reachable blocks. This is done in two steps, to handle cycles
  // in PHI recipes.
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
      OldDeepRPOT(Entry);
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
      NewDeepRPOT(NewEntry);
  // First, collect all mappings from old to new VPValues defined by cloned
  // recipes. Both RPOTs visit structurally identical graphs, so old and new
  // blocks/recipes can be zipped up positionally.
  for (const auto &[OldBB, NewBB] :
       zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: OldDeepRPOT),
           u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: NewDeepRPOT))) {
    assert(OldBB->getRecipeList().size() == NewBB->getRecipeList().size() &&
           "blocks must have the same number of recipes");
    for (const auto &[OldR, NewR] : zip(t&: *OldBB, u&: *NewBB)) {
      assert(OldR.getNumOperands() == NewR.getNumOperands() &&
             "recipes must have the same number of operands");
      assert(OldR.getNumDefinedValues() == NewR.getNumDefinedValues() &&
             "recipes must define the same number of operands");
      for (const auto &[OldV, NewV] :
           zip(t: OldR.definedValues(), u: NewR.definedValues()))
        Old2NewVPValues[OldV] = NewV;
    }
  }

  // Second, update all operands to use cloned VPValues, now that the full
  // old-to-new mapping is known.
  for (VPBasicBlock *NewBB :
       VPBlockUtils::blocksOnly<VPBasicBlock>(Range: NewDeepRPOT)) {
    for (VPRecipeBase &NewR : *NewBB)
      for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
        VPValue *NewOp = Old2NewVPValues.lookup(Val: NewR.getOperand(N: I));
        NewR.setOperand(I, New: NewOp);
      }
  }
}
1185
/// Clone this VPlan: clone all blocks and recipes, remap operands to the
/// cloned values, and transfer ownership of the newly created blocks to the
/// returned plan. The original plan is left unchanged (apart from temporary
/// block bookkeeping during cloning).
VPlan *VPlan::duplicate() {
  unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
  // Clone blocks.
  const auto &[NewEntry, __] = cloneFrom(Entry);

  // Locate (or, if unreachable, freshly create) the cloned scalar header.
  BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
  VPIRBasicBlock *NewScalarHeader = nullptr;
  if (getScalarHeader()->hasPredecessors()) {
    NewScalarHeader = cast<VPIRBasicBlock>(Val: *find_if(
        Range: vp_depth_first_shallow(G: NewEntry), P: [ScalarHeaderIRBB](VPBlockBase *VPB) {
          auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Val: VPB);
          return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
        }));
  } else {
    NewScalarHeader = createVPIRBasicBlock(IRBB: ScalarHeaderIRBB);
  }
  // Create VPlan, clone live-ins and remap operands in the cloned blocks.
  auto *NewPlan = new VPlan(cast<VPBasicBlock>(Val: NewEntry), NewScalarHeader);
  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
  for (VPIRValue *OldLiveIn : getLiveIns())
    Old2NewVPValues[OldLiveIn] = NewPlan->getOrAddLiveIn(V: OldLiveIn);
  Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
  Old2NewVPValues[&VF] = &NewPlan->VF;
  Old2NewVPValues[&UF] = &NewPlan->UF;
  Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
  if (BackedgeTakenCount) {
    NewPlan->BackedgeTakenCount = new VPSymbolicValue();
    Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
  }
  if (auto *TripCountIRV = dyn_cast_or_null<VPIRValue>(Val: TripCount))
    Old2NewVPValues[TripCountIRV] = NewPlan->getOrAddLiveIn(V: TripCountIRV);
  // else NewTripCount will be created and inserted into Old2NewVPValues when
  // TripCount is cloned. In any case NewPlan->TripCount is updated below.

  remapOperands(Entry, NewEntry, Old2NewVPValues);

  // Initialize remaining fields of cloned VPlan.
  NewPlan->VFs = VFs;
  NewPlan->UFs = UFs;
  // TODO: Adjust names.
  NewPlan->Name = Name;
  if (TripCount) {
    assert(Old2NewVPValues.contains(TripCount) &&
           "TripCount must have been added to Old2NewVPValues");
    NewPlan->TripCount = Old2NewVPValues[TripCount];
  }

  // Transfer all cloned blocks (the second half of all current blocks) from
  // current to new VPlan.
  unsigned NumBlocksAfterCloning = CreatedBlocks.size();
  for (unsigned I :
       seq<unsigned>(Begin: NumBlocksBeforeCloning, End: NumBlocksAfterCloning))
    NewPlan->CreatedBlocks.push_back(Elt: this->CreatedBlocks[I]);
  CreatedBlocks.truncate(N: NumBlocksBeforeCloning);

  // Update ExitBlocks of the new plan: successor-less VPIRBasicBlocks other
  // than the scalar header are exits.
  for (VPBlockBase *VPB : NewPlan->CreatedBlocks) {
    if (VPB->getNumSuccessors() == 0 && isa<VPIRBasicBlock>(Val: VPB) &&
        VPB != NewScalarHeader)
      NewPlan->ExitBlocks.push_back(Elt: cast<VPIRBasicBlock>(Val: VPB));
  }

  return NewPlan;
}
1250
1251VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
1252 auto *VPIRBB = new VPIRBasicBlock(IRBB);
1253 CreatedBlocks.push_back(Elt: VPIRBB);
1254 return VPIRBB;
1255}
1256
1257VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
1258 auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
1259 for (Instruction &I :
1260 make_range(x: IRBB->begin(), y: IRBB->getTerminator()->getIterator()))
1261 VPIRBB->appendRecipe(Recipe: VPIRInstruction::create(I));
1262 return VPIRBB;
1263}
1264
1265#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1266
1267Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
1268 return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
1269 Twine(getOrCreateBID(Block));
1270}
1271
1272Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
1273 const std::string &Name = Block->getName();
1274 if (!Name.empty())
1275 return Name;
1276 return "VPB" + Twine(getOrCreateBID(Block));
1277}
1278
/// Emit the whole plan as a Graphviz digraph: graph-level attributes and the
/// plan's live-ins in the label, followed by every block.
void VPlanPrinter::dump() {
  Depth = 1;
  bumpIndent(0);
  OS << "digraph VPlan {\n";
  OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
  if (!Plan.getName().empty())
    OS << "\\n" << DOT::EscapeString(Plan.getName());

  {
    // Print live-ins. They are rendered into a temporary string so each line
    // can be escaped and terminated with a DOT line break ("\n").
    std::string Str;
    raw_string_ostream SS(Str);
    Plan.printLiveIns(SS);
    SmallVector<StringRef, 0> Lines;
    StringRef(Str).rtrim('\n').split(Lines, "\n");
    for (auto Line : Lines)
      OS << DOT::EscapeString(Line.str()) << "\\n";
  }

  OS << "\"]\n";
  OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
  OS << "edge [fontname=Courier, fontsize=30]\n";
  OS << "compound=true\n";

  // Dump all top-level blocks; regions recurse via dumpRegion.
  for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
    dumpBlock(Block);

  OS << "}\n";
}
1308
1309void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
1310 if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
1311 dumpBasicBlock(BasicBlock);
1312 else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1313 dumpRegion(Region);
1314 else
1315 llvm_unreachable("Unsupported kind of VPBlock.");
1316}
1317
1318void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
1319 bool Hidden, const Twine &Label) {
1320 // Due to "dot" we print an edge between two regions as an edge between the
1321 // exiting basic block and the entry basic of the respective regions.
1322 const VPBlockBase *Tail = From->getExitingBasicBlock();
1323 const VPBlockBase *Head = To->getEntryBasicBlock();
1324 OS << Indent << getUID(Tail) << " -> " << getUID(Head);
1325 OS << " [ label=\"" << Label << '\"';
1326 if (Tail != From)
1327 OS << " ltail=" << getUID(From);
1328 if (Head != To)
1329 OS << " lhead=" << getUID(To);
1330 if (Hidden)
1331 OS << "; splines=none";
1332 OS << "]\n";
1333}
1334
1335void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
1336 auto &Successors = Block->getSuccessors();
1337 if (Successors.size() == 1)
1338 drawEdge(Block, Successors.front(), false, "");
1339 else if (Successors.size() == 2) {
1340 drawEdge(Block, Successors.front(), false, "T");
1341 drawEdge(Block, Successors.back(), false, "F");
1342 } else {
1343 unsigned SuccessorNumber = 0;
1344 for (auto *Successor : Successors)
1345 drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
1346 }
1347}
1348
/// Emit a single VPBasicBlock as a DOT node whose label is the block's
/// plain-text dump, escaped line by line.
void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
  // Implement dot-formatted dump by performing plain-text dump into the
  // temporary storage followed by some post-processing.
  OS << Indent << getUID(BasicBlock) << " [label =\n";
  bumpIndent(1);
  std::string Str;
  raw_string_ostream SS(Str);
  // Use no indentation as we need to wrap the lines into quotes ourselves.
  BasicBlock->print(SS, "", SlotTracker);

  // We need to process each line of the output separately, so split
  // single-string plain-text dump.
  SmallVector<StringRef, 0> Lines;
  StringRef(Str).rtrim('\n').split(Lines, "\n");

  // Each label line is quoted, escaped and left-justified ("\l").
  auto EmitLine = [&](StringRef Line, StringRef Suffix) {
    OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix;
  };

  // Don't need the "+" after the last line.
  for (auto Line : make_range(Lines.begin(), Lines.end() - 1))
    EmitLine(Line, " +\n");
  EmitLine(Lines.back(), "\n");

  bumpIndent(-1);
  OS << Indent << "]\n";

  // Finally emit the block's outgoing edges.
  dumpEdges(BasicBlock);
}
1378
/// Emit a region as a DOT subgraph (cluster) containing its member blocks,
/// followed by the region's outgoing edges.
void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
  OS << Indent << "subgraph " << getUID(Region) << " {\n";
  bumpIndent(1);
  OS << Indent << "fontname=Courier\n"
     << Indent << "label=\""
     << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
     << DOT::EscapeString(Region->getName()) << "\"\n";
  // Dump the blocks of the region.
  assert(Region->getEntry() && "Region contains no inner blocks.");
  for (const VPBlockBase *Block : vp_depth_first_shallow(Region->getEntry()))
    dumpBlock(Block);
  bumpIndent(-1);
  OS << Indent << "}\n";
  dumpEdges(Region);
}
1394
1395#endif
1396
1397/// Returns true if there is a vector loop region and \p VPV is defined in a
1398/// loop region.
1399static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
1400 const VPRecipeBase *DefR = VPV->getDefiningRecipe();
1401 return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
1402 DefR->getParent()->getEnclosingLoopRegion());
1403}
1404
1405bool VPValue::isDefinedOutsideLoopRegions() const {
1406 return !isDefinedInsideLoopRegions(VPV: this);
1407}
1408void VPValue::replaceAllUsesWith(VPValue *New) {
1409 replaceUsesWithIf(New, ShouldReplace: [](VPUser &, unsigned) { return true; });
1410 if (auto *SV = dyn_cast<VPSymbolicValue>(Val: this))
1411 SV->markMaterialized();
1412}
1413
/// Replace uses of this value with \p New for which \p ShouldReplace returns
/// true. Iteration relies on the user list shrinking in place as uses are
/// replaced.
void VPValue::replaceUsesWithIf(
    VPValue *New,
    llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) {
  assertNotMaterialized();
  // Note that this early exit is required for correctness; the implementation
  // below relies on the number of users for this VPValue to decrease, which
  // isn't the case if this == New.
  if (this == New)
    return;

  for (unsigned J = 0; J < getNumUsers();) {
    VPUser *User = Users[J];
    bool RemovedUser = false;
    // Rewrite every operand slot of this user that matches and is approved by
    // the predicate.
    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
      if (User->getOperand(N: I) != this || !ShouldReplace(*User, I))
        continue;

      RemovedUser = true;
      User->setOperand(I, New);
    }
    // If a user got removed after updating the current user, the next user to
    // update will be moved to the current position, so we only need to
    // increment the index if the number of users did not change.
    if (!RemovedUser)
      J++;
  }
}
1441
1442void VPUser::replaceUsesOfWith(VPValue *From, VPValue *To) {
1443 for (unsigned Idx = 0; Idx != getNumOperands(); ++Idx) {
1444 if (getOperand(N: Idx) == From)
1445 setOperand(I: Idx, New: To);
1446 }
1447}
1448
1449#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this value's (possibly lazily created) name as an operand.
void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
  OS << Tracker.getOrCreateName(this);
}
1453
1454void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
1455 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1456 Op->printAsOperand(O, SlotTracker);
1457 });
1458}
1459#endif
1460
/// Assign a printable name to \p V: either a fresh numbered slot "vp<%N>" or
/// a name derived from the underlying IR value / named VPInstruction,
/// versioned with a ".Number" suffix on collisions.
void VPSlotTracker::assignName(const VPValue *V) {
  assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
  auto *UV = V->getUnderlyingValue();
  auto *VPI = dyn_cast_or_null<VPInstruction>(Val: V);
  // No underlying value and no named VPInstruction: use a numbered slot.
  if (!UV && !(VPI && !VPI->getName().empty())) {
    VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str();
    NextSlot++;
    return;
  }

  // Use the name of the underlying Value, wrapped in "ir<>", and versioned by
  // appending ".Number" to the name if there are multiple uses.
  std::string Name;
  if (UV)
    Name = getName(V: UV);
  else
    Name = VPI->getName();

  assert(!Name.empty() && "Name cannot be empty.");
  StringRef Prefix = UV ? "ir<" : "vp<%";
  std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();

  // First assign the base name for V.
  const auto &[A, _] = VPValue2Name.try_emplace(Key: V, Args&: BaseName);
  // Integer or FP constants with different types will result in the same string
  // due to stripping types; such constants intentionally share one name and
  // are not versioned.
  if (isa<VPIRValue>(Val: V) && isa<ConstantInt, ConstantFP>(Val: UV))
    return;

  // If it is already used by C > 0 other VPValues, increase the version counter
  // C and use it for V.
  const auto &[C, UseInserted] = BaseName2Version.try_emplace(Key: BaseName, Args: 0);
  if (!UseInserted) {
    C->second++;
    A->second = (BaseName + Twine(".") + Twine(C->second)).str();
  }
}
1498
/// Assign names to all values of \p Plan: the symbolic live-ins first, then
/// every value defined by a recipe, visiting blocks in reverse post-order so
/// slot numbering is deterministic.
void VPSlotTracker::assignNames(const VPlan &Plan) {
  if (Plan.VF.getNumUsers() > 0)
    assignName(V: &Plan.VF);
  if (Plan.UF.getNumUsers() > 0)
    assignName(V: &Plan.UF);
  if (Plan.VFxUF.getNumUsers() > 0)
    assignName(V: &Plan.VFxUF);
  assignName(V: &Plan.VectorTripCount);
  if (Plan.BackedgeTakenCount)
    assignName(V: Plan.BackedgeTakenCount);
  for (VPValue *LI : Plan.getLiveIns())
    assignName(V: LI);

  // Deep traversal so values defined inside regions are named as well.
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
      RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
  for (const VPBasicBlock *VPBB :
       VPBlockUtils::blocksOnly<const VPBasicBlock>(Range: RPOT))
    assignNames(VPBB);
}
1518
1519void VPSlotTracker::assignNames(const VPBasicBlock *VPBB) {
1520 for (const VPRecipeBase &Recipe : *VPBB)
1521 for (VPValue *Def : Recipe.definedValues())
1522 assignName(V: Def);
1523}
1524
/// Render the name of IR value \p V as a string. Named values and
/// non-instructions print directly; unnamed instructions need a
/// ModuleSlotTracker, created lazily on first use.
std::string VPSlotTracker::getName(const Value *V) {
  std::string Name;
  raw_string_ostream S(Name);
  if (V->hasName() || !isa<Instruction>(Val: V)) {
    V->printAsOperand(O&: S, PrintType: false);
    return Name;
  }

  if (!MST) {
    // Lazily create the ModuleSlotTracker when we first hit an unnamed
    // instruction.
    auto *I = cast<Instruction>(Val: V);
    // This check is required to support unit tests with incomplete IR.
    if (I->getParent()) {
      MST = std::make_unique<ModuleSlotTracker>(args: I->getModule());
      MST->incorporateFunction(F: *I->getFunction());
    } else {
      MST = std::make_unique<ModuleSlotTracker>(args: nullptr);
    }
  }
  // Print the slot number (e.g. "%3") using the shared tracker.
  V->printAsOperand(O&: S, PrintType: false, MST&: *MST);
  return Name;
}
1548
/// Return the name previously assigned to \p V, or construct an ad-hoc name
/// for values not reachable from the plan this tracker was built for.
std::string VPSlotTracker::getOrCreateName(const VPValue *V) const {
  std::string Name = VPValue2Name.lookup(Val: V);
  if (!Name.empty())
    return Name;

  // If no name was assigned, no VPlan was provided when creating the slot
  // tracker or it is not reachable from the provided VPlan. This can happen,
  // e.g. when trying to print a recipe that has not been inserted into a VPlan
  // in a debugger.
  // TODO: Update VPSlotTracker constructor to assign names to recipes &
  // VPValues not associated with a VPlan, instead of constructing names ad-hoc
  // here.
  const VPRecipeBase *DefR = V->getDefiningRecipe();
  (void)DefR;
  assert((!DefR || !DefR->getParent() || !DefR->getParent()->getPlan()) &&
         "VPValue defined by a recipe in a VPlan?");

  // Use the underlying value's name, if there is one.
  if (auto *UV = V->getUnderlyingValue()) {
    std::string Name;
    raw_string_ostream S(Name);
    UV->printAsOperand(O&: S, PrintType: false);
    return (Twine("ir<") + Name + ">").str();
  }

  return "<badref>";
}
1576
1577bool LoopVectorizationPlanner::getDecisionAndClampRange(
1578 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
1579 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
1580 bool PredicateAtRangeStart = Predicate(Range.Start);
1581
1582 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
1583 if (Predicate(TmpVF) != PredicateAtRangeStart) {
1584 Range.End = TmpVF;
1585 break;
1586 }
1587
1588 return PredicateAtRangeStart;
1589}
1590
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  // MaxVF * 2 serves as the exclusive upper bound so sub-ranges can cover
  // MaxVF itself.
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlan(Range&: SubRange)) {
      VPlanTransforms::optimize(Plan&: *Plan);
      // Update the name of the latch of the top-level vector loop region
      // after optimizations, which may include block folding.
      Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch");
      VPlans.push_back(Elt: std::move(Plan));
    }
    // Continue with the first VF not covered by the plan just built; the
    // sub-range may have been clamped while building.
    VF = SubRange.End;
  }
}
1611
1612VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
1613 assert(count_if(VPlans,
1614 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
1615 1 &&
1616 "Multiple VPlans for VF.");
1617
1618 for (const VPlanPtr &Plan : VPlans) {
1619 if (Plan->hasVF(VF))
1620 return *Plan.get();
1621 }
1622 llvm_unreachable("No plan found!");
1623}
1624
/// Attach "llvm.loop.unroll.runtime.disable" metadata to \p L, unless an
/// existing "llvm.loop.unroll.disable" entry already covers it.
static void addRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(Elt: nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata. Pre-existing
    // "runtime.disable" entries are skipped (not copied into MDs); a fresh
    // one is added below when no full unroll.disable entry is present.
    for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
      auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I));
      if (MD) {
        const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
        if (!S)
          continue;
        if (S->getString().starts_with(Prefix: "llvm.loop.unroll.runtime.disable"))
          continue;
        // NOTE(review): a later string entry overwrites IsUnrollMetadata, so
        // only the last string-bearing operand decides — confirm whether the
        // flag should instead be sticky (|=).
        IsUnrollMetadata =
            S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
      }
      MDs.push_back(Elt: LoopID->getOperand(I));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
    MDs.push_back(Elt: DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
1662
1663void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo(
1664 Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan,
1665 bool VectorizingEpilogue, MDNode *OrigLoopID,
1666 std::optional<unsigned> OrigAverageTripCount,
1667 unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF,
1668 bool DisableRuntimeUnroll) {
1669 // Update the metadata of the scalar loop. Skip the update when vectorizing
1670 // the epilogue loop to ensure it is updated only once. Also skip the update
1671 // when the scalar loop became unreachable.
1672 if (Plan.getScalarPreheader()->hasPredecessors() && !VectorizingEpilogue) {
1673 std::optional<MDNode *> RemainderLoopID =
1674 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
1675 LLVMLoopVectorizeFollowupEpilogue});
1676 if (RemainderLoopID) {
1677 OrigLoop->setLoopID(*RemainderLoopID);
1678 } else {
1679 if (DisableRuntimeUnroll)
1680 addRuntimeUnrollDisableMetaData(L: OrigLoop);
1681
1682 LoopVectorizeHints Hints(OrigLoop, true, *ORE);
1683 Hints.setAlreadyVectorized();
1684 }
1685 }
1686
1687 if (!VectorLoop)
1688 return;
1689
1690 if (std::optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(
1691 OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
1692 LLVMLoopVectorizeFollowupVectorized})) {
1693 VectorLoop->setLoopID(*VectorizedLoopID);
1694 } else {
1695 // Keep all loop hints from the original loop on the vector loop (we'll
1696 // replace the vectorizer-specific hints below).
1697 if (OrigLoopID)
1698 VectorLoop->setLoopID(OrigLoopID);
1699
1700 if (!VectorizingEpilogue) {
1701 LoopVectorizeHints Hints(VectorLoop, true, *ORE);
1702 Hints.setAlreadyVectorized();
1703 }
1704 }
1705 TargetTransformInfo::UnrollingPreferences UP;
1706 TTI.getUnrollingPreferences(L: VectorLoop, *PSE.getSE(), UP, ORE);
1707 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
1708 addRuntimeUnrollDisableMetaData(L: VectorLoop);
1709
1710 // Set/update profile weights for the vector and remainder loops as original
1711 // loop iterations are now distributed among them. Note that original loop
1712 // becomes the scalar remainder loop after vectorization.
1713 //
1714 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
1715 // end up getting slightly roughened result but that should be OK since
1716 // profile is not inherently precise anyway. Note also possible bypass of
1717 // vector code caused by legality checks is ignored, assigning all the weight
1718 // to the vector loop, optimistically.
1719 //
1720 // For scalable vectorization we can't know at compile time how many
1721 // iterations of the loop are handled in one vector iteration, so instead
1722 // use the value of vscale used for tuning.
1723 unsigned AverageVectorTripCount = 0;
1724 unsigned RemainderAverageTripCount = 0;
1725 auto EC = VectorLoop->getLoopPreheader()->getParent()->getEntryCount();
1726 auto IsProfiled = EC && EC->getCount();
1727 if (!OrigAverageTripCount) {
1728 if (!IsProfiled)
1729 return;
1730 auto &SE = *PSE.getSE();
1731 AverageVectorTripCount = SE.getSmallConstantTripCount(L: VectorLoop);
1732 if (ProfcheckDisableMetadataFixes || !AverageVectorTripCount)
1733 return;
1734 if (Plan.getScalarPreheader()->hasPredecessors())
1735 RemainderAverageTripCount =
1736 SE.getSmallConstantTripCount(L: OrigLoop) % EstimatedVFxUF;
1737 // Setting to 1 should be sufficient to generate the correct branch weights.
1738 OrigLoopInvocationWeight = 1;
1739 } else {
1740 // Calculate number of iterations in unrolled loop.
1741 AverageVectorTripCount = *OrigAverageTripCount / EstimatedVFxUF;
1742 // Calculate number of iterations for remainder loop.
1743 RemainderAverageTripCount = *OrigAverageTripCount % EstimatedVFxUF;
1744 }
1745 if (HeaderVPBB) {
1746 setLoopEstimatedTripCount(L: VectorLoop, EstimatedTripCount: AverageVectorTripCount,
1747 EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
1748 }
1749
1750 if (Plan.getScalarPreheader()->hasPredecessors()) {
1751 setLoopEstimatedTripCount(L: OrigLoop, EstimatedTripCount: RemainderAverageTripCount,
1752 EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
1753 }
1754}
1755
1756#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1757void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
1758 if (VPlans.empty()) {
1759 O << "LV: No VPlans built.\n";
1760 return;
1761 }
1762 for (const auto &Plan : VPlans)
1763 if (PrintVPlansInDotFormat)
1764 Plan->printDOT(O);
1765 else
1766 Plan->print(O);
1767}
1768#endif
1769
1770bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
1771 TTI::PartialReductionExtendKind ExtKind) {
1772 APInt TruncatedVal = C->trunc(width: NarrowType->getScalarSizeInBits());
1773 unsigned WideSize = C->getBitWidth();
1774 APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
1775 ? TruncatedVal.sext(width: WideSize)
1776 : TruncatedVal.zext(width: WideSize);
1777 return ExtendedVal == *C;
1778}
1779
1780TargetTransformInfo::OperandValueInfo
1781VPCostContext::getOperandInfo(VPValue *V) const {
1782 if (auto *IRV = dyn_cast<VPIRValue>(Val: V))
1783 return TTI::getOperandInfo(V: IRV->getValue());
1784
1785 return {};
1786}
1787
1788InstructionCost VPCostContext::getScalarizationOverhead(
1789 Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
1790 TTI::VectorInstrContext VIC, bool AlwaysIncludeReplicatingR) {
1791 if (VF.isScalar())
1792 return 0;
1793
1794 assert(!VF.isScalable() &&
1795 "Scalarization overhead not supported for scalable vectors");
1796
1797 InstructionCost ScalarizationCost = 0;
1798 // Compute the cost of scalarizing the result if needed.
1799 if (!ResultTy->isVoidTy()) {
1800 for (Type *VectorTy :
1801 to_vector(Range: getContainedTypes(Ty: toVectorizedTy(Ty: ResultTy, EC: VF)))) {
1802 ScalarizationCost += TTI.getScalarizationOverhead(
1803 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
1804 /*Insert=*/true, /*Extract=*/false, CostKind,
1805 /*ForPoisonSrc=*/true, VL: {}, VIC);
1806 }
1807 }
1808 // Compute the cost of scalarizing the operands, skipping ones that do not
1809 // require extraction/scalarization and do not incur any overhead.
1810 SmallPtrSet<const VPValue *, 4> UniqueOperands;
1811 SmallVector<Type *> Tys;
1812 for (auto *Op : Operands) {
1813 if (isa<VPIRValue>(Val: Op) ||
1814 (!AlwaysIncludeReplicatingR &&
1815 isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Val: Op)) ||
1816 (isa<VPReplicateRecipe>(Val: Op) &&
1817 cast<VPReplicateRecipe>(Val: Op)->getOpcode() == Instruction::Load) ||
1818 !UniqueOperands.insert(Ptr: Op).second)
1819 continue;
1820 Tys.push_back(Elt: toVectorizedTy(Ty: Types.inferScalarType(V: Op), EC: VF));
1821 }
1822 return ScalarizationCost +
1823 TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC);
1824}
1825
1826bool VPCostContext::useEmulatedMaskMemRefHack(const VPReplicateRecipe *R,
1827 ElementCount VF) {
1828 const Instruction *UI = R->getUnderlyingInstr();
1829 if (isa<LoadInst>(Val: UI))
1830 return true;
1831 assert(isa<StoreInst>(UI) && "R must either be a load or store");
1832
1833 if (!NumPredStores) {
1834 // Count the number of predicated stores in the VPlan, caching the result.
1835 // Only stores where scatter is not legal are counted, matching the legacy
1836 // cost model behavior.
1837 const VPlan &Plan = *R->getParent()->getPlan();
1838 NumPredStores = 0;
1839 for (const VPRegionBlock *VPRB :
1840 VPBlockUtils::blocksOnly<const VPRegionBlock>(
1841 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1842 assert(VPRB->isReplicator() && "must only contain replicate regions");
1843 for (const VPBasicBlock *VPBB :
1844 VPBlockUtils::blocksOnly<const VPBasicBlock>(
1845 Range: vp_depth_first_shallow(G: VPRB->getEntry()))) {
1846 for (const VPRecipeBase &Recipe : *VPBB) {
1847 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &Recipe);
1848 if (!RepR)
1849 continue;
1850 if (!isa<StoreInst>(Val: RepR->getUnderlyingInstr()))
1851 continue;
1852 // Check if scatter is legal for this store. If so, don't count it.
1853 Type *Ty = Types.inferScalarType(V: RepR->getOperand(N: 0));
1854 auto *VTy = VectorType::get(ElementType: Ty, EC: VF);
1855 const Align Alignment =
1856 getLoadStoreAlignment(I: RepR->getUnderlyingInstr());
1857 if (!TTI.isLegalMaskedScatter(DataType: VTy, Alignment))
1858 ++(*NumPredStores);
1859 }
1860 }
1861 }
1862 }
1863 return *NumPredStores > NumberOfStoresToPredicate;
1864}
1865