1//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This is the LLVM vectorization plan. It represents a candidate for
11/// vectorization, allowing to plan and optimize how to vectorize a given loop
12/// before generating LLVM-IR.
13/// The vectorizer uses vectorization plans to estimate the costs of potential
14/// candidates and if profitable to execute the desired plan, generating vector
15/// LLVM-IR code.
16///
17//===----------------------------------------------------------------------===//
18
19#include "VPlan.h"
20#include "LoopVectorizationPlanner.h"
21#include "VPlanCFG.h"
22#include "VPlanDominatorTree.h"
23#include "VPlanHelpers.h"
24#include "VPlanPatternMatch.h"
25#include "VPlanTransforms.h"
26#include "VPlanUtils.h"
27#include "llvm/ADT/PostOrderIterator.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SmallVector.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/Twine.h"
32#include "llvm/Analysis/DomTreeUpdater.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/CFG.h"
36#include "llvm/IR/IRBuilder.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Type.h"
40#include "llvm/IR/Value.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/GraphWriter.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Transforms/Utils/BasicBlockUtils.h"
47#include "llvm/Transforms/Utils/LoopVersioning.h"
48#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
49#include <cassert>
50#include <string>
51
52using namespace llvm;
53using namespace llvm::VPlanPatternMatch;
54
55namespace llvm {
56extern cl::opt<bool> ProfcheckDisableMetadataFixes;
57} // namespace llvm
58
59/// @{
60/// Metadata attribute names
61const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
62const char LLVMLoopVectorizeFollowupVectorized[] =
63 "llvm.loop.vectorize.followup_vectorized";
64const char LLVMLoopVectorizeFollowupEpilogue[] =
65 "llvm.loop.vectorize.followup_epilogue";
66/// @}
67
68extern cl::opt<unsigned> ForceTargetInstructionCost;
69
70static cl::opt<bool> PrintVPlansInDotFormat(
71 "vplan-print-in-dot-format", cl::Hidden,
72 cl::desc("Use dot format instead of plain text when dumping VPlans"));
73
74#define DEBUG_TYPE "loop-vectorize"
75
76#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
77raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
78 const VPBasicBlock *Parent = R.getParent();
79 VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr);
80 R.print(OS, "", SlotTracker);
81 return OS;
82}
83#endif
84
85Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
86 const ElementCount &VF) const {
87 switch (LaneKind) {
88 case VPLane::Kind::ScalableLast:
89 // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
90 return Builder.CreateSub(LHS: getRuntimeVF(B&: Builder, Ty: Builder.getInt32Ty(), VF),
91 RHS: Builder.getInt32(C: VF.getKnownMinValue() - Lane));
92 case VPLane::Kind::First:
93 return Builder.getInt32(C: Lane);
94 }
95 llvm_unreachable("Unknown lane kind");
96}
97
98#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
99void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
100 if (const VPRecipeBase *R = getDefiningRecipe())
101 R->print(OS, "", SlotTracker);
102 else
103 printAsOperand(OS, SlotTracker);
104}
105
106void VPValue::dump() const {
107 const VPRecipeBase *Instr = getDefiningRecipe();
108 VPSlotTracker SlotTracker(
109 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
110 print(dbgs(), SlotTracker);
111 dbgs() << "\n";
112}
113
114void VPRecipeBase::dump() const {
115 VPSlotTracker SlotTracker(getParent() ? getParent()->getPlan() : nullptr);
116 print(dbgs(), "", SlotTracker);
117 dbgs() << "\n";
118}
119#endif
120
121#if !defined(NDEBUG)
122bool VPRecipeValue::isDefinedBy(const VPDef *D) const { return Def == D; }
123#endif
124
125VPRecipeBase *VPValue::getDefiningRecipe() {
126 auto *DefValue = dyn_cast<VPRecipeValue>(Val: this);
127 return DefValue ? DefValue->Def : nullptr;
128}
129
130const VPRecipeBase *VPValue::getDefiningRecipe() const {
131 auto *DefValue = dyn_cast<VPRecipeValue>(Val: this);
132 return DefValue ? DefValue->Def : nullptr;
133}
134
135Value *VPValue::getLiveInIRValue() const {
136 return cast<VPIRValue>(Val: this)->getValue();
137}
138
139Type *VPIRValue::getType() const { return getUnderlyingValue()->getType(); }
140
141VPRecipeValue::VPRecipeValue(VPRecipeBase *Def, Value *UV)
142 : VPValue(VPVRecipeValueSC, UV), Def(Def) {
143 assert(Def && "VPRecipeValue requires a defining recipe");
144 Def->addDefinedValue(V: this);
145}
146
147VPRecipeValue::~VPRecipeValue() {
148 assert(Users.empty() &&
149 "trying to delete a VPRecipeValue with remaining users");
150 Def->removeDefinedValue(V: this);
151}
152
153// Get the top-most entry block of \p Start. This is the entry block of the
154// containing VPlan. This function is templated to support both const and non-const blocks
155template <typename T> static T *getPlanEntry(T *Start) {
156 T *Next = Start;
157 T *Current = Start;
158 while ((Next = Next->getParent()))
159 Current = Next;
160
161 SmallSetVector<T *, 8> WorkList;
162 WorkList.insert(Current);
163
164 for (unsigned i = 0; i < WorkList.size(); i++) {
165 T *Current = WorkList[i];
166 if (!Current->hasPredecessors())
167 return Current;
168 auto &Predecessors = Current->getPredecessors();
169 WorkList.insert_range(Predecessors);
170 }
171
172 llvm_unreachable("VPlan without any entry node without predecessors");
173}
174
175VPlan *VPBlockBase::getPlan() { return getPlanEntry(Start: this)->Plan; }
176
177const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(Start: this)->Plan; }
178
179/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
180const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
181 const VPBlockBase *Block = this;
182 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
183 Block = Region->getEntry();
184 return cast<VPBasicBlock>(Val: Block);
185}
186
187VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
188 VPBlockBase *Block = this;
189 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
190 Block = Region->getEntry();
191 return cast<VPBasicBlock>(Val: Block);
192}
193
194void VPBlockBase::setPlan(VPlan *ParentPlan) {
195 assert(ParentPlan->getEntry() == this && "Can only set plan on its entry.");
196 Plan = ParentPlan;
197}
198
199/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
200const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
201 const VPBlockBase *Block = this;
202 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
203 Block = Region->getExiting();
204 return cast<VPBasicBlock>(Val: Block);
205}
206
207VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
208 VPBlockBase *Block = this;
209 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Val: Block))
210 Block = Region->getExiting();
211 return cast<VPBasicBlock>(Val: Block);
212}
213
214VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
215 if (!Successors.empty() || !Parent)
216 return this;
217 assert(Parent->getExiting() == this &&
218 "Block w/o successors not the exiting block of its parent.");
219 return Parent->getEnclosingBlockWithSuccessors();
220}
221
222VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
223 if (!Predecessors.empty() || !Parent)
224 return this;
225 assert(Parent->getEntry() == this &&
226 "Block w/o predecessors not the entry of its parent.");
227 return Parent->getEnclosingBlockWithPredecessors();
228}
229
230VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
231 iterator It = begin();
232 while (It != end() && It->isPhi())
233 It++;
234 return It;
235}
236
237VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
238 ElementCount VF, LoopInfo *LI,
239 DominatorTree *DT, AssumptionCache *AC,
240 IRBuilderBase &Builder, VPlan *Plan,
241 Loop *CurrentParentLoop, Type *CanonicalIVTy)
242 : TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan),
243 CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan) {}
244
245Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
246 if (isa<VPIRValue, VPSymbolicValue>(Val: Def))
247 return Def->getUnderlyingValue();
248
249 if (hasScalarValue(Def, Lane))
250 return Data.VPV2Scalars[Def][Lane.mapToCacheIndex(VF)];
251
252 if (!Lane.isFirstLane() && vputils::isSingleScalar(VPV: Def) &&
253 hasScalarValue(Def, Lane: VPLane::getFirstLane())) {
254 return Data.VPV2Scalars[Def][0];
255 }
256
257 // Look through BuildVector to avoid redundant extracts.
258 // TODO: Remove once replicate regions are unrolled explicitly.
259 if (Lane.getKind() == VPLane::Kind::First && match(V: Def, P: m_BuildVector())) {
260 auto *BuildVector = cast<VPInstruction>(Val: Def);
261 return get(Def: BuildVector->getOperand(N: Lane.getKnownLane()), IsScalar: true);
262 }
263
264 assert(hasVectorValue(Def));
265 auto *VecPart = Data.VPV2Vector[Def];
266 if (!VecPart->getType()->isVectorTy()) {
267 assert(Lane.isFirstLane() && "cannot get lane > 0 for scalar");
268 return VecPart;
269 }
270 // TODO: Cache created scalar values.
271 Value *LaneV = Lane.getAsRuntimeExpr(Builder, VF);
272 auto *Extract = Builder.CreateExtractElement(Vec: VecPart, Idx: LaneV);
273 // set(Def, Extract, Instance);
274 return Extract;
275}
276
277Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
278 if (NeedsScalar) {
279 assert((VF.isScalar() || isa<VPIRValue, VPSymbolicValue>(Def) ||
280 hasVectorValue(Def) || !vputils::onlyFirstLaneUsed(Def) ||
281 (hasScalarValue(Def, VPLane(0)) &&
282 Data.VPV2Scalars[Def].size() == 1)) &&
283 "Trying to access a single scalar per part but has multiple scalars "
284 "per part.");
285 return get(Def, Lane: VPLane(0));
286 }
287
288 // If Values have been set for this Def return the one relevant for \p Part.
289 if (hasVectorValue(Def))
290 return Data.VPV2Vector[Def];
291
292 auto GetBroadcastInstrs = [this](Value *V) {
293 if (VF.isScalar())
294 return V;
295 // Broadcast the scalar into all locations in the vector.
296 Value *Shuf = Builder.CreateVectorSplat(EC: VF, V, Name: "broadcast");
297 return Shuf;
298 };
299
300 if (!hasScalarValue(Def, Lane: {0})) {
301 Value *IRV = Def->getLiveInIRValue();
302 Value *B = GetBroadcastInstrs(IRV);
303 set(Def, V: B);
304 return B;
305 }
306
307 Value *ScalarValue = get(Def, Lane: VPLane(0));
308 // If we aren't vectorizing, we can just copy the scalar map values over
309 // to the vector map.
310 if (VF.isScalar()) {
311 set(Def, V: ScalarValue);
312 return ScalarValue;
313 }
314
315 bool IsSingleScalar = vputils::isSingleScalar(VPV: Def);
316 VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
317
318 // We need to construct the vector value for a single-scalar value by
319 // broadcasting the scalar to all lanes.
320 // TODO: Replace by introducing Broadcast VPInstructions.
321 assert(IsSingleScalar && "must be a single-scalar at this point");
322 // Set the insert point after the last scalarized instruction or after the
323 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
324 // will directly follow the scalar definitions.
325 auto OldIP = Builder.saveIP();
326 auto *LastInst = cast<Instruction>(Val: get(Def, Lane: LastLane));
327 auto NewIP = isa<PHINode>(Val: LastInst)
328 ? LastInst->getParent()->getFirstNonPHIIt()
329 : std::next(x: BasicBlock::iterator(LastInst));
330 Builder.SetInsertPoint(&*NewIP);
331 Value *VectorValue = GetBroadcastInstrs(ScalarValue);
332 set(Def, V: VectorValue);
333 Builder.restoreIP(IP: OldIP);
334 return VectorValue;
335}
336
337void VPTransformState::setDebugLocFrom(DebugLoc DL) {
338 const DILocation *DIL = DL;
339 // When a FSDiscriminator is enabled, we don't need to add the multiply
340 // factors to the discriminators.
341 if (DIL &&
342 Builder.GetInsertBlock()
343 ->getParent()
344 ->shouldEmitDebugInfoForProfiling() &&
345 !EnableFSDiscriminator) {
346 // FIXME: For scalable vectors, assume vscale=1.
347 unsigned UF = Plan->getUF();
348 auto NewDIL =
349 DIL->cloneByMultiplyingDuplicationFactor(DF: UF * VF.getKnownMinValue());
350 if (NewDIL)
351 Builder.SetCurrentDebugLocation(*NewDIL);
352 else
353 LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
354 << DIL->getFilename() << " Line: " << DIL->getLine());
355 } else
356 Builder.SetCurrentDebugLocation(DL);
357}
358
359Value *VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,
360 Value *WideValue,
361 const VPLane &Lane) {
362 Value *ScalarInst = get(Def, Lane);
363 Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
364 if (auto *StructTy = dyn_cast<StructType>(Val: WideValue->getType())) {
365 // We must handle each element of a vectorized struct type.
366 for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
367 Value *ScalarValue = Builder.CreateExtractValue(Agg: ScalarInst, Idxs: I);
368 Value *VectorValue = Builder.CreateExtractValue(Agg: WideValue, Idxs: I);
369 VectorValue =
370 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneExpr);
371 WideValue = Builder.CreateInsertValue(Agg: WideValue, Val: VectorValue, Idxs: I);
372 }
373 } else {
374 WideValue = Builder.CreateInsertElement(Vec: WideValue, NewElt: ScalarInst, Idx: LaneExpr);
375 }
376 return WideValue;
377}
378
379BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
380 auto &CFG = State.CFG;
381 // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
382 // Pred stands for Predessor. Prev stands for Previous - last visited/created.
383 BasicBlock *PrevBB = CFG.PrevBB;
384 BasicBlock *NewBB = BasicBlock::Create(Context&: PrevBB->getContext(), Name: getName(),
385 Parent: PrevBB->getParent(), InsertBefore: CFG.ExitBB);
386 LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
387
388 return NewBB;
389}
390
391void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
392 auto &CFG = State.CFG;
393 BasicBlock *NewBB = CFG.VPBB2IRBB[this];
394
395 // Register NewBB in its loop. In innermost loops its the same for all
396 // BB's.
397 Loop *ParentLoop = State.CurrentParentLoop;
398 // If this block has a sole successor that is an exit block or is an exit
399 // block itself then it needs adding to the same parent loop as the exit
400 // block.
401 VPBlockBase *SuccOrExitVPB = getSingleSuccessor();
402 SuccOrExitVPB = SuccOrExitVPB ? SuccOrExitVPB : this;
403 if (State.Plan->isExitBlock(VPBB: SuccOrExitVPB)) {
404 ParentLoop = State.LI->getLoopFor(
405 BB: cast<VPIRBasicBlock>(Val: SuccOrExitVPB)->getIRBasicBlock());
406 }
407
408 if (ParentLoop && !State.LI->getLoopFor(BB: NewBB))
409 ParentLoop->addBasicBlockToLoop(NewBB, LI&: *State.LI);
410
411 SmallVector<VPBlockBase *> Preds;
412 if (VPBlockUtils::isHeader(VPB: this, VPDT: State.VPDT)) {
413 // There's no block for the latch yet, connect to the preheader only.
414 Preds = {getPredecessors()[0]};
415 } else {
416 Preds = to_vector(Range&: getPredecessors());
417 }
418
419 // Hook up the new basic block to its predecessors.
420 for (VPBlockBase *PredVPBlock : Preds) {
421 VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
422 auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
423 assert(CFG.VPBB2IRBB.contains(PredVPBB) &&
424 "Predecessor basic-block not found building successor.");
425 BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
426 auto *PredBBTerminator = PredBB->getTerminator();
427 LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');
428
429 auto *TermBr = dyn_cast<BranchInst>(Val: PredBBTerminator);
430 if (isa<UnreachableInst>(Val: PredBBTerminator)) {
431 assert(PredVPSuccessors.size() == 1 &&
432 "Predecessor ending w/o branch must have single successor.");
433 DebugLoc DL = PredBBTerminator->getDebugLoc();
434 PredBBTerminator->eraseFromParent();
435 auto *Br = BranchInst::Create(IfTrue: NewBB, InsertBefore: PredBB);
436 Br->setDebugLoc(DL);
437 } else if (TermBr && !TermBr->isConditional()) {
438 TermBr->setSuccessor(idx: 0, NewSucc: NewBB);
439 } else {
440 // Set each forward successor here when it is created, excluding
441 // backedges. A backward successor is set when the branch is created.
442 // Branches to VPIRBasicBlocks must have the same successors in VPlan as
443 // in the original IR, except when the predecessor is the entry block.
444 // This enables including SCEV and memory runtime check blocks in VPlan.
445 // TODO: Remove exception by modeling the terminator of entry block using
446 // BranchOnCond.
447 unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
448 assert((TermBr && (!TermBr->getSuccessor(idx) ||
449 (isa<VPIRBasicBlock>(this) &&
450 (TermBr->getSuccessor(idx) == NewBB ||
451 PredVPBlock == getPlan()->getEntry())))) &&
452 "Trying to reset an existing successor block.");
453 TermBr->setSuccessor(idx, NewSucc: NewBB);
454 }
455 CFG.DTU.applyUpdates(Updates: {{DominatorTree::Insert, PredBB, NewBB}});
456 }
457}
458
459void VPIRBasicBlock::execute(VPTransformState *State) {
460 assert(getHierarchicalSuccessors().size() <= 2 &&
461 "VPIRBasicBlock can have at most two successors at the moment!");
462 // Move completely disconnected blocks to their final position.
463 if (IRBB->hasNPredecessors(N: 0) && succ_begin(BB: IRBB) == succ_end(BB: IRBB))
464 IRBB->moveAfter(MovePos: State->CFG.PrevBB);
465 State->Builder.SetInsertPoint(IRBB->getTerminator());
466 State->CFG.PrevBB = IRBB;
467 State->CFG.VPBB2IRBB[this] = IRBB;
468 executeRecipes(State, BB: IRBB);
469 // Create a branch instruction to terminate IRBB if one was not created yet
470 // and is needed.
471 if (getSingleSuccessor() && isa<UnreachableInst>(Val: IRBB->getTerminator())) {
472 auto *Br = State->Builder.CreateBr(Dest: IRBB);
473 Br->setOperand(i_nocapture: 0, Val_nocapture: nullptr);
474 IRBB->getTerminator()->eraseFromParent();
475 } else {
476 assert(
477 (getNumSuccessors() == 0 || isa<BranchInst>(IRBB->getTerminator())) &&
478 "other blocks must be terminated by a branch");
479 }
480
481 connectToPredecessors(State&: *State);
482}
483
484VPIRBasicBlock *VPIRBasicBlock::clone() {
485 auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB);
486 for (VPRecipeBase &R : Recipes)
487 NewBlock->appendRecipe(Recipe: R.clone());
488 return NewBlock;
489}
490
491void VPBasicBlock::execute(VPTransformState *State) {
492 bool Replica = bool(State->Lane);
493 BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
494
495 if (VPBlockUtils::isHeader(VPB: this, VPDT: State->VPDT)) {
496 // Create and register the new vector loop.
497 Loop *PrevParentLoop = State->CurrentParentLoop;
498 State->CurrentParentLoop = State->LI->AllocateLoop();
499
500 // Insert the new loop into the loop nest and register the new basic blocks
501 // before calling any utilities such as SCEV that require valid LoopInfo.
502 if (PrevParentLoop)
503 PrevParentLoop->addChildLoop(NewChild: State->CurrentParentLoop);
504 else
505 State->LI->addTopLevelLoop(New: State->CurrentParentLoop);
506 }
507
508 auto IsReplicateRegion = [](VPBlockBase *BB) {
509 auto *R = dyn_cast_or_null<VPRegionBlock>(Val: BB);
510 assert((!R || R->isReplicator()) &&
511 "only replicate region blocks should remain");
512 return R;
513 };
514 // 1. Create an IR basic block.
515 if ((Replica && this == getParent()->getEntry()) ||
516 IsReplicateRegion(getSingleHierarchicalPredecessor())) {
517 // Reuse the previous basic block if the current VPBB is either
518 // * the entry to a replicate region, or
519 // * the exit of a replicate region.
520 State->CFG.VPBB2IRBB[this] = NewBB;
521 } else {
522 NewBB = createEmptyBasicBlock(State&: *State);
523
524 State->Builder.SetInsertPoint(NewBB);
525 // Temporarily terminate with unreachable until CFG is rewired.
526 UnreachableInst *Terminator = State->Builder.CreateUnreachable();
527 State->Builder.SetInsertPoint(Terminator);
528
529 State->CFG.PrevBB = NewBB;
530 State->CFG.VPBB2IRBB[this] = NewBB;
531 connectToPredecessors(State&: *State);
532 }
533
534 // 2. Fill the IR basic block with IR instructions.
535 executeRecipes(State, BB: NewBB);
536
537 // If this block is a latch, update CurrentParentLoop.
538 if (VPBlockUtils::isLatch(VPB: this, VPDT: State->VPDT))
539 State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
540}
541
542VPBasicBlock *VPBasicBlock::clone() {
543 auto *NewBlock = getPlan()->createVPBasicBlock(Name: getName());
544 for (VPRecipeBase &R : *this)
545 NewBlock->appendRecipe(Recipe: R.clone());
546 return NewBlock;
547}
548
549void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
550 LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB: " << getName()
551 << " in BB: " << BB->getName() << '\n');
552
553 State->CFG.PrevVPBB = this;
554
555 for (VPRecipeBase &Recipe : Recipes) {
556 State->setDebugLocFrom(Recipe.getDebugLoc());
557 Recipe.execute(State&: *State);
558 }
559
560 LLVM_DEBUG(dbgs() << "LV: filled BB: " << *BB);
561}
562
563VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
564 assert((SplitAt == end() || SplitAt->getParent() == this) &&
565 "can only split at a position in the same block");
566
567 // Create new empty block after the block to split.
568 auto *SplitBlock = getPlan()->createVPBasicBlock(Name: getName() + ".split");
569 VPBlockUtils::insertBlockAfter(NewBlock: SplitBlock, BlockPtr: this);
570
571 // Finally, move the recipes starting at SplitAt to new block.
572 for (VPRecipeBase &ToMove :
573 make_early_inc_range(Range: make_range(x: SplitAt, y: this->end())))
574 ToMove.moveBefore(BB&: *SplitBlock, I: SplitBlock->end());
575
576 return SplitBlock;
577}
578
579/// Return the enclosing loop region for region \p P. The templated version is
580/// used to support both const and non-const block arguments.
581template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
582 if (P && P->isReplicator()) {
583 P = P->getParent();
584 // Multiple loop regions can be nested, but replicate regions can only be
585 // nested inside a loop region or must be outside any other region.
586 assert((!P || !P->isReplicator()) && "unexpected nested replicate regions");
587 }
588 return P;
589}
590
591VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
592 return getEnclosingLoopRegionForRegion(P: getParent());
593}
594
595const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
596 return getEnclosingLoopRegionForRegion(P: getParent());
597}
598
599static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
600 if (VPBB->empty()) {
601 assert(
602 VPBB->getNumSuccessors() < 2 &&
603 "block with multiple successors doesn't have a recipe as terminator");
604 return false;
605 }
606
607 const VPRecipeBase *R = &VPBB->back();
608 [[maybe_unused]] bool IsSwitch =
609 isa<VPInstruction>(Val: R) &&
610 cast<VPInstruction>(Val: R)->getOpcode() == Instruction::Switch;
611 [[maybe_unused]] bool IsBranchOnTwoConds = match(V: R, P: m_BranchOnTwoConds());
612 [[maybe_unused]] bool IsCondBranch =
613 isa<VPBranchOnMaskRecipe>(Val: R) ||
614 match(V: R, P: m_CombineOr(L: m_BranchOnCond(), R: m_BranchOnCount()));
615 if (VPBB->getNumSuccessors() == 2 ||
616 (VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
617 assert((IsCondBranch || IsSwitch || IsBranchOnTwoConds) &&
618 "block with multiple successors not terminated by "
619 "conditional branch nor switch recipe");
620
621 return true;
622 }
623
624 if (VPBB->getNumSuccessors() > 2) {
625 assert((IsSwitch || IsBranchOnTwoConds) &&
626 "block with more than 2 successors not terminated by a switch or "
627 "branch-on-two-conds recipe");
628 return true;
629 }
630
631 assert(
632 !IsCondBranch && !IsBranchOnTwoConds &&
633 "block with 0 or 1 successors terminated by conditional branch recipe");
634 return false;
635}
636
637VPRecipeBase *VPBasicBlock::getTerminator() {
638 if (hasConditionalTerminator(VPBB: this))
639 return &back();
640 return nullptr;
641}
642
643const VPRecipeBase *VPBasicBlock::getTerminator() const {
644 if (hasConditionalTerminator(VPBB: this))
645 return &back();
646 return nullptr;
647}
648
649bool VPBasicBlock::isExiting() const {
650 return getParent() && getParent()->getExitingBasicBlock() == this;
651}
652
653#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
654void VPBlockBase::print(raw_ostream &O) const {
655 VPSlotTracker SlotTracker(getPlan());
656 print(O, "", SlotTracker);
657}
658
659void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
660 if (getSuccessors().empty()) {
661 O << Indent << "No successors\n";
662 } else {
663 O << Indent << "Successor(s): ";
664 ListSeparator LS;
665 for (auto *Succ : getSuccessors())
666 O << LS << Succ->getName();
667 O << '\n';
668 }
669}
670
671void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
672 VPSlotTracker &SlotTracker) const {
673 O << Indent << getName() << ":\n";
674
675 auto RecipeIndent = Indent + " ";
676 for (const VPRecipeBase &Recipe : *this) {
677 Recipe.print(O, RecipeIndent, SlotTracker);
678 O << '\n';
679 }
680
681 printSuccessors(O, Indent);
682}
683#endif
684
685static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry);
686
687// Clone the CFG for all nodes reachable from \p Entry, this includes cloning
688// the blocks and their recipes. Operands of cloned recipes will NOT be updated.
689// Remapping of operands must be done separately. Returns a pair with the new
690// entry and exiting blocks of the cloned region. If \p Entry isn't part of a
691// region, return nullptr for the exiting block.
692static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
693 DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
694 VPBlockBase *Exiting = nullptr;
695 bool InRegion = Entry->getParent();
696 // First, clone blocks reachable from Entry.
697 for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
698 VPBlockBase *NewBB = BB->clone();
699 Old2NewVPBlocks[BB] = NewBB;
700 if (InRegion && BB->getNumSuccessors() == 0) {
701 assert(!Exiting && "Multiple exiting blocks?");
702 Exiting = BB;
703 }
704 }
705 assert((!InRegion || Exiting) && "regions must have a single exiting block");
706
707 // Second, update the predecessors & successors of the cloned blocks.
708 for (VPBlockBase *BB : vp_depth_first_shallow(G: Entry)) {
709 VPBlockBase *NewBB = Old2NewVPBlocks[BB];
710 SmallVector<VPBlockBase *> NewPreds;
711 for (VPBlockBase *Pred : BB->getPredecessors()) {
712 NewPreds.push_back(Elt: Old2NewVPBlocks[Pred]);
713 }
714 NewBB->setPredecessors(NewPreds);
715 SmallVector<VPBlockBase *> NewSuccs;
716 for (VPBlockBase *Succ : BB->successors()) {
717 NewSuccs.push_back(Elt: Old2NewVPBlocks[Succ]);
718 }
719 NewBB->setSuccessors(NewSuccs);
720 }
721
722#if !defined(NDEBUG)
723 // Verify that the order of predecessors and successors matches in the cloned
724 // version.
725 for (const auto &[OldBB, NewBB] :
726 zip(vp_depth_first_shallow(Entry),
727 vp_depth_first_shallow(Old2NewVPBlocks[Entry]))) {
728 for (const auto &[OldPred, NewPred] :
729 zip(OldBB->getPredecessors(), NewBB->getPredecessors()))
730 assert(NewPred == Old2NewVPBlocks[OldPred] && "Different predecessors");
731
732 for (const auto &[OldSucc, NewSucc] :
733 zip(OldBB->successors(), NewBB->successors()))
734 assert(NewSucc == Old2NewVPBlocks[OldSucc] && "Different successors");
735 }
736#endif
737
738 return std::make_pair(x&: Old2NewVPBlocks[Entry],
739 y: Exiting ? Old2NewVPBlocks[Exiting] : nullptr);
740}
741
742VPRegionBlock *VPRegionBlock::clone() {
743 const auto &[NewEntry, NewExiting] = cloneFrom(Entry: getEntry());
744 VPlan &Plan = *getPlan();
745 VPRegionBlock *NewRegion =
746 isReplicator()
747 ? Plan.createReplicateRegion(Entry: NewEntry, Exiting: NewExiting, Name: getName())
748 : Plan.createLoopRegion(Name: getName(), Entry: NewEntry, Exiting: NewExiting);
749
750 for (VPBlockBase *Block : vp_depth_first_shallow(G: NewEntry))
751 Block->setParent(NewRegion);
752 return NewRegion;
753}
754
755void VPRegionBlock::execute(VPTransformState *State) {
756 assert(isReplicator() &&
757 "Loop regions should have been lowered to plain CFG");
758 assert(!State->Lane && "Replicating a Region with non-null instance.");
759 assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
760
761 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
762 Entry);
763 State->Lane = VPLane(0);
764 for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
765 State->Lane = VPLane(Lane, VPLane::Kind::First);
766 // Visit the VPBlocks connected to \p this, starting from it.
767 for (VPBlockBase *Block : RPOT) {
768 LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
769 Block->execute(State);
770 }
771 }
772
773 // Exit replicating mode.
774 State->Lane.reset();
775}
776
777InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
778 InstructionCost Cost = 0;
779 for (VPRecipeBase &R : Recipes)
780 Cost += R.cost(VF, Ctx);
781 return Cost;
782}
783
784const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
785 const VPBlockBase *Pred = nullptr;
786 if (hasPredecessors()) {
787 Pred = getPredecessors()[Idx];
788 } else {
789 auto *Region = getParent();
790 assert(Region && !Region->isReplicator() && Region->getEntry() == this &&
791 "must be in the entry block of a non-replicate region");
792 assert(Idx < 2 && Region->getNumPredecessors() == 1 &&
793 "loop region has a single predecessor (preheader), its entry block "
794 "has 2 incoming blocks");
795
796 // Idx == 0 selects the predecessor of the region, Idx == 1 selects the
797 // region itself whose exiting block feeds the phi across the backedge.
798 Pred = Idx == 0 ? Region->getSinglePredecessor() : Region;
799 }
800 return Pred->getExitingBasicBlock();
801}
802
803InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
804 if (!isReplicator()) {
805 InstructionCost Cost = 0;
806 for (VPBlockBase *Block : vp_depth_first_shallow(G: getEntry()))
807 Cost += Block->cost(VF, Ctx);
808 InstructionCost BackedgeCost =
809 ForceTargetInstructionCost.getNumOccurrences()
810 ? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
811 : Ctx.TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind: Ctx.CostKind);
812 LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
813 << ": vector loop backedge\n");
814 Cost += BackedgeCost;
815 return Cost;
816 }
817
818 // Compute the cost of a replicate region. Replicating isn't supported for
819 // scalable vectors, return an invalid cost for them.
820 // TODO: Discard scalable VPlans with replicate recipes earlier after
821 // construction.
822 if (VF.isScalable())
823 return InstructionCost::getInvalid();
824
825 // Compute and return the cost of the conditionally executed recipes.
826 assert(VF.isVector() && "Can only compute vector cost at the moment.");
827 VPBasicBlock *Then = cast<VPBasicBlock>(Val: getEntry()->getSuccessors()[0]);
828 return Then->cost(VF, Ctx);
829}
830
831#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
832void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
833 VPSlotTracker &SlotTracker) const {
834 O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
835 auto NewIndent = Indent + " ";
836 for (auto *BlockBase : vp_depth_first_shallow(Entry)) {
837 O << '\n';
838 BlockBase->print(O, NewIndent, SlotTracker);
839 }
840 O << Indent << "}\n";
841
842 printSuccessors(O, Indent);
843}
844#endif
845
846void VPRegionBlock::dissolveToCFGLoop() {
847 auto *Header = cast<VPBasicBlock>(Val: getEntry());
848 if (auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(Val: &Header->front())) {
849 assert(this == getPlan()->getVectorLoopRegion() &&
850 "Canonical IV must be in the entry of the top-level loop region");
851 auto *ScalarR = VPBuilder(CanIV).createScalarPhi(
852 IncomingValues: {CanIV->getStartValue(), CanIV->getBackedgeValue()},
853 DL: CanIV->getDebugLoc(), Name: "index");
854 CanIV->replaceAllUsesWith(New: ScalarR);
855 CanIV->eraseFromParent();
856 }
857
858 VPBlockBase *Preheader = getSinglePredecessor();
859 auto *ExitingLatch = cast<VPBasicBlock>(Val: getExiting());
860
861 VPBlockUtils::disconnectBlocks(From: Preheader, To: this);
862
863 for (VPBlockBase *VPB : vp_depth_first_shallow(G: Entry))
864 VPB->setParent(getParent());
865
866 VPBlockUtils::connectBlocks(From: Preheader, To: Header);
867 VPBlockUtils::transferSuccessors(Old: this, New: ExitingLatch);
868 VPBlockUtils::connectBlocks(From: ExitingLatch, To: Header);
869}
870
871VPlan::VPlan(Loop *L) {
872 setEntry(createVPIRBasicBlock(IRBB: L->getLoopPreheader()));
873 ScalarHeader = createVPIRBasicBlock(IRBB: L->getHeader());
874
875 SmallVector<BasicBlock *> IRExitBlocks;
876 L->getUniqueExitBlocks(ExitBlocks&: IRExitBlocks);
877 for (BasicBlock *EB : IRExitBlocks)
878 ExitBlocks.push_back(Elt: createVPIRBasicBlock(IRBB: EB));
879}
880
881VPlan::~VPlan() {
882 VPSymbolicValue DummyValue;
883
884 for (auto *VPB : CreatedBlocks) {
885 if (auto *VPBB = dyn_cast<VPBasicBlock>(Val: VPB)) {
886 // Replace all operands of recipes and all VPValues defined in VPBB with
887 // DummyValue so the block can be deleted.
888 for (VPRecipeBase &R : *VPBB) {
889 for (auto *Def : R.definedValues())
890 Def->replaceAllUsesWith(New: &DummyValue);
891
892 for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
893 R.setOperand(I, New: &DummyValue);
894 }
895 }
896 delete VPB;
897 }
898 for (VPValue *VPV : getLiveIns())
899 delete VPV;
900 delete BackedgeTakenCount;
901}
902
903VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const {
904 auto Iter = find_if(Range: getExitBlocks(), P: [IRBB](const VPIRBasicBlock *VPIRBB) {
905 return VPIRBB->getIRBasicBlock() == IRBB;
906 });
907 assert(Iter != getExitBlocks().end() && "no exit block found");
908 return *Iter;
909}
910
911bool VPlan::isExitBlock(VPBlockBase *VPBB) {
912 return is_contained(Range&: ExitBlocks, Element: VPBB);
913}
914
915/// To make RUN_VPLAN_PASS print final VPlan.
916static void printFinalVPlan(VPlan &) {}
917
918/// Generate the code inside the preheader and body of the vectorized loop.
919/// Assumes a single pre-header basic-block was created for this. Introduce
920/// additional basic-blocks as needed, and fill them all.
921void VPlan::execute(VPTransformState *State) {
922 // Initialize CFG state.
923 State->CFG.PrevVPBB = nullptr;
924 State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
925
926 // Update VPDominatorTree since VPBasicBlock may be removed after State was
927 // constructed.
928 State->VPDT.recalculate(Func&: *this);
929
930 // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
931 BasicBlock *VectorPreHeader = State->CFG.PrevBB;
932 cast<BranchInst>(Val: VectorPreHeader->getTerminator())->setSuccessor(idx: 0, NewSucc: nullptr);
933 State->CFG.DTU.applyUpdates(
934 Updates: {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
935
936 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
937 << ", UF=" << getUF() << '\n');
938 setName("Final VPlan");
939 // TODO: RUN_VPLAN_PASS/VPlanTransforms::runPass should automatically dump
940 // VPlans after some specific stages when "-debug" is specified, but that
941 // hasn't been implemented yet. For now, just do both:
942 LLVM_DEBUG(dump());
943 RUN_VPLAN_PASS(printFinalVPlan, *this);
944
945 BasicBlock *ScalarPh = State->CFG.ExitBB;
946 VPBasicBlock *ScalarPhVPBB = getScalarPreheader();
947 if (ScalarPhVPBB->hasPredecessors()) {
948 // Disconnect scalar preheader and scalar header, as the dominator tree edge
949 // will be updated as part of VPlan execution. This allows keeping the DTU
950 // logic generic during VPlan execution.
951 State->CFG.DTU.applyUpdates(
952 Updates: {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
953 }
954 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
955 Entry);
956 // Generate code for the VPlan, in parts of the vector skeleton, loop body and
957 // successor blocks including the middle, exit and scalar preheader blocks.
958 for (VPBlockBase *Block : RPOT)
959 Block->execute(State);
960
961 // If the original loop is unreachable, delete it and all its blocks.
962 if (!ScalarPhVPBB->hasPredecessors()) {
963 // DeleteDeadBlocks will remove single-entry phis. Remove them from the exit
964 // VPIRBBs in VPlan as well, otherwise we would retain references to deleted
965 // IR instructions.
966 for (VPIRBasicBlock *EB : getExitBlocks()) {
967 for (VPRecipeBase &R : make_early_inc_range(Range: EB->phis())) {
968 if (R.getNumOperands() == 1)
969 R.eraseFromParent();
970 }
971 }
972
973 Loop *OrigLoop =
974 State->LI->getLoopFor(BB: getScalarHeader()->getIRBasicBlock());
975 auto Blocks = OrigLoop->getBlocksVector();
976 Blocks.push_back(x: cast<VPIRBasicBlock>(Val: ScalarPhVPBB)->getIRBasicBlock());
977 for (auto *BB : Blocks)
978 State->LI->removeBlock(BB);
979 DeleteDeadBlocks(BBs: Blocks, DTU: &State->CFG.DTU);
980 State->LI->erase(L: OrigLoop);
981 }
982
983 State->CFG.DTU.flush();
984
985 VPBasicBlock *Header = vputils::getFirstLoopHeader(Plan&: *this, VPDT&: State->VPDT);
986 if (!Header)
987 return;
988
989 auto *LatchVPBB = cast<VPBasicBlock>(Val: Header->getPredecessors()[1]);
990 BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
991
992 // Fix the latch value of canonical, reduction and first-order recurrences
993 // phis in the vector loop.
994 for (VPRecipeBase &R : Header->phis()) {
995 // Skip phi-like recipes that generate their backedege values themselves.
996 if (isa<VPWidenPHIRecipe>(Val: &R))
997 continue;
998
999 auto *PhiR = cast<VPSingleDefRecipe>(Val: &R);
1000 // VPInstructions currently model scalar Phis only.
1001 bool NeedsScalar = isa<VPInstruction>(Val: PhiR) ||
1002 (isa<VPReductionPHIRecipe>(Val: PhiR) &&
1003 cast<VPReductionPHIRecipe>(Val: PhiR)->isInLoop());
1004
1005 Value *Phi = State->get(Def: PhiR, NeedsScalar);
1006 // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does
1007 // not.
1008 Value *Val = State->get(Def: PhiR->getOperand(N: 1), NeedsScalar);
1009 cast<PHINode>(Val: Phi)->addIncoming(V: Val, BB: VectorLatchBB);
1010 }
1011}
1012
1013InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
1014 // For now only return the cost of the vector loop region, ignoring any other
1015 // blocks, like the preheader or middle blocks, expect for checking them for
1016 // recipes with invalid costs.
1017 InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
1018
1019 // If the cost of the loop region is invalid or any recipe in the skeleton
1020 // outside loop regions are invalid return an invalid cost.
1021 if (!Cost.isValid() || any_of(Range: VPBlockUtils::blocksOnly<VPBasicBlock>(
1022 Range: vp_depth_first_shallow(G: getEntry())),
1023 P: [&VF, &Ctx](VPBasicBlock *VPBB) {
1024 return !VPBB->cost(VF, Ctx).isValid();
1025 }))
1026 return InstructionCost::getInvalid();
1027
1028 return Cost;
1029}
1030
1031VPRegionBlock *VPlan::getVectorLoopRegion() {
1032 // TODO: Cache if possible.
1033 for (VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1034 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1035 return R->isReplicator() ? nullptr : R;
1036 return nullptr;
1037}
1038
1039const VPRegionBlock *VPlan::getVectorLoopRegion() const {
1040 for (const VPBlockBase *B : vp_depth_first_shallow(G: getEntry()))
1041 if (auto *R = dyn_cast<VPRegionBlock>(Val: B))
1042 return R->isReplicator() ? nullptr : R;
1043 return nullptr;
1044}
1045
1046#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1047void VPlan::printLiveIns(raw_ostream &O) const {
1048 VPSlotTracker SlotTracker(this);
1049
1050 if (VF.getNumUsers() > 0) {
1051 O << "\nLive-in ";
1052 VF.printAsOperand(O, SlotTracker);
1053 O << " = VF";
1054 }
1055
1056 if (VFxUF.getNumUsers() > 0) {
1057 O << "\nLive-in ";
1058 VFxUF.printAsOperand(O, SlotTracker);
1059 O << " = VF * UF";
1060 }
1061
1062 if (VectorTripCount.getNumUsers() > 0) {
1063 O << "\nLive-in ";
1064 VectorTripCount.printAsOperand(O, SlotTracker);
1065 O << " = vector-trip-count";
1066 }
1067
1068 if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
1069 O << "\nLive-in ";
1070 BackedgeTakenCount->printAsOperand(O, SlotTracker);
1071 O << " = backedge-taken count";
1072 }
1073
1074 O << "\n";
1075 if (TripCount) {
1076 if (isa<VPIRValue>(TripCount))
1077 O << "Live-in ";
1078 TripCount->printAsOperand(O, SlotTracker);
1079 O << " = original trip-count";
1080 O << "\n";
1081 }
1082}
1083
1084LLVM_DUMP_METHOD
1085void VPlan::print(raw_ostream &O) const {
1086 VPSlotTracker SlotTracker(this);
1087
1088 O << "VPlan '" << getName() << "' {";
1089
1090 printLiveIns(O);
1091
1092 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<const VPBlockBase *>>
1093 RPOT(getEntry());
1094 for (const VPBlockBase *Block : RPOT) {
1095 O << '\n';
1096 Block->print(O, "", SlotTracker);
1097 }
1098
1099 O << "}\n";
1100}
1101
1102std::string VPlan::getName() const {
1103 std::string Out;
1104 raw_string_ostream RSO(Out);
1105 RSO << Name << " for ";
1106 if (!VFs.empty()) {
1107 RSO << "VF={" << VFs[0];
1108 for (ElementCount VF : drop_begin(VFs))
1109 RSO << "," << VF;
1110 RSO << "},";
1111 }
1112
1113 if (UFs.empty()) {
1114 RSO << "UF>=1";
1115 } else {
1116 RSO << "UF={" << UFs[0];
1117 for (unsigned UF : drop_begin(UFs))
1118 RSO << "," << UF;
1119 RSO << "}";
1120 }
1121
1122 return Out;
1123}
1124
1125LLVM_DUMP_METHOD
1126void VPlan::printDOT(raw_ostream &O) const {
1127 VPlanPrinter Printer(O, *this);
1128 Printer.dump();
1129}
1130
1131LLVM_DUMP_METHOD
1132void VPlan::dump() const { print(dbgs()); }
1133#endif
1134
1135static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
1136 DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
1137 // Update the operands of all cloned recipes starting at NewEntry. This
1138 // traverses all reachable blocks. This is done in two steps, to handle cycles
1139 // in PHI recipes.
1140 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
1141 OldDeepRPOT(Entry);
1142 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
1143 NewDeepRPOT(NewEntry);
1144 // First, collect all mappings from old to new VPValues defined by cloned
1145 // recipes.
1146 for (const auto &[OldBB, NewBB] :
1147 zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: OldDeepRPOT),
1148 u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: NewDeepRPOT))) {
1149 assert(OldBB->getRecipeList().size() == NewBB->getRecipeList().size() &&
1150 "blocks must have the same number of recipes");
1151 for (const auto &[OldR, NewR] : zip(t&: *OldBB, u&: *NewBB)) {
1152 assert(OldR.getNumOperands() == NewR.getNumOperands() &&
1153 "recipes must have the same number of operands");
1154 assert(OldR.getNumDefinedValues() == NewR.getNumDefinedValues() &&
1155 "recipes must define the same number of operands");
1156 for (const auto &[OldV, NewV] :
1157 zip(t: OldR.definedValues(), u: NewR.definedValues()))
1158 Old2NewVPValues[OldV] = NewV;
1159 }
1160 }
1161
1162 // Update all operands to use cloned VPValues.
1163 for (VPBasicBlock *NewBB :
1164 VPBlockUtils::blocksOnly<VPBasicBlock>(Range: NewDeepRPOT)) {
1165 for (VPRecipeBase &NewR : *NewBB)
1166 for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
1167 VPValue *NewOp = Old2NewVPValues.lookup(Val: NewR.getOperand(N: I));
1168 NewR.setOperand(I, New: NewOp);
1169 }
1170 }
1171}
1172
1173VPlan *VPlan::duplicate() {
1174 unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
1175 // Clone blocks.
1176 const auto &[NewEntry, __] = cloneFrom(Entry);
1177
1178 BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
1179 VPIRBasicBlock *NewScalarHeader = nullptr;
1180 if (getScalarHeader()->hasPredecessors()) {
1181 NewScalarHeader = cast<VPIRBasicBlock>(Val: *find_if(
1182 Range: vp_depth_first_shallow(G: NewEntry), P: [ScalarHeaderIRBB](VPBlockBase *VPB) {
1183 auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Val: VPB);
1184 return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
1185 }));
1186 } else {
1187 NewScalarHeader = createVPIRBasicBlock(IRBB: ScalarHeaderIRBB);
1188 }
1189 // Create VPlan, clone live-ins and remap operands in the cloned blocks.
1190 auto *NewPlan = new VPlan(cast<VPBasicBlock>(Val: NewEntry), NewScalarHeader);
1191 DenseMap<VPValue *, VPValue *> Old2NewVPValues;
1192 for (VPIRValue *OldLiveIn : getLiveIns())
1193 Old2NewVPValues[OldLiveIn] = NewPlan->getOrAddLiveIn(V: OldLiveIn);
1194 Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
1195 Old2NewVPValues[&VF] = &NewPlan->VF;
1196 Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
1197 if (BackedgeTakenCount) {
1198 NewPlan->BackedgeTakenCount = new VPSymbolicValue();
1199 Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
1200 }
1201 if (auto *TripCountIRV = dyn_cast_or_null<VPIRValue>(Val: TripCount))
1202 Old2NewVPValues[TripCountIRV] = NewPlan->getOrAddLiveIn(V: TripCountIRV);
1203 // else NewTripCount will be created and inserted into Old2NewVPValues when
1204 // TripCount is cloned. In any case NewPlan->TripCount is updated below.
1205
1206 remapOperands(Entry, NewEntry, Old2NewVPValues);
1207
1208 // Initialize remaining fields of cloned VPlan.
1209 NewPlan->VFs = VFs;
1210 NewPlan->UFs = UFs;
1211 // TODO: Adjust names.
1212 NewPlan->Name = Name;
1213 if (TripCount) {
1214 assert(Old2NewVPValues.contains(TripCount) &&
1215 "TripCount must have been added to Old2NewVPValues");
1216 NewPlan->TripCount = Old2NewVPValues[TripCount];
1217 }
1218
1219 // Transfer all cloned blocks (the second half of all current blocks) from
1220 // current to new VPlan.
1221 unsigned NumBlocksAfterCloning = CreatedBlocks.size();
1222 for (unsigned I :
1223 seq<unsigned>(Begin: NumBlocksBeforeCloning, End: NumBlocksAfterCloning))
1224 NewPlan->CreatedBlocks.push_back(Elt: this->CreatedBlocks[I]);
1225 CreatedBlocks.truncate(N: NumBlocksBeforeCloning);
1226
1227 // Update ExitBlocks of the new plan.
1228 for (VPBlockBase *VPB : NewPlan->CreatedBlocks) {
1229 if (VPB->getNumSuccessors() == 0 && isa<VPIRBasicBlock>(Val: VPB) &&
1230 VPB != NewScalarHeader)
1231 NewPlan->ExitBlocks.push_back(Elt: cast<VPIRBasicBlock>(Val: VPB));
1232 }
1233
1234 return NewPlan;
1235}
1236
1237VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
1238 auto *VPIRBB = new VPIRBasicBlock(IRBB);
1239 CreatedBlocks.push_back(Elt: VPIRBB);
1240 return VPIRBB;
1241}
1242
1243VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
1244 auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
1245 for (Instruction &I :
1246 make_range(x: IRBB->begin(), y: IRBB->getTerminator()->getIterator()))
1247 VPIRBB->appendRecipe(Recipe: VPIRInstruction::create(I));
1248 return VPIRBB;
1249}
1250
1251#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1252
1253Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
1254 return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
1255 Twine(getOrCreateBID(Block));
1256}
1257
1258Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
1259 const std::string &Name = Block->getName();
1260 if (!Name.empty())
1261 return Name;
1262 return "VPB" + Twine(getOrCreateBID(Block));
1263}
1264
1265void VPlanPrinter::dump() {
1266 Depth = 1;
1267 bumpIndent(0);
1268 OS << "digraph VPlan {\n";
1269 OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
1270 if (!Plan.getName().empty())
1271 OS << "\\n" << DOT::EscapeString(Plan.getName());
1272
1273 {
1274 // Print live-ins.
1275 std::string Str;
1276 raw_string_ostream SS(Str);
1277 Plan.printLiveIns(SS);
1278 SmallVector<StringRef, 0> Lines;
1279 StringRef(Str).rtrim('\n').split(Lines, "\n");
1280 for (auto Line : Lines)
1281 OS << DOT::EscapeString(Line.str()) << "\\n";
1282 }
1283
1284 OS << "\"]\n";
1285 OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
1286 OS << "edge [fontname=Courier, fontsize=30]\n";
1287 OS << "compound=true\n";
1288
1289 for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
1290 dumpBlock(Block);
1291
1292 OS << "}\n";
1293}
1294
1295void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
1296 if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
1297 dumpBasicBlock(BasicBlock);
1298 else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1299 dumpRegion(Region);
1300 else
1301 llvm_unreachable("Unsupported kind of VPBlock.");
1302}
1303
1304void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
1305 bool Hidden, const Twine &Label) {
1306 // Due to "dot" we print an edge between two regions as an edge between the
1307 // exiting basic block and the entry basic of the respective regions.
1308 const VPBlockBase *Tail = From->getExitingBasicBlock();
1309 const VPBlockBase *Head = To->getEntryBasicBlock();
1310 OS << Indent << getUID(Tail) << " -> " << getUID(Head);
1311 OS << " [ label=\"" << Label << '\"';
1312 if (Tail != From)
1313 OS << " ltail=" << getUID(From);
1314 if (Head != To)
1315 OS << " lhead=" << getUID(To);
1316 if (Hidden)
1317 OS << "; splines=none";
1318 OS << "]\n";
1319}
1320
1321void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
1322 auto &Successors = Block->getSuccessors();
1323 if (Successors.size() == 1)
1324 drawEdge(Block, Successors.front(), false, "");
1325 else if (Successors.size() == 2) {
1326 drawEdge(Block, Successors.front(), false, "T");
1327 drawEdge(Block, Successors.back(), false, "F");
1328 } else {
1329 unsigned SuccessorNumber = 0;
1330 for (auto *Successor : Successors)
1331 drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
1332 }
1333}
1334
1335void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
1336 // Implement dot-formatted dump by performing plain-text dump into the
1337 // temporary storage followed by some post-processing.
1338 OS << Indent << getUID(BasicBlock) << " [label =\n";
1339 bumpIndent(1);
1340 std::string Str;
1341 raw_string_ostream SS(Str);
1342 // Use no indentation as we need to wrap the lines into quotes ourselves.
1343 BasicBlock->print(SS, "", SlotTracker);
1344
1345 // We need to process each line of the output separately, so split
1346 // single-string plain-text dump.
1347 SmallVector<StringRef, 0> Lines;
1348 StringRef(Str).rtrim('\n').split(Lines, "\n");
1349
1350 auto EmitLine = [&](StringRef Line, StringRef Suffix) {
1351 OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix;
1352 };
1353
1354 // Don't need the "+" after the last line.
1355 for (auto Line : make_range(Lines.begin(), Lines.end() - 1))
1356 EmitLine(Line, " +\n");
1357 EmitLine(Lines.back(), "\n");
1358
1359 bumpIndent(-1);
1360 OS << Indent << "]\n";
1361
1362 dumpEdges(BasicBlock);
1363}
1364
1365void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
1366 OS << Indent << "subgraph " << getUID(Region) << " {\n";
1367 bumpIndent(1);
1368 OS << Indent << "fontname=Courier\n"
1369 << Indent << "label=\""
1370 << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
1371 << DOT::EscapeString(Region->getName()) << "\"\n";
1372 // Dump the blocks of the region.
1373 assert(Region->getEntry() && "Region contains no inner blocks.");
1374 for (const VPBlockBase *Block : vp_depth_first_shallow(Region->getEntry()))
1375 dumpBlock(Block);
1376 bumpIndent(-1);
1377 OS << Indent << "}\n";
1378 dumpEdges(Region);
1379}
1380
1381#endif
1382
1383/// Returns true if there is a vector loop region and \p VPV is defined in a
1384/// loop region.
1385static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
1386 const VPRecipeBase *DefR = VPV->getDefiningRecipe();
1387 return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
1388 DefR->getParent()->getEnclosingLoopRegion());
1389}
1390
1391bool VPValue::isDefinedOutsideLoopRegions() const {
1392 return !isDefinedInsideLoopRegions(VPV: this);
1393}
1394void VPValue::replaceAllUsesWith(VPValue *New) {
1395 replaceUsesWithIf(New, ShouldReplace: [](VPUser &, unsigned) { return true; });
1396}
1397
1398void VPValue::replaceUsesWithIf(
1399 VPValue *New,
1400 llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) {
1401 // Note that this early exit is required for correctness; the implementation
1402 // below relies on the number of users for this VPValue to decrease, which
1403 // isn't the case if this == New.
1404 if (this == New)
1405 return;
1406
1407 for (unsigned J = 0; J < getNumUsers();) {
1408 VPUser *User = Users[J];
1409 bool RemovedUser = false;
1410 for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
1411 if (User->getOperand(N: I) != this || !ShouldReplace(*User, I))
1412 continue;
1413
1414 RemovedUser = true;
1415 User->setOperand(I, New);
1416 }
1417 // If a user got removed after updating the current user, the next user to
1418 // update will be moved to the current position, so we only need to
1419 // increment the index if the number of users did not change.
1420 if (!RemovedUser)
1421 J++;
1422 }
1423}
1424
1425void VPUser::replaceUsesOfWith(VPValue *From, VPValue *To) {
1426 for (unsigned Idx = 0; Idx != getNumOperands(); ++Idx) {
1427 if (getOperand(N: Idx) == From)
1428 setOperand(I: Idx, New: To);
1429 }
1430}
1431
1432#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1433void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
1434 OS << Tracker.getOrCreateName(this);
1435}
1436
1437void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
1438 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1439 Op->printAsOperand(O, SlotTracker);
1440 });
1441}
1442#endif
1443
1444void VPSlotTracker::assignName(const VPValue *V) {
1445 assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
1446 auto *UV = V->getUnderlyingValue();
1447 auto *VPI = dyn_cast_or_null<VPInstruction>(Val: V);
1448 if (!UV && !(VPI && !VPI->getName().empty())) {
1449 VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str();
1450 NextSlot++;
1451 return;
1452 }
1453
1454 // Use the name of the underlying Value, wrapped in "ir<>", and versioned by
1455 // appending ".Number" to the name if there are multiple uses.
1456 std::string Name;
1457 if (UV)
1458 Name = getName(V: UV);
1459 else
1460 Name = VPI->getName();
1461
1462 assert(!Name.empty() && "Name cannot be empty.");
1463 StringRef Prefix = UV ? "ir<" : "vp<%";
1464 std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();
1465
1466 // First assign the base name for V.
1467 const auto &[A, _] = VPValue2Name.try_emplace(Key: V, Args&: BaseName);
1468 // Integer or FP constants with different types will result in the same string
1469 // due to stripping types.
1470 if (isa<VPIRValue>(Val: V) && isa<ConstantInt, ConstantFP>(Val: UV))
1471 return;
1472
1473 // If it is already used by C > 0 other VPValues, increase the version counter
1474 // C and use it for V.
1475 const auto &[C, UseInserted] = BaseName2Version.try_emplace(Key: BaseName, Args: 0);
1476 if (!UseInserted) {
1477 C->second++;
1478 A->second = (BaseName + Twine(".") + Twine(C->second)).str();
1479 }
1480}
1481
1482void VPSlotTracker::assignNames(const VPlan &Plan) {
1483 if (Plan.VF.getNumUsers() > 0)
1484 assignName(V: &Plan.VF);
1485 if (Plan.VFxUF.getNumUsers() > 0)
1486 assignName(V: &Plan.VFxUF);
1487 assignName(V: &Plan.VectorTripCount);
1488 if (Plan.BackedgeTakenCount)
1489 assignName(V: Plan.BackedgeTakenCount);
1490 for (VPValue *LI : Plan.getLiveIns())
1491 assignName(V: LI);
1492
1493 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
1494 RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
1495 for (const VPBasicBlock *VPBB :
1496 VPBlockUtils::blocksOnly<const VPBasicBlock>(Range: RPOT))
1497 assignNames(VPBB);
1498}
1499
1500void VPSlotTracker::assignNames(const VPBasicBlock *VPBB) {
1501 for (const VPRecipeBase &Recipe : *VPBB)
1502 for (VPValue *Def : Recipe.definedValues())
1503 assignName(V: Def);
1504}
1505
1506std::string VPSlotTracker::getName(const Value *V) {
1507 std::string Name;
1508 raw_string_ostream S(Name);
1509 if (V->hasName() || !isa<Instruction>(Val: V)) {
1510 V->printAsOperand(O&: S, PrintType: false);
1511 return Name;
1512 }
1513
1514 if (!MST) {
1515 // Lazily create the ModuleSlotTracker when we first hit an unnamed
1516 // instruction.
1517 auto *I = cast<Instruction>(Val: V);
1518 // This check is required to support unit tests with incomplete IR.
1519 if (I->getParent()) {
1520 MST = std::make_unique<ModuleSlotTracker>(args: I->getModule());
1521 MST->incorporateFunction(F: *I->getFunction());
1522 } else {
1523 MST = std::make_unique<ModuleSlotTracker>(args: nullptr);
1524 }
1525 }
1526 V->printAsOperand(O&: S, PrintType: false, MST&: *MST);
1527 return Name;
1528}
1529
1530std::string VPSlotTracker::getOrCreateName(const VPValue *V) const {
1531 std::string Name = VPValue2Name.lookup(Val: V);
1532 if (!Name.empty())
1533 return Name;
1534
1535 // If no name was assigned, no VPlan was provided when creating the slot
1536 // tracker or it is not reachable from the provided VPlan. This can happen,
1537 // e.g. when trying to print a recipe that has not been inserted into a VPlan
1538 // in a debugger.
1539 // TODO: Update VPSlotTracker constructor to assign names to recipes &
1540 // VPValues not associated with a VPlan, instead of constructing names ad-hoc
1541 // here.
1542 const VPRecipeBase *DefR = V->getDefiningRecipe();
1543 (void)DefR;
1544 assert((!DefR || !DefR->getParent() || !DefR->getParent()->getPlan()) &&
1545 "VPValue defined by a recipe in a VPlan?");
1546
1547 // Use the underlying value's name, if there is one.
1548 if (auto *UV = V->getUnderlyingValue()) {
1549 std::string Name;
1550 raw_string_ostream S(Name);
1551 UV->printAsOperand(O&: S, PrintType: false);
1552 return (Twine("ir<") + Name + ">").str();
1553 }
1554
1555 return "<badref>";
1556}
1557
1558bool LoopVectorizationPlanner::getDecisionAndClampRange(
1559 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
1560 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
1561 bool PredicateAtRangeStart = Predicate(Range.Start);
1562
1563 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
1564 if (Predicate(TmpVF) != PredicateAtRangeStart) {
1565 Range.End = TmpVF;
1566 break;
1567 }
1568
1569 return PredicateAtRangeStart;
1570}
1571
1572/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
1573/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
1574/// of VF's starting at a given VF and extending it as much as possible. Each
1575/// vectorization decision can potentially shorten this sub-range during
1576/// buildVPlan().
1577void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
1578 ElementCount MaxVF) {
1579 auto MaxVFTimes2 = MaxVF * 2;
1580 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
1581 VFRange SubRange = {VF, MaxVFTimes2};
1582 if (auto Plan = tryToBuildVPlan(Range&: SubRange)) {
1583 VPlanTransforms::optimize(Plan&: *Plan);
1584 // Update the name of the latch of the top-level vector loop region region
1585 // after optimizations which includes block folding.
1586 Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch");
1587 VPlans.push_back(Elt: std::move(Plan));
1588 }
1589 VF = SubRange.End;
1590 }
1591}
1592
1593VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
1594 assert(count_if(VPlans,
1595 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
1596 1 &&
1597 "Multiple VPlans for VF.");
1598
1599 for (const VPlanPtr &Plan : VPlans) {
1600 if (Plan->hasVF(VF))
1601 return *Plan.get();
1602 }
1603 llvm_unreachable("No plan found!");
1604}
1605
1606static void addRuntimeUnrollDisableMetaData(Loop *L) {
1607 SmallVector<Metadata *, 4> MDs;
1608 // Reserve first location for self reference to the LoopID metadata node.
1609 MDs.push_back(Elt: nullptr);
1610 bool IsUnrollMetadata = false;
1611 MDNode *LoopID = L->getLoopID();
1612 if (LoopID) {
1613 // First find existing loop unrolling disable metadata.
1614 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
1615 auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I));
1616 if (MD) {
1617 const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
1618 if (!S)
1619 continue;
1620 if (S->getString().starts_with(Prefix: "llvm.loop.unroll.runtime.disable"))
1621 continue;
1622 IsUnrollMetadata =
1623 S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
1624 }
1625 MDs.push_back(Elt: LoopID->getOperand(I));
1626 }
1627 }
1628
1629 if (!IsUnrollMetadata) {
1630 // Add runtime unroll disable metadata.
1631 LLVMContext &Context = L->getHeader()->getContext();
1632 SmallVector<Metadata *, 1> DisableOperands;
1633 DisableOperands.push_back(
1634 Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
1635 MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
1636 MDs.push_back(Elt: DisableNode);
1637 MDNode *NewLoopID = MDNode::get(Context, MDs);
1638 // Set operand 0 to refer to the loop id itself.
1639 NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
1640 L->setLoopID(NewLoopID);
1641 }
1642}
1643
1644void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo(
1645 Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan,
1646 bool VectorizingEpilogue, MDNode *OrigLoopID,
1647 std::optional<unsigned> OrigAverageTripCount,
1648 unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF,
1649 bool DisableRuntimeUnroll) {
1650 // Update the metadata of the scalar loop. Skip the update when vectorizing
1651 // the epilogue loop to ensure it is updated only once. Also skip the update
1652 // when the scalar loop became unreachable.
1653 if (Plan.getScalarPreheader()->hasPredecessors() && !VectorizingEpilogue) {
1654 std::optional<MDNode *> RemainderLoopID =
1655 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
1656 LLVMLoopVectorizeFollowupEpilogue});
1657 if (RemainderLoopID) {
1658 OrigLoop->setLoopID(*RemainderLoopID);
1659 } else {
1660 if (DisableRuntimeUnroll)
1661 addRuntimeUnrollDisableMetaData(L: OrigLoop);
1662
1663 LoopVectorizeHints Hints(OrigLoop, true, *ORE);
1664 Hints.setAlreadyVectorized();
1665 }
1666 }
1667
1668 if (!VectorLoop)
1669 return;
1670
1671 if (std::optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(
1672 OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
1673 LLVMLoopVectorizeFollowupVectorized})) {
1674 VectorLoop->setLoopID(*VectorizedLoopID);
1675 } else {
1676 // Keep all loop hints from the original loop on the vector loop (we'll
1677 // replace the vectorizer-specific hints below).
1678 if (OrigLoopID)
1679 VectorLoop->setLoopID(OrigLoopID);
1680
1681 if (!VectorizingEpilogue) {
1682 LoopVectorizeHints Hints(VectorLoop, true, *ORE);
1683 Hints.setAlreadyVectorized();
1684 }
1685 }
1686 TargetTransformInfo::UnrollingPreferences UP;
1687 TTI.getUnrollingPreferences(L: VectorLoop, *PSE.getSE(), UP, ORE);
1688 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
1689 addRuntimeUnrollDisableMetaData(L: VectorLoop);
1690
1691 // Set/update profile weights for the vector and remainder loops as original
1692 // loop iterations are now distributed among them. Note that original loop
1693 // becomes the scalar remainder loop after vectorization.
1694 //
1695 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
1696 // end up getting slightly roughened result but that should be OK since
1697 // profile is not inherently precise anyway. Note also possible bypass of
1698 // vector code caused by legality checks is ignored, assigning all the weight
1699 // to the vector loop, optimistically.
1700 //
1701 // For scalable vectorization we can't know at compile time how many
1702 // iterations of the loop are handled in one vector iteration, so instead
1703 // use the value of vscale used for tuning.
1704 unsigned AverageVectorTripCount = 0;
1705 unsigned RemainderAverageTripCount = 0;
1706 auto EC = VectorLoop->getLoopPreheader()->getParent()->getEntryCount();
1707 auto IsProfiled = EC && EC->getCount();
1708 if (!OrigAverageTripCount) {
1709 if (!IsProfiled)
1710 return;
1711 auto &SE = *PSE.getSE();
1712 AverageVectorTripCount = SE.getSmallConstantTripCount(L: VectorLoop);
1713 if (ProfcheckDisableMetadataFixes || !AverageVectorTripCount)
1714 return;
1715 if (Plan.getScalarPreheader()->hasPredecessors())
1716 RemainderAverageTripCount =
1717 SE.getSmallConstantTripCount(L: OrigLoop) % EstimatedVFxUF;
1718 // Setting to 1 should be sufficient to generate the correct branch weights.
1719 OrigLoopInvocationWeight = 1;
1720 } else {
1721 // Calculate number of iterations in unrolled loop.
1722 AverageVectorTripCount = *OrigAverageTripCount / EstimatedVFxUF;
1723 // Calculate number of iterations for remainder loop.
1724 RemainderAverageTripCount = *OrigAverageTripCount % EstimatedVFxUF;
1725 }
1726 if (HeaderVPBB) {
1727 setLoopEstimatedTripCount(L: VectorLoop, EstimatedTripCount: AverageVectorTripCount,
1728 EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
1729 }
1730
1731 if (Plan.getScalarPreheader()->hasPredecessors()) {
1732 setLoopEstimatedTripCount(L: OrigLoop, EstimatedTripCount: RemainderAverageTripCount,
1733 EstimatedLoopInvocationWeight: OrigLoopInvocationWeight);
1734 }
1735}
1736
1737#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1738void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
1739 if (VPlans.empty()) {
1740 O << "LV: No VPlans built.\n";
1741 return;
1742 }
1743 for (const auto &Plan : VPlans)
1744 if (PrintVPlansInDotFormat)
1745 Plan->printDOT(O);
1746 else
1747 Plan->print(O);
1748}
1749#endif
1750
1751bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
1752 TTI::PartialReductionExtendKind ExtKind) {
1753 APInt TruncatedVal = C->trunc(width: NarrowType->getScalarSizeInBits());
1754 unsigned WideSize = C->getBitWidth();
1755 APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
1756 ? TruncatedVal.sext(width: WideSize)
1757 : TruncatedVal.zext(width: WideSize);
1758 return ExtendedVal == *C;
1759}
1760
1761TargetTransformInfo::OperandValueInfo
1762VPCostContext::getOperandInfo(VPValue *V) const {
1763 if (auto *IRV = dyn_cast<VPIRValue>(Val: V))
1764 return TTI::getOperandInfo(V: IRV->getValue());
1765
1766 return {};
1767}
1768
1769InstructionCost VPCostContext::getScalarizationOverhead(
1770 Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
1771 TTI::VectorInstrContext VIC, bool AlwaysIncludeReplicatingR) {
1772 if (VF.isScalar())
1773 return 0;
1774
1775 assert(!VF.isScalable() &&
1776 "Scalarization overhead not supported for scalable vectors");
1777
1778 InstructionCost ScalarizationCost = 0;
1779 // Compute the cost of scalarizing the result if needed.
1780 if (!ResultTy->isVoidTy()) {
1781 for (Type *VectorTy :
1782 to_vector(Range: getContainedTypes(Ty: toVectorizedTy(Ty: ResultTy, EC: VF)))) {
1783 ScalarizationCost += TTI.getScalarizationOverhead(
1784 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
1785 /*Insert=*/true, /*Extract=*/false, CostKind,
1786 /*ForPoisonSrc=*/true, VL: {}, VIC);
1787 }
1788 }
1789 // Compute the cost of scalarizing the operands, skipping ones that do not
1790 // require extraction/scalarization and do not incur any overhead.
1791 SmallPtrSet<const VPValue *, 4> UniqueOperands;
1792 SmallVector<Type *> Tys;
1793 for (auto *Op : Operands) {
1794 if (isa<VPIRValue>(Val: Op) ||
1795 (!AlwaysIncludeReplicatingR &&
1796 isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Val: Op)) ||
1797 (isa<VPReplicateRecipe>(Val: Op) &&
1798 cast<VPReplicateRecipe>(Val: Op)->getOpcode() == Instruction::Load) ||
1799 !UniqueOperands.insert(Ptr: Op).second)
1800 continue;
1801 Tys.push_back(Elt: toVectorizedTy(Ty: Types.inferScalarType(V: Op), EC: VF));
1802 }
1803 return ScalarizationCost +
1804 TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC);
1805}
1806