1//===- VPlanHelpers.h - VPlan-related auxiliary helpers -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file contains the declarations of different VPlan-related auxiliary
11/// helpers.
12//
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
16#define LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
17
18#include "VPlanAnalysis.h"
19#include "VPlanDominatorTree.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallVector.h"
23#include "llvm/Analysis/DomTreeUpdater.h"
24#include "llvm/Analysis/TargetTransformInfo.h"
25#include "llvm/IR/DebugLoc.h"
26#include "llvm/IR/ModuleSlotTracker.h"
27#include "llvm/Support/InstructionCost.h"
28
29namespace llvm {
30
31class AssumptionCache;
32class BasicBlock;
33class CallInst;
34class DominatorTree;
35class InnerLoopVectorizer;
36class IRBuilderBase;
37class LoopInfo;
38class SCEV;
39class Type;
40class VPBasicBlock;
41class VPRegionBlock;
42class VPlan;
43class Value;
44
45namespace Intrinsic {
46typedef unsigned ID;
47}
48
49/// Returns a calculation for the total number of elements for a given \p VF.
50/// For fixed width vectors this value is a constant, whereas for scalable
51/// vectors it is an expression determined at runtime.
52Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
53
54/// A range of powers-of-2 vectorization factors with fixed start and
55/// adjustable end. The range includes start and excludes end, e.g.,:
56/// [1, 16) = {1, 2, 4, 8}
57struct VFRange {
58 // A power of 2.
59 const ElementCount Start;
60
61 // A power of 2. If End <= Start range is empty.
62 ElementCount End;
63
64 bool isEmpty() const {
65 return End.getKnownMinValue() <= Start.getKnownMinValue();
66 }
67
68 VFRange(const ElementCount &Start, const ElementCount &End)
69 : Start(Start), End(End) {
70 assert(Start.isScalable() == End.isScalable() &&
71 "Both Start and End should have the same scalable flag");
72 assert(isPowerOf2_32(Start.getKnownMinValue()) &&
73 "Expected Start to be a power of 2");
74 assert(isPowerOf2_32(End.getKnownMinValue()) &&
75 "Expected End to be a power of 2");
76 }
77
78 /// Iterator to iterate over vectorization factors in a VFRange.
79 class iterator
80 : public iterator_facade_base<iterator, std::forward_iterator_tag,
81 ElementCount> {
82 ElementCount VF;
83
84 public:
85 iterator(ElementCount VF) : VF(VF) {}
86
87 bool operator==(const iterator &Other) const { return VF == Other.VF; }
88
89 ElementCount operator*() const { return VF; }
90
91 iterator &operator++() {
92 VF *= 2;
93 return *this;
94 }
95 };
96
97 iterator begin() { return iterator(Start); }
98 iterator end() {
99 assert(isPowerOf2_32(End.getKnownMinValue()));
100 return iterator(End);
101 }
102};
103
104/// In what follows, the term "input IR" refers to code that is fed into the
105/// vectorizer whereas the term "output IR" refers to code that is generated by
106/// the vectorizer.
107
108/// VPLane provides a way to access lanes in both fixed width and scalable
109/// vectors, where for the latter the lane index sometimes needs calculating
110/// as a runtime expression.
111class VPLane {
112public:
113 /// Kind describes how to interpret Lane.
114 enum class Kind : uint8_t {
115 /// For First, Lane is the index into the first N elements of a
116 /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
117 First,
118 /// For ScalableLast, Lane is the offset from the start of the last
119 /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
120 /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
121 /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
122 ScalableLast
123 };
124
125private:
126 /// in [0..VF)
127 unsigned Lane;
128
129 /// Indicates how the Lane should be interpreted, as described above.
130 Kind LaneKind = Kind::First;
131
132public:
133 VPLane(unsigned Lane) : Lane(Lane) {}
134 VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
135
136 static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
137
138 static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
139 assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
140 "trying to extract with invalid offset");
141 unsigned LaneOffset = VF.getKnownMinValue() - Offset;
142 Kind LaneKind;
143 if (VF.isScalable())
144 // In this case 'LaneOffset' refers to the offset from the start of the
145 // last subvector with VF.getKnownMinValue() elements.
146 LaneKind = VPLane::Kind::ScalableLast;
147 else
148 LaneKind = VPLane::Kind::First;
149 return VPLane(LaneOffset, LaneKind);
150 }
151
152 static VPLane getLastLaneForVF(const ElementCount &VF) {
153 return getLaneFromEnd(VF, Offset: 1);
154 }
155
156 /// Returns a compile-time known value for the lane index and asserts if the
157 /// lane can only be calculated at runtime.
158 unsigned getKnownLane() const {
159 assert(LaneKind == Kind::First &&
160 "can only get known lane from the beginning");
161 return Lane;
162 }
163
164 /// Returns an expression describing the lane index that can be used at
165 /// runtime.
166 Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
167
168 /// Returns the Kind of lane offset.
169 Kind getKind() const { return LaneKind; }
170
171 /// Returns true if this is the first lane of the whole vector.
172 bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
173
174 /// Maps the lane to a cache index based on \p VF.
175 unsigned mapToCacheIndex(const ElementCount &VF) const {
176 switch (LaneKind) {
177 case VPLane::Kind::ScalableLast:
178 assert(VF.isScalable() && Lane < VF.getKnownMinValue() &&
179 "ScalableLast can only be used with scalable VFs");
180 return VF.getKnownMinValue() + Lane;
181 default:
182 assert(Lane < VF.getKnownMinValue() &&
183 "Cannot extract lane larger than VF");
184 return Lane;
185 }
186 }
187};
188
189/// VPTransformState holds information passed down when "executing" a VPlan,
190/// needed for generating the output IR.
191struct VPTransformState {
192 VPTransformState(const TargetTransformInfo *TTI, ElementCount VF,
193 LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
194 IRBuilderBase &Builder, VPlan *Plan,
195 Loop *CurrentParentLoop);
196 /// Target Transform Info.
197 const TargetTransformInfo *TTI;
198
199 /// The chosen Vectorization Factor of the loop being vectorized.
200 ElementCount VF;
201
202 struct DataState {
203 // Each value from the original loop, when vectorized, is represented by a
204 // vector value in the map.
205 DenseMap<const VPValue *, Value *> VPV2Vector;
206
207 DenseMap<const VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
208 } Data;
209
210 /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
211 /// is false, otherwise return the generated scalar. \See set.
212 Value *get(const VPValue *Def, bool IsScalar = false);
213
214 /// Get the generated Value for a given VPValue and given Part and Lane.
215 Value *get(const VPValue *Def, const VPLane &Lane);
216
217 bool hasVectorValue(const VPValue *Def) {
218 return Data.VPV2Vector.contains(Val: Def);
219 }
220
221 bool hasScalarValue(const VPValue *Def, VPLane Lane) {
222 auto I = Data.VPV2Scalars.find(Val: Def);
223 if (I == Data.VPV2Scalars.end())
224 return false;
225 unsigned CacheIdx = Lane.mapToCacheIndex(VF);
226 return CacheIdx < I->second.size() && I->second[CacheIdx];
227 }
228
229 /// Set the generated vector Value for a given VPValue, if \p
230 /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
231 void set(const VPValue *Def, Value *V, bool IsScalar = false) {
232 if (IsScalar) {
233 set(Def, V, Lane: VPLane(0));
234 return;
235 }
236 assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
237 "scalar values must be stored as (0, 0)");
238 Data.VPV2Vector[Def] = V;
239 }
240
241 /// Reset an existing vector value for \p Def and a given \p Part.
242 void reset(const VPValue *Def, Value *V) {
243 assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
244 Data.VPV2Vector[Def] = V;
245 }
246
247 /// Set the generated scalar \p V for \p Def and the given \p Lane.
248 void set(const VPValue *Def, Value *V, const VPLane &Lane) {
249 auto &Scalars = Data.VPV2Scalars[Def];
250 unsigned CacheIdx = Lane.mapToCacheIndex(VF);
251 if (Scalars.size() <= CacheIdx)
252 Scalars.resize(N: CacheIdx + 1);
253 assert(!Scalars[CacheIdx] && "should overwrite existing value");
254 Scalars[CacheIdx] = V;
255 }
256
257 /// Reset an existing scalar value for \p Def and a given \p Lane.
258 void reset(const VPValue *Def, Value *V, const VPLane &Lane) {
259 auto Iter = Data.VPV2Scalars.find(Val: Def);
260 assert(Iter != Data.VPV2Scalars.end() &&
261 "need to overwrite existing value");
262 unsigned CacheIdx = Lane.mapToCacheIndex(VF);
263 assert(CacheIdx < Iter->second.size() &&
264 "need to overwrite existing value");
265 Iter->second[CacheIdx] = V;
266 }
267
268 /// Set the debug location in the builder using the debug location \p DL.
269 void setDebugLocFrom(DebugLoc DL);
270
271 /// Insert the scalar value of \p Def at \p Lane into \p Lane of \p WideValue
272 /// and return the resulting value.
273 Value *packScalarIntoVectorizedValue(const VPValue *Def, Value *WideValue,
274 const VPLane &Lane);
275
276 /// Hold state information used when constructing the CFG of the output IR,
277 /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
278 struct CFGState {
279 /// The previous VPBasicBlock visited. Initially set to null.
280 VPBasicBlock *PrevVPBB = nullptr;
281
282 /// The previous IR BasicBlock created or used. Initially set to the new
283 /// header BasicBlock.
284 BasicBlock *PrevBB = nullptr;
285
286 /// The last IR BasicBlock in the output IR. Set to the exit block of the
287 /// vector loop.
288 BasicBlock *ExitBB = nullptr;
289
290 /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
291 /// of replication, maps the BasicBlock of the last replica created.
292 SmallDenseMap<const VPBasicBlock *, BasicBlock *> VPBB2IRBB;
293
294 /// Updater for the DominatorTree.
295 DomTreeUpdater DTU;
296
297 CFGState(DominatorTree *DT)
298 : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
299 } CFG;
300
301 /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
302 LoopInfo *LI;
303
304 /// Hold a pointer to AssumptionCache to register new assumptions after
305 /// replicating assume calls.
306 AssumptionCache *AC;
307
308 /// Hold a reference to the IRBuilder used to generate output IR code.
309 IRBuilderBase &Builder;
310
311 /// Pointer to the VPlan code is generated for.
312 VPlan *Plan;
313
314 /// The parent loop object for the current scope, or nullptr.
315 Loop *CurrentParentLoop = nullptr;
316
317 /// VPlan-based dominator tree.
318 VPDominatorTree VPDT;
319};
320
321/// Struct to hold various analysis needed for cost computations.
322struct VPCostContext {
323 const TargetTransformInfo &TTI;
324 const TargetLibraryInfo &TLI;
325 LLVMContext &LLVMCtx;
326 LoopVectorizationCostModel &CM;
327 SmallPtrSet<Instruction *, 8> SkipCostComputation;
328 TargetTransformInfo::TargetCostKind CostKind;
329 PredicatedScalarEvolution &PSE;
330 const Loop *L;
331
332 /// Number of predicated stores in the VPlan, computed on demand.
333 std::optional<unsigned> NumPredStores;
334
335 VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
336 const VPlan &Plan, LoopVectorizationCostModel &CM,
337 TargetTransformInfo::TargetCostKind CostKind,
338 PredicatedScalarEvolution &PSE, const Loop *L)
339 : TTI(TTI), TLI(TLI), LLVMCtx(Plan.getContext()), CM(CM),
340 CostKind(CostKind), PSE(PSE), L(L) {}
341
342 /// Return the cost for \p UI with \p VF using the legacy cost model as
343 /// fallback until computing the cost of all recipes migrates to VPlan.
344 InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
345
346 /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
347 /// has already been pre-computed.
348 bool skipCostComputation(Instruction *UI, bool IsVector) const;
349
350 /// Mark the widening decision for \p I at \p VF as invalidated since a VPlan
351 /// transform replaced the original recipe.
352 void invalidateWideningDecision(Instruction *I, ElementCount VF);
353
354 /// \returns how much the cost of a predicated block should be divided by.
355 /// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor.
356 uint64_t getPredBlockCostDivisor(BasicBlock *BB) const;
357
358 /// Returns true if \p I is known to be scalarized at \p VF.
359 bool willBeScalarized(Instruction *I, ElementCount VF) const;
360
361 /// Forwards to LoopVectorizationCostModel::isMaskRequired.
362 bool isMaskRequired(Instruction *I) const;
363
364 /// Returns the OperandInfo for \p V, if it is a live-in.
365 TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
366
367 /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
368 /// and \p Operands with \p VF. This is a convenience wrapper for the
369 /// type-based getScalarizationOverhead API. \p VIC provides context about
370 /// whether the scalarization is for a load/store operation. If \p
371 /// AlwaysIncludeReplicatingR is true, always compute the cost of scalarizing
372 /// replicating operands.
373 InstructionCost getScalarizationOverhead(
374 Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
375 TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None,
376 bool AlwaysIncludeReplicatingR = false);
377
378 /// Returns true if an artificially high cost for emulated masked memrefs
379 /// should be used.
380 bool useEmulatedMaskMemRefHack(const VPReplicateRecipe *R, ElementCount VF);
381
382 /// Returns true if \p ID is a pseudo intrinsic that is dropped via
383 /// scalarization rather than widened.
384 static bool isFreeScalarIntrinsic(Intrinsic::ID ID);
385};
386
387/// This class can be used to assign names to VPValues. For VPValues without
388/// underlying value, assign consecutive numbers and use those as names (wrapped
389/// in vp<>). Otherwise, use the name from the underlying value (wrapped in
390/// ir<>), appending a .V version number if there are multiple uses of the same
391/// name. Allows querying names for VPValues for printing, similar to the
392/// ModuleSlotTracker for IR values.
393class VPSlotTracker {
394 /// Keep track of versioned names assigned to VPValues with underlying IR
395 /// values.
396 DenseMap<const VPValue *, std::string> VPValue2Name;
397 /// Keep track of the next number to use to version the base name.
398 StringMap<unsigned> BaseName2Version;
399
400 /// Number to assign to the next VPValue without underlying value.
401 unsigned NextSlot = 0;
402
403 /// Lazily created ModuleSlotTracker, used only when unnamed IR instructions
404 /// require slot tracking.
405 std::unique_ptr<ModuleSlotTracker> MST;
406
407 /// Cached metadata kind names from the Module's LLVMContext.
408 SmallVector<StringRef> MDNames;
409
410 /// Cached Module pointer for printing metadata.
411 const Module *M = nullptr;
412
413 void assignName(const VPValue *V);
414 LLVM_ABI_FOR_TEST void assignNames(const VPlan &Plan);
415 void assignNames(const VPBasicBlock *VPBB);
416 std::string getName(const Value *V);
417
418public:
419 VPSlotTracker(const VPlan *Plan = nullptr) {
420 if (Plan) {
421 assignNames(Plan: *Plan);
422 if (auto *ScalarHeader = Plan->getScalarHeader())
423 M = ScalarHeader->getIRBasicBlock()->getModule();
424 }
425 }
426
427 /// Returns the name assigned to \p V, if there is one, otherwise try to
428 /// construct one from the underlying value, if there's one; else return
429 /// <badref>.
430 std::string getOrCreateName(const VPValue *V) const;
431
432 /// Returns the cached metadata kind names.
433 ArrayRef<StringRef> getMDNames() {
434 if (MDNames.empty() && M)
435 M->getContext().getMDKindNames(Result&: MDNames);
436 return MDNames;
437 }
438
439 /// Returns the cached Module pointer.
440 const Module *getModule() const { return M; }
441};
442
443#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
444/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
445/// indented and follows the dot format.
446class VPlanPrinter {
447 raw_ostream &OS;
448 const VPlan &Plan;
449 unsigned Depth = 0;
450 unsigned TabWidth = 2;
451 std::string Indent;
452 unsigned BID = 0;
453 SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
454
455 VPSlotTracker SlotTracker;
456
457 /// Handle indentation.
458 void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
459
460 /// Print a given \p Block of the Plan.
461 void dumpBlock(const VPBlockBase *Block);
462
463 /// Print the information related to the CFG edges going out of a given
464 /// \p Block, followed by printing the successor blocks themselves.
465 void dumpEdges(const VPBlockBase *Block);
466
467 /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
468 /// its successor blocks.
469 void dumpBasicBlock(const VPBasicBlock *BasicBlock);
470
471 /// Print a given \p Region of the Plan.
472 void dumpRegion(const VPRegionBlock *Region);
473
474 unsigned getOrCreateBID(const VPBlockBase *Block) {
475 return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
476 }
477
478 Twine getUID(const VPBlockBase *Block);
479
480 /// Print the information related to a CFG edge between two VPBlockBases.
481 void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
482 const Twine &Label);
483
484public:
485 VPlanPrinter(raw_ostream &O, const VPlan &P)
486 : OS(O), Plan(P), SlotTracker(&P) {}
487
488 LLVM_DUMP_METHOD void dump();
489};
490#endif
491
492/// Check if a constant \p CI can be safely treated as having been extended
493/// from a narrower type with the given extension kind.
494bool canConstantBeExtended(const APInt *C, Type *NarrowType,
495 TTI::PartialReductionExtendKind ExtKind);
496} // end namespace llvm
497
498#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
499