1//===- VPlanHelpers.h - VPlan-related auxiliary helpers -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file contains the declarations of different VPlan-related auxiliary
11/// helpers.
12//
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
16#define LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
17
18#include "VPlanAnalysis.h"
19#include "VPlanDominatorTree.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallVector.h"
23#include "llvm/Analysis/DomTreeUpdater.h"
24#include "llvm/Analysis/TargetTransformInfo.h"
25#include "llvm/IR/DebugLoc.h"
26#include "llvm/IR/ModuleSlotTracker.h"
27#include "llvm/Support/InstructionCost.h"
28
29namespace llvm {
30
31class AssumptionCache;
32class BasicBlock;
33class DominatorTree;
34class InnerLoopVectorizer;
35class IRBuilderBase;
36class LoopInfo;
37class SCEV;
38class Type;
39class VPBasicBlock;
40class VPRegionBlock;
41class VPlan;
42class Value;
43
44/// Returns a calculation for the total number of elements for a given \p VF.
45/// For fixed width vectors this value is a constant, whereas for scalable
46/// vectors it is an expression determined at runtime.
47Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
48
49/// Compute the transformed value of Index at offset StartValue using step
50/// StepValue.
51/// For integer induction, returns StartValue + Index * StepValue.
52/// For pointer induction, returns StartValue[Index * StepValue].
53Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
54 Value *Step,
55 InductionDescriptor::InductionKind InductionKind,
56 const BinaryOperator *InductionBinOp);
57
58/// A range of powers-of-2 vectorization factors with fixed start and
59/// adjustable end. The range includes start and excludes end, e.g.,:
60/// [1, 16) = {1, 2, 4, 8}
61struct VFRange {
62 // A power of 2.
63 const ElementCount Start;
64
65 // A power of 2. If End <= Start range is empty.
66 ElementCount End;
67
68 bool isEmpty() const {
69 return End.getKnownMinValue() <= Start.getKnownMinValue();
70 }
71
72 VFRange(const ElementCount &Start, const ElementCount &End)
73 : Start(Start), End(End) {
74 assert(Start.isScalable() == End.isScalable() &&
75 "Both Start and End should have the same scalable flag");
76 assert(isPowerOf2_32(Start.getKnownMinValue()) &&
77 "Expected Start to be a power of 2");
78 assert(isPowerOf2_32(End.getKnownMinValue()) &&
79 "Expected End to be a power of 2");
80 }
81
82 /// Iterator to iterate over vectorization factors in a VFRange.
83 class iterator
84 : public iterator_facade_base<iterator, std::forward_iterator_tag,
85 ElementCount> {
86 ElementCount VF;
87
88 public:
89 iterator(ElementCount VF) : VF(VF) {}
90
91 bool operator==(const iterator &Other) const { return VF == Other.VF; }
92
93 ElementCount operator*() const { return VF; }
94
95 iterator &operator++() {
96 VF *= 2;
97 return *this;
98 }
99 };
100
101 iterator begin() { return iterator(Start); }
102 iterator end() {
103 assert(isPowerOf2_32(End.getKnownMinValue()));
104 return iterator(End);
105 }
106};
107
108/// In what follows, the term "input IR" refers to code that is fed into the
109/// vectorizer whereas the term "output IR" refers to code that is generated by
110/// the vectorizer.
111
112/// VPLane provides a way to access lanes in both fixed width and scalable
113/// vectors, where for the latter the lane index sometimes needs calculating
114/// as a runtime expression.
115class VPLane {
116public:
117 /// Kind describes how to interpret Lane.
118 enum class Kind : uint8_t {
119 /// For First, Lane is the index into the first N elements of a
120 /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
121 First,
122 /// For ScalableLast, Lane is the offset from the start of the last
123 /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
124 /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
125 /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
126 ScalableLast
127 };
128
129private:
130 /// in [0..VF)
131 unsigned Lane;
132
133 /// Indicates how the Lane should be interpreted, as described above.
134 Kind LaneKind = Kind::First;
135
136public:
137 VPLane(unsigned Lane) : Lane(Lane) {}
138 VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
139
140 static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
141
142 static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
143 assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
144 "trying to extract with invalid offset");
145 unsigned LaneOffset = VF.getKnownMinValue() - Offset;
146 Kind LaneKind;
147 if (VF.isScalable())
148 // In this case 'LaneOffset' refers to the offset from the start of the
149 // last subvector with VF.getKnownMinValue() elements.
150 LaneKind = VPLane::Kind::ScalableLast;
151 else
152 LaneKind = VPLane::Kind::First;
153 return VPLane(LaneOffset, LaneKind);
154 }
155
156 static VPLane getLastLaneForVF(const ElementCount &VF) {
157 return getLaneFromEnd(VF, Offset: 1);
158 }
159
160 /// Returns a compile-time known value for the lane index and asserts if the
161 /// lane can only be calculated at runtime.
162 unsigned getKnownLane() const {
163 assert(LaneKind == Kind::First &&
164 "can only get known lane from the beginning");
165 return Lane;
166 }
167
168 /// Returns an expression describing the lane index that can be used at
169 /// runtime.
170 Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
171
172 /// Returns the Kind of lane offset.
173 Kind getKind() const { return LaneKind; }
174
175 /// Returns true if this is the first lane of the whole vector.
176 bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
177
178 /// Maps the lane to a cache index based on \p VF.
179 unsigned mapToCacheIndex(const ElementCount &VF) const {
180 switch (LaneKind) {
181 case VPLane::Kind::ScalableLast:
182 assert(VF.isScalable() && Lane < VF.getKnownMinValue() &&
183 "ScalableLast can only be used with scalable VFs");
184 return VF.getKnownMinValue() + Lane;
185 default:
186 assert(Lane < VF.getKnownMinValue() &&
187 "Cannot extract lane larger than VF");
188 return Lane;
189 }
190 }
191};
192
193/// VPTransformState holds information passed down when "executing" a VPlan,
194/// needed for generating the output IR.
195struct VPTransformState {
196 VPTransformState(const TargetTransformInfo *TTI, ElementCount VF,
197 LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
198 IRBuilderBase &Builder, VPlan *Plan, Loop *CurrentParentLoop,
199 Type *CanonicalIVTy);
200 /// Target Transform Info.
201 const TargetTransformInfo *TTI;
202
203 /// The chosen Vectorization Factor of the loop being vectorized.
204 ElementCount VF;
205
206 /// Hold the index to generate specific scalar instructions. Null indicates
207 /// that all instances are to be generated, using either scalar or vector
208 /// instructions.
209 std::optional<VPLane> Lane;
210
211 struct DataState {
212 // Each value from the original loop, when vectorized, is represented by a
213 // vector value in the map.
214 DenseMap<const VPValue *, Value *> VPV2Vector;
215
216 DenseMap<const VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
217 } Data;
218
219 /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
220 /// is false, otherwise return the generated scalar. \See set.
221 Value *get(const VPValue *Def, bool IsScalar = false);
222
223 /// Get the generated Value for a given VPValue and given Part and Lane.
224 Value *get(const VPValue *Def, const VPLane &Lane);
225
226 bool hasVectorValue(const VPValue *Def) {
227 return Data.VPV2Vector.contains(Val: Def);
228 }
229
230 bool hasScalarValue(const VPValue *Def, VPLane Lane) {
231 auto I = Data.VPV2Scalars.find(Val: Def);
232 if (I == Data.VPV2Scalars.end())
233 return false;
234 unsigned CacheIdx = Lane.mapToCacheIndex(VF);
235 return CacheIdx < I->second.size() && I->second[CacheIdx];
236 }
237
238 /// Set the generated vector Value for a given VPValue, if \p
239 /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
240 void set(const VPValue *Def, Value *V, bool IsScalar = false) {
241 if (IsScalar) {
242 set(Def, V, Lane: VPLane(0));
243 return;
244 }
245 assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
246 "scalar values must be stored as (0, 0)");
247 Data.VPV2Vector[Def] = V;
248 }
249
250 /// Reset an existing vector value for \p Def and a given \p Part.
251 void reset(const VPValue *Def, Value *V) {
252 assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
253 Data.VPV2Vector[Def] = V;
254 }
255
256 /// Set the generated scalar \p V for \p Def and the given \p Lane.
257 void set(const VPValue *Def, Value *V, const VPLane &Lane) {
258 auto &Scalars = Data.VPV2Scalars[Def];
259 unsigned CacheIdx = Lane.mapToCacheIndex(VF);
260 if (Scalars.size() <= CacheIdx)
261 Scalars.resize(N: CacheIdx + 1);
262 assert(!Scalars[CacheIdx] && "should overwrite existing value");
263 Scalars[CacheIdx] = V;
264 }
265
266 /// Reset an existing scalar value for \p Def and a given \p Lane.
267 void reset(const VPValue *Def, Value *V, const VPLane &Lane) {
268 auto Iter = Data.VPV2Scalars.find(Val: Def);
269 assert(Iter != Data.VPV2Scalars.end() &&
270 "need to overwrite existing value");
271 unsigned CacheIdx = Lane.mapToCacheIndex(VF);
272 assert(CacheIdx < Iter->second.size() &&
273 "need to overwrite existing value");
274 Iter->second[CacheIdx] = V;
275 }
276
277 /// Set the debug location in the builder using the debug location \p DL.
278 void setDebugLocFrom(DebugLoc DL);
279
280 /// Insert the scalar value of \p Def at \p Lane into \p Lane of \p WideValue
281 /// and return the resulting value.
282 Value *packScalarIntoVectorizedValue(const VPValue *Def, Value *WideValue,
283 const VPLane &Lane);
284
285 /// Hold state information used when constructing the CFG of the output IR,
286 /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
287 struct CFGState {
288 /// The previous VPBasicBlock visited. Initially set to null.
289 VPBasicBlock *PrevVPBB = nullptr;
290
291 /// The previous IR BasicBlock created or used. Initially set to the new
292 /// header BasicBlock.
293 BasicBlock *PrevBB = nullptr;
294
295 /// The last IR BasicBlock in the output IR. Set to the exit block of the
296 /// vector loop.
297 BasicBlock *ExitBB = nullptr;
298
299 /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
300 /// of replication, maps the BasicBlock of the last replica created.
301 SmallDenseMap<const VPBasicBlock *, BasicBlock *> VPBB2IRBB;
302
303 /// Updater for the DominatorTree.
304 DomTreeUpdater DTU;
305
306 CFGState(DominatorTree *DT)
307 : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
308 } CFG;
309
310 /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
311 LoopInfo *LI;
312
313 /// Hold a pointer to AssumptionCache to register new assumptions after
314 /// replicating assume calls.
315 AssumptionCache *AC;
316
317 /// Hold a reference to the IRBuilder used to generate output IR code.
318 IRBuilderBase &Builder;
319
320 /// Pointer to the VPlan code is generated for.
321 VPlan *Plan;
322
323 /// The parent loop object for the current scope, or nullptr.
324 Loop *CurrentParentLoop = nullptr;
325
326 /// VPlan-based type analysis.
327 VPTypeAnalysis TypeAnalysis;
328
329 /// VPlan-based dominator tree.
330 VPDominatorTree VPDT;
331};
332
333/// Struct to hold various analysis needed for cost computations.
334struct VPCostContext {
335 const TargetTransformInfo &TTI;
336 const TargetLibraryInfo &TLI;
337 VPTypeAnalysis Types;
338 LLVMContext &LLVMCtx;
339 LoopVectorizationCostModel &CM;
340 SmallPtrSet<Instruction *, 8> SkipCostComputation;
341 TargetTransformInfo::TargetCostKind CostKind;
342 PredicatedScalarEvolution &PSE;
343 const Loop *L;
344
345 /// Number of predicated stores in the VPlan, computed on demand.
346 std::optional<unsigned> NumPredStores;
347
348 VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
349 const VPlan &Plan, LoopVectorizationCostModel &CM,
350 TargetTransformInfo::TargetCostKind CostKind,
351 PredicatedScalarEvolution &PSE, const Loop *L)
352 : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
353 CostKind(CostKind), PSE(PSE), L(L) {}
354
355 /// Return the cost for \p UI with \p VF using the legacy cost model as
356 /// fallback until computing the cost of all recipes migrates to VPlan.
357 InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
358
359 /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
360 /// has already been pre-computed.
361 bool skipCostComputation(Instruction *UI, bool IsVector) const;
362
363 /// \returns how much the cost of a predicated block should be divided by.
364 /// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor.
365 uint64_t getPredBlockCostDivisor(BasicBlock *BB) const;
366
367 /// Returns the OperandInfo for \p V, if it is a live-in.
368 TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
369
370 /// Return true if \p I is considered uniform-after-vectorization in the
371 /// legacy cost model for \p VF. Only used to check for additional VPlan
372 /// simplifications.
373 bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const;
374
375 /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
376 /// and \p Operands with \p VF. This is a convenience wrapper for the
377 /// type-based getScalarizationOverhead API. \p VIC provides context about
378 /// whether the scalarization is for a load/store operation. If \p
379 /// AlwaysIncludeReplicatingR is true, always compute the cost of scalarizing
380 /// replicating operands.
381 InstructionCost getScalarizationOverhead(
382 Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
383 TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None,
384 bool AlwaysIncludeReplicatingR = false);
385
386 /// Returns true if an artificially high cost for emulated masked memrefs
387 /// should be used.
388 bool useEmulatedMaskMemRefHack(const VPReplicateRecipe *R, ElementCount VF);
389};
390
391/// This class can be used to assign names to VPValues. For VPValues without
392/// underlying value, assign consecutive numbers and use those as names (wrapped
393/// in vp<>). Otherwise, use the name from the underlying value (wrapped in
394/// ir<>), appending a .V version number if there are multiple uses of the same
395/// name. Allows querying names for VPValues for printing, similar to the
396/// ModuleSlotTracker for IR values.
397class VPSlotTracker {
398 /// Keep track of versioned names assigned to VPValues with underlying IR
399 /// values.
400 DenseMap<const VPValue *, std::string> VPValue2Name;
401 /// Keep track of the next number to use to version the base name.
402 StringMap<unsigned> BaseName2Version;
403
404 /// Number to assign to the next VPValue without underlying value.
405 unsigned NextSlot = 0;
406
407 /// Lazily created ModuleSlotTracker, used only when unnamed IR instructions
408 /// require slot tracking.
409 std::unique_ptr<ModuleSlotTracker> MST;
410
411 /// Cached metadata kind names from the Module's LLVMContext.
412 SmallVector<StringRef> MDNames;
413
414 /// Cached Module pointer for printing metadata.
415 const Module *M = nullptr;
416
417 void assignName(const VPValue *V);
418 LLVM_ABI_FOR_TEST void assignNames(const VPlan &Plan);
419 void assignNames(const VPBasicBlock *VPBB);
420 std::string getName(const Value *V);
421
422public:
423 VPSlotTracker(const VPlan *Plan = nullptr) {
424 if (Plan) {
425 assignNames(Plan: *Plan);
426 if (auto *ScalarHeader = Plan->getScalarHeader())
427 M = ScalarHeader->getIRBasicBlock()->getModule();
428 }
429 }
430
431 /// Returns the name assigned to \p V, if there is one, otherwise try to
432 /// construct one from the underlying value, if there's one; else return
433 /// <badref>.
434 std::string getOrCreateName(const VPValue *V) const;
435
436 /// Returns the cached metadata kind names.
437 ArrayRef<StringRef> getMDNames() {
438 if (MDNames.empty() && M)
439 M->getContext().getMDKindNames(Result&: MDNames);
440 return MDNames;
441 }
442
443 /// Returns the cached Module pointer.
444 const Module *getModule() const { return M; }
445};
446
447#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
448/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
449/// indented and follows the dot format.
450class VPlanPrinter {
451 raw_ostream &OS;
452 const VPlan &Plan;
453 unsigned Depth = 0;
454 unsigned TabWidth = 2;
455 std::string Indent;
456 unsigned BID = 0;
457 SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
458
459 VPSlotTracker SlotTracker;
460
461 /// Handle indentation.
462 void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
463
464 /// Print a given \p Block of the Plan.
465 void dumpBlock(const VPBlockBase *Block);
466
467 /// Print the information related to the CFG edges going out of a given
468 /// \p Block, followed by printing the successor blocks themselves.
469 void dumpEdges(const VPBlockBase *Block);
470
471 /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
472 /// its successor blocks.
473 void dumpBasicBlock(const VPBasicBlock *BasicBlock);
474
475 /// Print a given \p Region of the Plan.
476 void dumpRegion(const VPRegionBlock *Region);
477
478 unsigned getOrCreateBID(const VPBlockBase *Block) {
479 return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
480 }
481
482 Twine getOrCreateName(const VPBlockBase *Block);
483
484 Twine getUID(const VPBlockBase *Block);
485
486 /// Print the information related to a CFG edge between two VPBlockBases.
487 void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
488 const Twine &Label);
489
490public:
491 VPlanPrinter(raw_ostream &O, const VPlan &P)
492 : OS(O), Plan(P), SlotTracker(&P) {}
493
494 LLVM_DUMP_METHOD void dump();
495};
496#endif
497
498/// Check if a constant \p CI can be safely treated as having been extended
499/// from a narrower type with the given extension kind.
500bool canConstantBeExtended(const APInt *C, Type *NarrowType,
501 TTI::PartialReductionExtendKind ExtKind);
502} // end namespace llvm
503
504#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
505