1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
10 | // and generates target-independent LLVM-IR. |
11 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
12 | // of instructions in order to estimate the profitability of vectorization. |
13 | // |
14 | // The loop vectorizer combines consecutive loop iterations into a single |
15 | // 'wide' iteration. After this transformation the index is incremented |
16 | // by the SIMD vector width, and not by one. |
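// For example (illustrative C-like sketch, not taken from this file), with a
// vectorization factor of 4 a loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
// is rewritten so that each vector iteration computes a[i..i+3] from
// b[i..i+3] and increments i by 4; any remaining iterations run in a scalar
// epilogue loop.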
17 | // |
// This pass has four parts:
19 | // 1. The main loop pass that drives the different parts. |
20 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
21 | // of the vectorization. |
22 | // 3. InnerLoopVectorizer - A unit that performs the actual |
23 | // widening of instructions. |
24 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
25 | // of vectorization. It decides on the optimal vector width, which |
26 | // can be one, if vectorization is not profitable. |
27 | // |
// There is a development effort going on to migrate the loop vectorizer to the
29 | // VPlan infrastructure and to introduce outer loop vectorization support (see |
30 | // docs/VectorizationPlan.rst and |
31 | // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
32 | // purpose, we temporarily introduced the VPlan-native vectorization path: an |
33 | // alternative vectorization path that is natively implemented on top of the |
34 | // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
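// For example (usage sketch), the VPlan-native path can be exercised with:
//   opt -passes=loop-vectorize -enable-vplan-native-path input.ll -S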
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | // |
38 | // The reduction-variable vectorization is based on the paper: |
39 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
40 | // |
41 | // Variable uniformity checks are inspired by: |
42 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
43 | // |
44 | // The interleaved access vectorization is based on the paper: |
45 | // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
46 | // Data for SIMD |
47 | // |
48 | // Other ideas/concepts are from: |
49 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
50 | // |
51 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
52 | // Vectorizing Compilers. |
53 | // |
54 | //===----------------------------------------------------------------------===// |
55 | |
56 | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
57 | #include "LoopVectorizationPlanner.h" |
58 | #include "VPRecipeBuilder.h" |
59 | #include "VPlan.h" |
60 | #include "VPlanAnalysis.h" |
61 | #include "VPlanHCFGBuilder.h" |
62 | #include "VPlanPatternMatch.h" |
63 | #include "VPlanTransforms.h" |
64 | #include "VPlanVerifier.h" |
65 | #include "llvm/ADT/APInt.h" |
66 | #include "llvm/ADT/ArrayRef.h" |
67 | #include "llvm/ADT/DenseMap.h" |
68 | #include "llvm/ADT/DenseMapInfo.h" |
69 | #include "llvm/ADT/Hashing.h" |
70 | #include "llvm/ADT/MapVector.h" |
71 | #include "llvm/ADT/STLExtras.h" |
72 | #include "llvm/ADT/SmallPtrSet.h" |
73 | #include "llvm/ADT/SmallSet.h" |
74 | #include "llvm/ADT/SmallVector.h" |
75 | #include "llvm/ADT/Statistic.h" |
76 | #include "llvm/ADT/StringRef.h" |
77 | #include "llvm/ADT/Twine.h" |
78 | #include "llvm/ADT/iterator_range.h" |
79 | #include "llvm/Analysis/AssumptionCache.h" |
80 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
81 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
82 | #include "llvm/Analysis/CFG.h" |
83 | #include "llvm/Analysis/CodeMetrics.h" |
84 | #include "llvm/Analysis/DemandedBits.h" |
85 | #include "llvm/Analysis/GlobalsModRef.h" |
86 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
87 | #include "llvm/Analysis/LoopAnalysisManager.h" |
88 | #include "llvm/Analysis/LoopInfo.h" |
89 | #include "llvm/Analysis/LoopIterator.h" |
90 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
91 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
92 | #include "llvm/Analysis/ScalarEvolution.h" |
93 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
94 | #include "llvm/Analysis/TargetLibraryInfo.h" |
95 | #include "llvm/Analysis/TargetTransformInfo.h" |
96 | #include "llvm/Analysis/ValueTracking.h" |
97 | #include "llvm/Analysis/VectorUtils.h" |
98 | #include "llvm/IR/Attributes.h" |
99 | #include "llvm/IR/BasicBlock.h" |
100 | #include "llvm/IR/CFG.h" |
101 | #include "llvm/IR/Constant.h" |
102 | #include "llvm/IR/Constants.h" |
103 | #include "llvm/IR/DataLayout.h" |
104 | #include "llvm/IR/DebugInfo.h" |
105 | #include "llvm/IR/DebugInfoMetadata.h" |
106 | #include "llvm/IR/DebugLoc.h" |
107 | #include "llvm/IR/DerivedTypes.h" |
108 | #include "llvm/IR/DiagnosticInfo.h" |
109 | #include "llvm/IR/Dominators.h" |
110 | #include "llvm/IR/Function.h" |
111 | #include "llvm/IR/IRBuilder.h" |
112 | #include "llvm/IR/InstrTypes.h" |
113 | #include "llvm/IR/Instruction.h" |
114 | #include "llvm/IR/Instructions.h" |
115 | #include "llvm/IR/IntrinsicInst.h" |
116 | #include "llvm/IR/Intrinsics.h" |
117 | #include "llvm/IR/MDBuilder.h" |
118 | #include "llvm/IR/Metadata.h" |
119 | #include "llvm/IR/Module.h" |
120 | #include "llvm/IR/Operator.h" |
121 | #include "llvm/IR/PatternMatch.h" |
122 | #include "llvm/IR/ProfDataUtils.h" |
123 | #include "llvm/IR/Type.h" |
124 | #include "llvm/IR/Use.h" |
125 | #include "llvm/IR/User.h" |
126 | #include "llvm/IR/Value.h" |
127 | #include "llvm/IR/ValueHandle.h" |
128 | #include "llvm/IR/VectorBuilder.h" |
129 | #include "llvm/IR/Verifier.h" |
130 | #include "llvm/Support/Casting.h" |
131 | #include "llvm/Support/CommandLine.h" |
132 | #include "llvm/Support/Compiler.h" |
133 | #include "llvm/Support/Debug.h" |
134 | #include "llvm/Support/ErrorHandling.h" |
135 | #include "llvm/Support/InstructionCost.h" |
136 | #include "llvm/Support/MathExtras.h" |
137 | #include "llvm/Support/raw_ostream.h" |
138 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
139 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
140 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
141 | #include "llvm/Transforms/Utils/LoopUtils.h" |
142 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
143 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
144 | #include "llvm/Transforms/Utils/SizeOpts.h" |
145 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
146 | #include <algorithm> |
147 | #include <cassert> |
148 | #include <cmath> |
149 | #include <cstdint> |
150 | #include <functional> |
151 | #include <iterator> |
152 | #include <limits> |
153 | #include <map> |
154 | #include <memory> |
155 | #include <string> |
156 | #include <tuple> |
157 | #include <utility> |
158 | |
159 | using namespace llvm; |
160 | |
161 | #define LV_NAME "loop-vectorize" |
162 | #define DEBUG_TYPE LV_NAME |
163 | |
164 | #ifndef NDEBUG |
165 | const char VerboseDebug[] = DEBUG_TYPE "-verbose" ; |
166 | #endif |
167 | |
168 | /// @{ |
169 | /// Metadata attribute names |
170 | const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all" ; |
171 | const char LLVMLoopVectorizeFollowupVectorized[] = |
172 | "llvm.loop.vectorize.followup_vectorized" ; |
173 | const char LLVMLoopVectorizeFollowupEpilogue[] = |
174 | "llvm.loop.vectorize.followup_epilogue" ; |
175 | /// @} |
176 | |
177 | STATISTIC(LoopsVectorized, "Number of loops vectorized" ); |
178 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization" ); |
179 | STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized" ); |
180 | |
181 | static cl::opt<bool> EnableEpilogueVectorization( |
182 | "enable-epilogue-vectorization" , cl::init(Val: true), cl::Hidden, |
183 | cl::desc("Enable vectorization of epilogue loops." )); |
184 | |
185 | static cl::opt<unsigned> EpilogueVectorizationForceVF( |
186 | "epilogue-vectorization-force-VF" , cl::init(Val: 1), cl::Hidden, |
187 | cl::desc("When epilogue vectorization is enabled, and a value greater than " |
188 | "1 is specified, forces the given VF for all applicable epilogue " |
189 | "loops." )); |
190 | |
191 | static cl::opt<unsigned> EpilogueVectorizationMinVF( |
192 | "epilogue-vectorization-minimum-VF" , cl::init(Val: 16), cl::Hidden, |
193 | cl::desc("Only loops with vectorization factor equal to or larger than " |
194 | "the specified value are considered for epilogue vectorization." )); |
195 | |
196 | /// Loops with a known constant trip count below this number are vectorized only |
197 | /// if no scalar iteration overheads are incurred. |
198 | static cl::opt<unsigned> TinyTripCountVectorThreshold( |
199 | "vectorizer-min-trip-count" , cl::init(Val: 16), cl::Hidden, |
200 | cl::desc("Loops with a constant trip count that is smaller than this " |
201 | "value are vectorized only if no scalar iteration overheads " |
202 | "are incurred." )); |
203 | |
204 | static cl::opt<unsigned> VectorizeMemoryCheckThreshold( |
205 | "vectorize-memory-check-threshold" , cl::init(Val: 128), cl::Hidden, |
206 | cl::desc("The maximum allowed number of runtime memory checks" )); |
207 | |
208 | static cl::opt<bool> UseLegacyCostModel( |
209 | "vectorize-use-legacy-cost-model" , cl::init(Val: true), cl::Hidden, |
210 | cl::desc("Use the legacy cost model instead of the VPlan-based cost model. " |
211 | "This option will be removed in the future." )); |
212 | |
213 | // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, |
// that predication is preferred; the values below list the options. I.e., the
215 | // vectorizer will try to fold the tail-loop (epilogue) into the vector body |
216 | // and predicate the instructions accordingly. If tail-folding fails, there are |
217 | // different fallback strategies depending on these values: |
218 | namespace PreferPredicateTy { |
219 | enum Option { |
220 | ScalarEpilogue = 0, |
221 | PredicateElseScalarEpilogue, |
222 | PredicateOrDontVectorize |
223 | }; |
224 | } // namespace PreferPredicateTy |
225 | |
226 | static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( |
227 | "prefer-predicate-over-epilogue" , |
cl::init(PreferPredicateTy::ScalarEpilogue),
229 | cl::Hidden, |
230 | cl::desc("Tail-folding and predication preferences over creating a scalar " |
231 | "epilogue loop." ), |
232 | cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, |
233 | "scalar-epilogue" , |
234 | "Don't tail-predicate loops, create scalar epilogue" ), |
235 | clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, |
236 | "predicate-else-scalar-epilogue" , |
237 | "prefer tail-folding, create scalar epilogue if tail " |
238 | "folding fails." ), |
239 | clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, |
240 | "predicate-dont-vectorize" , |
241 | "prefers tail-folding, don't attempt vectorization if " |
242 | "tail-folding fails." ))); |
243 | |
244 | static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( |
245 | "force-tail-folding-style" , cl::desc("Force the tail folding style" ), |
cl::init(TailFoldingStyle::None),
247 | cl::values( |
248 | clEnumValN(TailFoldingStyle::None, "none" , "Disable tail folding" ), |
249 | clEnumValN( |
250 | TailFoldingStyle::Data, "data" , |
251 | "Create lane mask for data only, using active.lane.mask intrinsic" ), |
252 | clEnumValN(TailFoldingStyle::DataWithoutLaneMask, |
253 | "data-without-lane-mask" , |
254 | "Create lane mask with compare/stepvector" ), |
255 | clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control" , |
256 | "Create lane mask using active.lane.mask intrinsic, and use " |
257 | "it for both data and control flow" ), |
258 | clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, |
259 | "data-and-control-without-rt-check" , |
260 | "Similar to data-and-control, but remove the runtime check" ), |
261 | clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl" , |
262 | "Use predicated EVL instructions for tail folding. If EVL " |
263 | "is unsupported, fallback to data-without-lane-mask." ))); |
264 | |
265 | static cl::opt<bool> MaximizeBandwidth( |
266 | "vectorizer-maximize-bandwidth" , cl::init(Val: false), cl::Hidden, |
267 | cl::desc("Maximize bandwidth when selecting vectorization factor which " |
268 | "will be determined by the smallest type in loop." )); |
269 | |
270 | static cl::opt<bool> EnableInterleavedMemAccesses( |
271 | "enable-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
272 | cl::desc("Enable vectorization on interleaved memory accesses in a loop" )); |
273 | |
274 | /// An interleave-group may need masking if it resides in a block that needs |
275 | /// predication, or in order to mask away gaps. |
276 | static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
277 | "enable-masked-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
278 | cl::desc("Enable vectorization on masked interleaved memory accesses in a loop" )); |
279 | |
280 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
281 | "force-target-num-scalar-regs" , cl::init(Val: 0), cl::Hidden, |
282 | cl::desc("A flag that overrides the target's number of scalar registers." )); |
283 | |
284 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
285 | "force-target-num-vector-regs" , cl::init(Val: 0), cl::Hidden, |
286 | cl::desc("A flag that overrides the target's number of vector registers." )); |
287 | |
288 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
289 | "force-target-max-scalar-interleave" , cl::init(Val: 0), cl::Hidden, |
290 | cl::desc("A flag that overrides the target's max interleave factor for " |
291 | "scalar loops." )); |
292 | |
293 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
294 | "force-target-max-vector-interleave" , cl::init(Val: 0), cl::Hidden, |
295 | cl::desc("A flag that overrides the target's max interleave factor for " |
296 | "vectorized loops." )); |
297 | |
298 | cl::opt<unsigned> ForceTargetInstructionCost( |
299 | "force-target-instruction-cost" , cl::init(Val: 0), cl::Hidden, |
300 | cl::desc("A flag that overrides the target's expected cost for " |
301 | "an instruction to a single constant value. Mostly " |
302 | "useful for getting consistent testing." )); |
303 | |
304 | static cl::opt<bool> ForceTargetSupportsScalableVectors( |
305 | "force-target-supports-scalable-vectors" , cl::init(Val: false), cl::Hidden, |
306 | cl::desc( |
307 | "Pretend that scalable vectors are supported, even if the target does " |
308 | "not support them. This flag should only be used for testing." )); |
309 | |
310 | static cl::opt<unsigned> SmallLoopCost( |
311 | "small-loop-cost" , cl::init(Val: 20), cl::Hidden, |
312 | cl::desc( |
313 | "The cost of a loop that is considered 'small' by the interleaver." )); |
314 | |
315 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
316 | "loop-vectorize-with-block-frequency" , cl::init(Val: true), cl::Hidden, |
317 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
318 | "heuristics minimizing code growth in cold regions and being more " |
319 | "aggressive in hot regions." )); |
320 | |
321 | // Runtime interleave loops for load/store throughput. |
322 | static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
323 | "enable-loadstore-runtime-interleave" , cl::init(Val: true), cl::Hidden, |
324 | cl::desc( |
325 | "Enable runtime interleaving until load/store ports are saturated" )); |
326 | |
327 | /// The number of stores in a loop that are allowed to need predication. |
328 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
329 | "vectorize-num-stores-pred" , cl::init(Val: 1), cl::Hidden, |
330 | cl::desc("Max number of stores to be predicated behind an if." )); |
331 | |
332 | static cl::opt<bool> EnableIndVarRegisterHeur( |
333 | "enable-ind-var-reg-heur" , cl::init(Val: true), cl::Hidden, |
334 | cl::desc("Count the induction variable only once when interleaving" )); |
335 | |
336 | static cl::opt<bool> EnableCondStoresVectorization( |
337 | "enable-cond-stores-vec" , cl::init(Val: true), cl::Hidden, |
338 | cl::desc("Enable if predication of stores during vectorization." )); |
339 | |
340 | static cl::opt<unsigned> MaxNestedScalarReductionIC( |
341 | "max-nested-scalar-reduction-interleave" , cl::init(Val: 2), cl::Hidden, |
342 | cl::desc("The maximum interleave count to use when interleaving a scalar " |
343 | "reduction in a nested loop." )); |
344 | |
345 | static cl::opt<bool> |
346 | PreferInLoopReductions("prefer-inloop-reductions" , cl::init(Val: false), |
347 | cl::Hidden, |
348 | cl::desc("Prefer in-loop vector reductions, " |
349 | "overriding the targets preference." )); |
350 | |
351 | static cl::opt<bool> ForceOrderedReductions( |
352 | "force-ordered-reductions" , cl::init(Val: false), cl::Hidden, |
353 | cl::desc("Enable the vectorisation of loops with in-order (strict) " |
354 | "FP reductions" )); |
355 | |
356 | static cl::opt<bool> PreferPredicatedReductionSelect( |
357 | "prefer-predicated-reduction-select" , cl::init(Val: false), cl::Hidden, |
358 | cl::desc( |
359 | "Prefer predicating a reduction operation over an after loop select." )); |
360 | |
361 | namespace llvm { |
362 | cl::opt<bool> EnableVPlanNativePath( |
363 | "enable-vplan-native-path" , cl::Hidden, |
364 | cl::desc("Enable VPlan-native vectorization path with " |
365 | "support for outer loop vectorization." )); |
366 | } |
367 | |
368 | // This flag enables the stress testing of the VPlan H-CFG construction in the |
// VPlan-native vectorization path. It must be used in conjunction with
370 | // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
371 | // verification of the H-CFGs built. |
372 | static cl::opt<bool> VPlanBuildStressTest( |
373 | "vplan-build-stress-test" , cl::init(Val: false), cl::Hidden, |
374 | cl::desc( |
375 | "Build VPlan for every supported loop nest in the function and bail " |
376 | "out right after the build (stress test the VPlan H-CFG construction " |
377 | "in the VPlan-native vectorization path)." )); |
378 | |
379 | cl::opt<bool> llvm::EnableLoopInterleaving( |
380 | "interleave-loops" , cl::init(Val: true), cl::Hidden, |
381 | cl::desc("Enable loop interleaving in Loop vectorization passes" )); |
382 | cl::opt<bool> llvm::EnableLoopVectorization( |
383 | "vectorize-loops" , cl::init(Val: true), cl::Hidden, |
384 | cl::desc("Run the Loop vectorization passes" )); |
385 | |
386 | static cl::opt<bool> PrintVPlansInDotFormat( |
387 | "vplan-print-in-dot-format" , cl::Hidden, |
388 | cl::desc("Use dot format instead of plain text when dumping VPlans" )); |
389 | |
390 | static cl::opt<cl::boolOrDefault> ForceSafeDivisor( |
391 | "force-widen-divrem-via-safe-divisor" , cl::Hidden, |
392 | cl::desc( |
393 | "Override cost based safe divisor widening for div/rem instructions" )); |
394 | |
395 | static cl::opt<bool> UseWiderVFIfCallVariantsPresent( |
396 | "vectorizer-maximize-bandwidth-for-vector-calls" , cl::init(Val: true), |
397 | cl::Hidden, |
398 | cl::desc("Try wider VFs if they enable the use of vector variants" )); |
399 | |
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
401 | // variables not overflowing do not hold. See `emitSCEVChecks`. |
402 | static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; |
// Likelihood of bypassing the vectorized loop because pointers overlap. See
404 | // `emitMemRuntimeChecks`. |
405 | static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; |
// Likelihood of bypassing the vectorized loop because there are zero trips left
407 | // after prolog. See `emitIterationCountCheck`. |
408 | static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; |
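// With weights {1, 127}, the bypass edge is assumed to be taken with
// probability 1 / (1 + 127) = 1/128 (~0.8%), i.e. the generated checks are
// expected to pass, and the vector loop to be entered, almost always.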
409 | |
410 | /// A helper function that returns true if the given type is irregular. The |
411 | /// type is irregular if its allocated size doesn't equal the store size of an |
412 | /// element of the corresponding vector type. |
413 | static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
414 | // Determine if an array of N elements of type Ty is "bitcast compatible" |
415 | // with a <N x Ty> vector. |
416 | // This is only true if there is no padding between the array elements. |
417 | return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
418 | } |
419 | |
420 | /// Returns "best known" trip count for the specified loop \p L as defined by |
421 | /// the following procedure: |
422 | /// 1) Returns exact trip count if it is known. |
423 | /// 2) Returns expected trip count according to profile data if any. |
424 | /// 3) Returns upper bound estimate if it is known. |
425 | /// 4) Returns std::nullopt if all of the above failed. |
426 | static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, |
427 | Loop *L) { |
428 | // Check if exact trip count is known. |
429 | if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) |
430 | return ExpectedTC; |
431 | |
432 | // Check if there is an expected trip count available from profile data. |
433 | if (LoopVectorizeWithBlockFrequency) |
434 | if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
435 | return *EstimatedTC; |
436 | |
437 | // Check if upper bound estimate is known. |
438 | if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) |
439 | return ExpectedTC; |
440 | |
441 | return std::nullopt; |
442 | } |
443 | |
444 | namespace { |
445 | // Forward declare GeneratedRTChecks. |
446 | class GeneratedRTChecks; |
447 | |
448 | using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; |
449 | } // namespace |
450 | |
451 | namespace llvm { |
452 | |
AnalysisKey ShouldRunExtraVectorPasses::Key;
454 | |
455 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
456 | /// block to a specified vectorization factor (VF). |
457 | /// This class performs the widening of scalars into vectors, or multiple |
458 | /// scalars. This class also implements the following features: |
459 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
460 | /// counts that are known to be a multiple of the vectorization factor. |
461 | /// * It handles the code generation for reduction variables. |
462 | /// * Scalarization (implementation using scalars) of un-vectorizable |
463 | /// instructions. |
464 | /// InnerLoopVectorizer does not perform any vectorization-legality |
465 | /// checks, and relies on the caller to check for the different legality |
466 | /// aspects. The InnerLoopVectorizer relies on the |
467 | /// LoopVectorizationLegality class to provide information about the induction |
468 | /// and reduction variables that were found to a given vectorization factor. |
469 | class InnerLoopVectorizer { |
470 | public: |
471 | InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
472 | LoopInfo *LI, DominatorTree *DT, |
473 | const TargetLibraryInfo *TLI, |
474 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
475 | OptimizationRemarkEmitter *ORE, ElementCount VecWidth, |
476 | ElementCount MinProfitableTripCount, |
477 | unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
478 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
479 | ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) |
480 | : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
481 | AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
482 | Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), |
483 | PSI(PSI), RTChecks(RTChecks) { |
484 | // Query this against the original loop and save it here because the profile |
485 | // of the original loop header may change as the transformation happens. |
486 | OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( |
OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488 | |
489 | if (MinProfitableTripCount.isZero()) |
490 | this->MinProfitableTripCount = VecWidth; |
491 | else |
492 | this->MinProfitableTripCount = MinProfitableTripCount; |
493 | } |
494 | |
495 | virtual ~InnerLoopVectorizer() = default; |
496 | |
497 | /// Create a new empty loop that will contain vectorized instructions later |
498 | /// on, while the old loop will be used as the scalar remainder. Control flow |
499 | /// is generated around the vectorized (and scalar epilogue) loops consisting |
500 | /// of various checks and bypasses. Return the pre-header block of the new |
501 | /// loop and the start value for the canonical induction, if it is != 0. The |
502 | /// latter is the case when vectorizing the epilogue loop. In the case of |
/// epilogue vectorization, this function is overridden to handle the more
504 | /// complex control flow around the loops. \p ExpandedSCEVs is used to |
505 | /// look up SCEV expansions for expressions needed during skeleton creation. |
506 | virtual std::pair<BasicBlock *, Value *> |
507 | createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); |
508 | |
/// Fix the vectorized code, taking care of header phis, live-outs, and more.
510 | void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); |
511 | |
512 | // Return true if any runtime check is added. |
513 | bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
514 | |
515 | /// A helper function to scalarize a single Instruction in the innermost loop. |
516 | /// Generates a sequence of scalar instances for each lane between \p MinLane |
517 | /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
518 | /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p |
519 | /// Instr's operands. |
520 | void scalarizeInstruction(const Instruction *Instr, |
521 | VPReplicateRecipe *RepRecipe, |
522 | const VPIteration &Instance, |
523 | VPTransformState &State); |
524 | |
525 | /// Fix the non-induction PHIs in \p Plan. |
526 | void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); |
527 | |
528 | /// Create a new phi node for the induction variable \p OrigPhi to resume |
529 | /// iteration count in the scalar epilogue, from where the vectorized loop |
530 | /// left off. \p Step is the SCEV-expanded induction step to use. In cases |
531 | /// where the loop skeleton is more complicated (i.e., epilogue vectorization) |
532 | /// and the resume values can come from an additional bypass block, the \p |
533 | /// AdditionalBypass pair provides information about the bypass block and the |
534 | /// end value on the edge from bypass to this loop. |
535 | PHINode *createInductionResumeValue( |
536 | PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, |
537 | ArrayRef<BasicBlock *> BypassBlocks, |
538 | std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); |
539 | |
540 | /// Returns the original loop trip count. |
541 | Value *getTripCount() const { return TripCount; } |
542 | |
543 | /// Used to set the trip count after ILV's construction and after the |
544 | /// preheader block has been executed. Note that this always holds the trip |
545 | /// count of the original loop for both main loop and epilogue vectorization. |
546 | void setTripCount(Value *TC) { TripCount = TC; } |
547 | |
548 | protected: |
549 | friend class LoopVectorizationPlanner; |
550 | |
551 | /// A small list of PHINodes. |
552 | using PhiVector = SmallVector<PHINode *, 4>; |
553 | |
554 | /// A type for scalarized values in the new loop. Each value from the |
555 | /// original loop, when scalarized, is represented by UF x VF scalar values |
556 | /// in the new unrolled loop, where UF is the unroll factor and VF is the |
557 | /// vectorization factor. |
558 | using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
559 | |
560 | /// Set up the values of the IVs correctly when exiting the vector loop. |
561 | void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
562 | Value *VectorTripCount, Value *EndValue, |
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
564 | VPlan &Plan, VPTransformState &State); |
565 | |
566 | /// Iteratively sink the scalarized operands of a predicated instruction into |
567 | /// the block that was created for it. |
568 | void sinkScalarOperands(Instruction *PredInst); |
569 | |
570 | /// Returns (and creates if needed) the trip count of the widened loop. |
571 | Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); |
572 | |
573 | /// Emit a bypass check to see if the vector trip count is zero, including if |
574 | /// it overflows. |
575 | void emitIterationCountCheck(BasicBlock *Bypass); |
576 | |
577 | /// Emit a bypass check to see if all of the SCEV assumptions we've |
578 | /// had to make are correct. Returns the block containing the checks or |
579 | /// nullptr if no checks have been added. |
580 | BasicBlock *emitSCEVChecks(BasicBlock *Bypass); |
581 | |
582 | /// Emit bypass checks to check any memory assumptions we may have made. |
583 | /// Returns the block containing the checks or nullptr if no checks have been |
584 | /// added. |
585 | BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); |
586 | |
587 | /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, |
588 | /// vector loop preheader, middle block and scalar preheader. |
589 | void createVectorLoopSkeleton(StringRef Prefix); |
590 | |
591 | /// Create new phi nodes for the induction variables to resume iteration count |
592 | /// in the scalar epilogue, from where the vectorized loop left off. |
/// In cases where the loop skeleton is more complicated (e.g., epilogue
594 | /// vectorization) and the resume values can come from an additional bypass |
595 | /// block, the \p AdditionalBypass pair provides information about the bypass |
596 | /// block and the end value on the edge from bypass to this loop. |
597 | void createInductionResumeValues( |
598 | const SCEV2ValueTy &ExpandedSCEVs, |
599 | std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); |
600 | |
601 | /// Complete the loop skeleton by adding debug MDs, creating appropriate |
602 | /// conditional branches in the middle block, preparing the builder and |
603 | /// running the verifier. Return the preheader of the completed vector loop. |
604 | BasicBlock *completeLoopSkeleton(); |
605 | |
606 | /// Allow subclasses to override and print debug traces before/after vplan |
607 | /// execution, when trace information is requested. |
608 | virtual void printDebugTracesAtStart(){}; |
609 | virtual void printDebugTracesAtEnd(){}; |
610 | |
611 | /// The original loop. |
612 | Loop *OrigLoop; |
613 | |
614 | /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
615 | /// dynamic knowledge to simplify SCEV expressions and converts them to a |
616 | /// more usable form. |
617 | PredicatedScalarEvolution &PSE; |
618 | |
619 | /// Loop Info. |
620 | LoopInfo *LI; |
621 | |
622 | /// Dominator Tree. |
623 | DominatorTree *DT; |
624 | |
625 | /// Target Library Info. |
626 | const TargetLibraryInfo *TLI; |
627 | |
628 | /// Target Transform Info. |
629 | const TargetTransformInfo *TTI; |
630 | |
631 | /// Assumption Cache. |
632 | AssumptionCache *AC; |
633 | |
634 | /// Interface to emit optimization remarks. |
635 | OptimizationRemarkEmitter *ORE; |
636 | |
637 | /// The vectorization SIMD factor to use. Each vector will have this many |
638 | /// vector elements. |
639 | ElementCount VF; |
640 | |
641 | ElementCount MinProfitableTripCount; |
642 | |
643 | /// The vectorization unroll factor to use. Each scalar is vectorized to this |
644 | /// many different vector instructions. |
645 | unsigned UF; |
646 | |
647 | /// The builder that we use |
648 | IRBuilder<> Builder; |
649 | |
650 | // --- Vectorization state --- |
651 | |
652 | /// The vector-loop preheader. |
BasicBlock *LoopVectorPreHeader;
654 | |
655 | /// The scalar-loop preheader. |
BasicBlock *LoopScalarPreHeader;
657 | |
658 | /// Middle Block between the vector and the scalar. |
659 | BasicBlock *LoopMiddleBlock; |
660 | |
661 | /// The unique ExitBlock of the scalar loop if one exists. Note that |
662 | /// there can be multiple exiting edges reaching this block. |
663 | BasicBlock *LoopExitBlock; |
664 | |
665 | /// The scalar loop body. |
666 | BasicBlock *LoopScalarBody; |
667 | |
668 | /// A list of all bypass blocks. The first block is the entry of the loop. |
669 | SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
670 | |
671 | /// Store instructions that were predicated. |
672 | SmallVector<Instruction *, 4> PredicatedInstructions; |
673 | |
674 | /// Trip count of the original loop. |
675 | Value *TripCount = nullptr; |
676 | |
677 | /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
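/// For example (illustrative): with TripCount = 17, VF = 4 and UF = 2, the
/// vector loop covers 17 - (17 % 8) = 16 iterations of the original loop and
/// the remaining iteration runs in the scalar epilogue.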
678 | Value *VectorTripCount = nullptr; |
679 | |
680 | /// The legality analysis. |
681 | LoopVectorizationLegality *Legal; |
682 | |
/// The profitability analysis.
684 | LoopVectorizationCostModel *Cost; |
685 | |
686 | // Record whether runtime checks are added. |
687 | bool AddedSafetyChecks = false; |
688 | |
689 | // Holds the end values for each induction variable. We save the end values |
690 | // so we can later fix-up the external users of the induction variables. |
691 | DenseMap<PHINode *, Value *> IVEndValues; |
692 | |
693 | /// BFI and PSI are used to check for profile guided size optimizations. |
694 | BlockFrequencyInfo *BFI; |
695 | ProfileSummaryInfo *PSI; |
696 | |
// Whether this loop should be optimized for size based on profile-guided size
// optimizations.
699 | bool OptForSizeBasedOnProfile; |
700 | |
701 | /// Structure to hold information about generated runtime checks, responsible |
702 | /// for cleaning the checks, if vectorization turns out unprofitable. |
703 | GeneratedRTChecks &RTChecks; |
704 | |
705 | // Holds the resume values for reductions in the loops, used to set the |
706 | // correct start value of reduction PHIs when vectorizing the epilogue. |
707 | SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> |
708 | ReductionResumeValues; |
709 | }; |
710 | |
711 | class InnerLoopUnroller : public InnerLoopVectorizer { |
712 | public: |
713 | InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
714 | LoopInfo *LI, DominatorTree *DT, |
715 | const TargetLibraryInfo *TLI, |
716 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
717 | OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
718 | LoopVectorizationLegality *LVL, |
719 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
720 | ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) |
721 | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
ElementCount::getFixed(1),
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
724 | BFI, PSI, Check) {} |
725 | }; |
726 | |
727 | /// Encapsulate information regarding vectorization of a loop and its epilogue. |
728 | /// This information is meant to be updated and used across two stages of |
729 | /// epilogue vectorization. |
730 | struct EpilogueLoopVectorizationInfo { |
ElementCount MainLoopVF = ElementCount::getFixed(0);
732 | unsigned MainLoopUF = 0; |
ElementCount EpilogueVF = ElementCount::getFixed(0);
734 | unsigned EpilogueUF = 0; |
735 | BasicBlock *MainLoopIterationCountCheck = nullptr; |
736 | BasicBlock *EpilogueIterationCountCheck = nullptr; |
737 | BasicBlock *SCEVSafetyCheck = nullptr; |
738 | BasicBlock *MemSafetyCheck = nullptr; |
739 | Value *TripCount = nullptr; |
740 | Value *VectorTripCount = nullptr; |
741 | |
742 | EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, |
743 | ElementCount EVF, unsigned EUF) |
744 | : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { |
745 | assert(EUF == 1 && |
746 | "A high UF for the epilogue loop is likely not beneficial." ); |
747 | } |
748 | }; |
749 | |
750 | /// An extension of the inner loop vectorizer that creates a skeleton for a |
751 | /// vectorized loop that has its epilogue (residual) also vectorized. |
/// The idea is to run the vplan on a given loop twice, first to set up the
753 | /// skeleton and vectorize the main loop, and secondly to complete the skeleton |
754 | /// from the first step and vectorize the epilogue. This is achieved by |
755 | /// deriving two concrete strategy classes from this base class and invoking |
756 | /// them in succession from the loop vectorizer planner. |
757 | class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { |
758 | public: |
759 | InnerLoopAndEpilogueVectorizer( |
760 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
761 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
762 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
763 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
764 | LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
765 | BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
766 | GeneratedRTChecks &Checks) |
767 | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
768 | EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, |
769 | CM, BFI, PSI, Checks), |
770 | EPI(EPI) {} |
771 | |
772 | // Override this function to handle the more complex control flow around the |
773 | // three loops. |
774 | std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton( |
775 | const SCEV2ValueTy &ExpandedSCEVs) final { |
776 | return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); |
777 | } |
778 | |
779 | /// The interface for creating a vectorized skeleton using one of two |
780 | /// different strategies, each corresponding to one execution of the vplan |
781 | /// as described above. |
782 | virtual std::pair<BasicBlock *, Value *> |
783 | createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; |
784 | |
785 | /// Holds and updates state information required to vectorize the main loop |
786 | /// and its epilogue in two separate passes. This setup helps us avoid |
787 | /// regenerating and recomputing runtime safety checks. It also helps us to |
788 | /// shorten the iteration-count-check path length for the cases where the |
789 | /// iteration count of the loop is so small that the main vector loop is |
790 | /// completely skipped. |
791 | EpilogueLoopVectorizationInfo &EPI; |
792 | }; |
793 | |
794 | /// A specialized derived class of inner loop vectorizer that performs |
795 | /// vectorization of *main* loops in the process of vectorizing loops and their |
796 | /// epilogues. |
797 | class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { |
798 | public: |
799 | EpilogueVectorizerMainLoop( |
800 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
801 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
802 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
803 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
804 | LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
805 | BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
806 | GeneratedRTChecks &Check) |
807 | : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
808 | EPI, LVL, CM, BFI, PSI, Check) {} |
809 | /// Implements the interface for creating a vectorized skeleton using the |
/// *main loop* strategy (i.e., the first pass of vplan execution).
811 | std::pair<BasicBlock *, Value *> |
812 | createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; |
813 | |
814 | protected: |
815 | /// Emits an iteration count bypass check once for the main loop (when \p |
816 | /// ForEpilogue is false) and once for the epilogue loop (when \p |
817 | /// ForEpilogue is true). |
818 | BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); |
819 | void printDebugTracesAtStart() override; |
820 | void printDebugTracesAtEnd() override; |
821 | }; |
822 | |
823 | // A specialized derived class of inner loop vectorizer that performs |
824 | // vectorization of *epilogue* loops in the process of vectorizing loops and |
825 | // their epilogues. |
826 | class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { |
827 | public: |
828 | EpilogueVectorizerEpilogueLoop( |
829 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
830 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
831 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
832 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
833 | LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
834 | BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
835 | GeneratedRTChecks &Checks) |
836 | : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
837 | EPI, LVL, CM, BFI, PSI, Checks) { |
838 | TripCount = EPI.TripCount; |
839 | } |
840 | /// Implements the interface for creating a vectorized skeleton using the |
/// *epilogue loop* strategy (i.e., the second pass of vplan execution).
842 | std::pair<BasicBlock *, Value *> |
843 | createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; |
844 | |
845 | protected: |
846 | /// Emits an iteration count bypass check after the main vector loop has |
847 | /// finished to see if there are any iterations left to execute by either |
848 | /// the vector epilogue or the scalar epilogue. |
849 | BasicBlock *emitMinimumVectorEpilogueIterCountCheck( |
850 | BasicBlock *Bypass, |
851 | BasicBlock *Insert); |
852 | void printDebugTracesAtStart() override; |
853 | void printDebugTracesAtEnd() override; |
854 | }; |
855 | } // end namespace llvm |
856 | |
/// Look for a meaningful debug location on the instruction or its
858 | /// operands. |
859 | static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { |
860 | if (!I) |
861 | return DebugLoc(); |
862 | |
863 | DebugLoc Empty; |
864 | if (I->getDebugLoc() != Empty) |
865 | return I->getDebugLoc(); |
866 | |
867 | for (Use &Op : I->operands()) { |
if (Instruction *OpInst = dyn_cast<Instruction>(Op))
869 | if (OpInst->getDebugLoc() != Empty) |
870 | return OpInst->getDebugLoc(); |
871 | } |
872 | |
873 | return I->getDebugLoc(); |
874 | } |
875 | |
876 | /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
877 | /// is passed, the message relates to that particular instruction. |
878 | #ifndef NDEBUG |
879 | static void debugVectorizationMessage(const StringRef Prefix, |
880 | const StringRef DebugMsg, |
881 | Instruction *I) { |
882 | dbgs() << "LV: " << Prefix << DebugMsg; |
883 | if (I != nullptr) |
884 | dbgs() << " " << *I; |
885 | else |
886 | dbgs() << '.'; |
887 | dbgs() << '\n'; |
888 | } |
889 | #endif |
890 | |
891 | /// Create an analysis remark that explains why vectorization failed |
892 | /// |
893 | /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
894 | /// RemarkName is the identifier for the remark. If \p I is passed it is an |
895 | /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
896 | /// the location of the remark. \return the remark object that can be |
897 | /// streamed to. |
898 | static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, |
StringRef RemarkName, Loop *TheLoop, Instruction *I) {
900 | Value *CodeRegion = TheLoop->getHeader(); |
901 | DebugLoc DL = TheLoop->getStartLoc(); |
902 | |
903 | if (I) { |
904 | CodeRegion = I->getParent(); |
// If there is no debug location attached to the instruction, fall back to
// using the loop's.
907 | if (I->getDebugLoc()) |
908 | DL = I->getDebugLoc(); |
909 | } |
910 | |
911 | return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); |
912 | } |
913 | |
914 | namespace llvm { |
915 | |
916 | /// Return a value for Step multiplied by VF. |
917 | Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, |
918 | int64_t Step) { |
919 | assert(Ty->isIntegerTy() && "Expected an integer step" ); |
return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
921 | } |
922 | |
923 | /// Return the runtime value for VF. |
924 | Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { |
return B.CreateElementCount(Ty, VF);
926 | } |
927 | |
928 | const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, |
929 | Loop *OrigLoop) { |
930 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
931 | assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count" ); |
932 | |
933 | ScalarEvolution &SE = *PSE.getSE(); |
return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
935 | } |
936 | |
void reportVectorizationFailure(const StringRef DebugMsg,
938 | const StringRef OREMsg, const StringRef ORETag, |
939 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
940 | Instruction *I) { |
941 | LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: " , DebugMsg, I)); |
942 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
943 | ORE->emit( |
944 | OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I) |
945 | << "loop not vectorized: " << OREMsg); |
946 | } |
947 | |
948 | /// Reports an informative message: print \p Msg for debugging purposes as well |
949 | /// as an optimization remark. Uses either \p I as location of the remark, or |
950 | /// otherwise \p TheLoop. |
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
952 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
953 | Instruction *I = nullptr) { |
954 | LLVM_DEBUG(debugVectorizationMessage("" , Msg, I)); |
955 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
956 | ORE->emit( |
957 | OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I) |
958 | << Msg); |
959 | } |
960 | |
961 | /// Report successful vectorization of the loop. In case an outer loop is |
962 | /// vectorized, prepend "outer" to the vectorization remark. |
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
964 | VectorizationFactor VF, unsigned IC) { |
965 | LLVM_DEBUG(debugVectorizationMessage( |
966 | "Vectorizing: " , TheLoop->isInnermost() ? "innermost loop" : "outer loop" , |
967 | nullptr)); |
968 | StringRef LoopType = TheLoop->isInnermost() ? "" : "outer " ; |
ORE->emit([&]() {
970 | return OptimizationRemark(LV_NAME, "Vectorized" , TheLoop->getStartLoc(), |
971 | TheLoop->getHeader()) |
972 | << "vectorized " << LoopType << "loop (vectorization width: " |
973 | << ore::NV("VectorizationFactor" , VF.Width) |
974 | << ", interleaved count: " << ore::NV("InterleaveCount" , IC) << ")" ; |
975 | }); |
976 | } |
977 | |
978 | } // end namespace llvm |
979 | |
980 | namespace llvm { |
981 | |
982 | // Loop vectorization cost-model hints how the scalar epilogue loop should be |
983 | // lowered. |
984 | enum ScalarEpilogueLowering { |
985 | |
986 | // The default: allowing scalar epilogues. |
987 | CM_ScalarEpilogueAllowed, |
988 | |
989 | // Vectorization with OptForSize: don't allow epilogues. |
990 | CM_ScalarEpilogueNotAllowedOptSize, |
991 | |
992 | // A special case of vectorisation with OptForSize: loops with a very small |
993 | // trip count are considered for vectorization under OptForSize, thereby |
994 | // making sure the cost of their loop body is dominant, free of runtime |
995 | // guards and scalar iteration overheads. |
996 | CM_ScalarEpilogueNotAllowedLowTripLoop, |
997 | |
998 | // Loop hint predicate indicating an epilogue is undesired. |
999 | CM_ScalarEpilogueNotNeededUsePredicate, |
1000 | |
1001 | // Directive indicating we must either tail fold or not vectorize |
1002 | CM_ScalarEpilogueNotAllowedUsePredicate |
1003 | }; |
1004 | |
1005 | using InstructionVFPair = std::pair<Instruction *, ElementCount>; |
1006 | |
1007 | /// LoopVectorizationCostModel - estimates the expected speedups due to |
1008 | /// vectorization. |
1009 | /// In many cases vectorization is not profitable. This can happen because of |
1010 | /// a number of reasons. In this class we mainly attempt to predict the |
1011 | /// expected speedup/slowdowns due to the supported instruction set. We use the |
1012 | /// TargetTransformInfo to query the different backends for the cost of |
1013 | /// different operations. |
1014 | class LoopVectorizationCostModel { |
1015 | public: |
1016 | LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, |
1017 | PredicatedScalarEvolution &PSE, LoopInfo *LI, |
1018 | LoopVectorizationLegality *Legal, |
1019 | const TargetTransformInfo &TTI, |
1020 | const TargetLibraryInfo *TLI, DemandedBits *DB, |
1021 | AssumptionCache *AC, |
1022 | OptimizationRemarkEmitter *ORE, const Function *F, |
1023 | const LoopVectorizeHints *Hints, |
1024 | InterleavedAccessInfo &IAI) |
1025 | : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), |
1026 | TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), |
1027 | Hints(Hints), InterleaveInfo(IAI) {} |
1028 | |
1029 | /// \return An upper bound for the vectorization factors (both fixed and |
1030 | /// scalable). If the factors are 0, vectorization and interleaving should be |
1031 | /// avoided up front. |
1032 | FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); |
1033 | |
1034 | /// \return True if runtime checks are required for vectorization, and false |
1035 | /// otherwise. |
1036 | bool runtimeChecksRequired(); |
1037 | |
1038 | /// Setup cost-based decisions for user vectorization factor. |
1039 | /// \return true if the UserVF is a feasible VF to be chosen. |
1040 | bool selectUserVectorizationFactor(ElementCount UserVF) { |
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
return expectedCost(UserVF).isValid();
1044 | } |
1045 | |
1046 | /// \return The size (in bits) of the smallest and widest types in the code |
1047 | /// that needs to be vectorized. We ignore values that remain scalar such as |
1048 | /// 64 bit loop indices. |
1049 | std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
1050 | |
1051 | /// \return The desired interleave count. |
1052 | /// If interleave count has been specified by metadata it will be returned. |
1053 | /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
1054 | /// are the selected vectorization factor and the cost of the selected VF. |
1055 | unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); |
1056 | |
1057 | /// Memory access instruction may be vectorized in more than one way. |
1058 | /// Form of instruction after vectorization depends on cost. |
1059 | /// This function takes cost-based decisions for Load/Store instructions |
1060 | /// and collects them in a map. This decisions map is used for building |
1061 | /// the lists of loop-uniform and loop-scalar instructions. |
1062 | /// The calculated cost is saved with widening decision in order to |
1063 | /// avoid redundant calculations. |
1064 | void setCostBasedWideningDecision(ElementCount VF); |
1065 | |
1066 | /// A call may be vectorized in different ways depending on whether we have |
1067 | /// vectorized variants available and whether the target supports masking. |
1068 | /// This function analyzes all calls in the function at the supplied VF, |
1069 | /// makes a decision based on the costs of available options, and stores that |
1070 | /// decision in a map for use in planning and plan execution. |
1071 | void setVectorizedCallDecision(ElementCount VF); |
1072 | |
1073 | /// A struct that represents some properties of the register usage |
1074 | /// of a loop. |
1075 | struct RegisterUsage { |
1076 | /// Holds the number of loop invariant values that are used in the loop. |
1077 | /// The key is ClassID of target-provided register class. |
1078 | SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; |
1079 | /// Holds the maximum number of concurrent live intervals in the loop. |
1080 | /// The key is ClassID of target-provided register class. |
1081 | SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; |
1082 | }; |
1083 | |
1084 | /// \return Returns information about the register usages of the loop for the |
1085 | /// given vectorization factors. |
1086 | SmallVector<RegisterUsage, 8> |
1087 | calculateRegisterUsage(ArrayRef<ElementCount> VFs); |
1088 | |
1089 | /// Collect values we want to ignore in the cost model. |
1090 | void collectValuesToIgnore(); |
1091 | |
1092 | /// Collect all element types in the loop for which widening is needed. |
1093 | void collectElementTypesForWidening(); |
1094 | |
/// Split reductions into those that happen in the loop, and those that happen
/// outside. In-loop reductions are collected into InLoopReductions.
1097 | void collectInLoopReductions(); |
1098 | |
1099 | /// Returns true if we should use strict in-order reductions for the given |
1100 | /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, |
1101 | /// the IsOrdered flag of RdxDesc is set and we do not allow reordering |
1102 | /// of FP operations. |
1103 | bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { |
1104 | return !Hints->allowReordering() && RdxDesc.isOrdered(); |
1105 | } |
1106 | |
1107 | /// \returns The smallest bitwidth each instruction can be represented with. |
1108 | /// The vector equivalents of these instructions should be truncated to this |
1109 | /// type. |
1110 | const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
1111 | return MinBWs; |
1112 | } |
1113 | |
1114 | /// \returns True if it is more profitable to scalarize instruction \p I for |
1115 | /// vectorization factor \p VF. |
1116 | bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { |
1117 | assert(VF.isVector() && |
1118 | "Profitable to scalarize relevant only for VF > 1." ); |
1119 | assert( |
1120 | TheLoop->isInnermost() && |
1121 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1122 | |
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
return Scalars->second.contains(I);
1127 | } |
1128 | |
1129 | /// Returns true if \p I is known to be uniform after vectorization. |
1130 | bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { |
1131 | assert( |
1132 | TheLoop->isInnermost() && |
1133 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
// Pseudo probe needs to be duplicated for each unrolled iteration and
// vector lane so that profiled loop trip count can be accurately
// accumulated instead of being undercounted.
if (isa<PseudoProbeInst>(I))
1138 | return false; |
1139 | |
1140 | if (VF.isScalar()) |
1141 | return true; |
1142 | |
auto UniformsPerVF = Uniforms.find(VF);
assert(UniformsPerVF != Uniforms.end() &&
"VF not yet analyzed for uniformity");
return UniformsPerVF->second.count(I);
1147 | } |
1148 | |
1149 | /// Returns true if \p I is known to be scalar after vectorization. |
1150 | bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { |
1151 | assert( |
1152 | TheLoop->isInnermost() && |
1153 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1154 | if (VF.isScalar()) |
1155 | return true; |
1156 | |
auto ScalarsPerVF = Scalars.find(VF);
assert(ScalarsPerVF != Scalars.end() &&
"Scalar values are not calculated for VF");
return ScalarsPerVF->second.count(I);
1161 | } |
1162 | |
1163 | /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
1164 | /// for vectorization factor \p VF. |
1165 | bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { |
return VF.isVector() && MinBWs.contains(I) &&
1167 | !isProfitableToScalarize(I, VF) && |
1168 | !isScalarAfterVectorization(I, VF); |
1169 | } |
1170 | |
1171 | /// Decision that was taken during cost calculation for memory instruction. |
1172 | enum InstWidening { |
1173 | CM_Unknown, |
1174 | CM_Widen, // For consecutive accesses with stride +1. |
1175 | CM_Widen_Reverse, // For consecutive accesses with stride -1. |
1176 | CM_Interleave, |
1177 | CM_GatherScatter, |
1178 | CM_Scalarize, |
1179 | CM_VectorCall, |
1180 | CM_IntrinsicCall |
1181 | }; |
1182 | |
1183 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1184 | /// instruction \p I and vector width \p VF. |
1185 | void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, |
1186 | InstructionCost Cost) { |
1187 | assert(VF.isVector() && "Expected VF >=2" ); |
1188 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost); |
1189 | } |
1190 | |
1191 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1192 | /// interleaving group \p Grp and vector width \p VF. |
1193 | void setWideningDecision(const InterleaveGroup<Instruction> *Grp, |
1194 | ElementCount VF, InstWidening W, |
1195 | InstructionCost Cost) { |
1196 | assert(VF.isVector() && "Expected VF >=2" ); |
// Broadcast this decision to all instructions inside the group.
// But the cost will be assigned to one instruction only.
1199 | for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
1200 | if (auto *I = Grp->getMember(Index: i)) { |
1201 | if (Grp->getInsertPos() == I) |
1202 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost); |
1203 | else |
1204 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y: 0); |
1205 | } |
1206 | } |
1207 | } |
1208 | |
1209 | /// Return the cost model decision for the given instruction \p I and vector |
1210 | /// width \p VF. Return CM_Unknown if this instruction did not pass |
1211 | /// through the cost modeling. |
1212 | InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { |
1213 | assert(VF.isVector() && "Expected VF to be a vector VF" ); |
1214 | assert( |
1215 | TheLoop->isInnermost() && |
1216 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1217 | |
1218 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF); |
1219 | auto Itr = WideningDecisions.find(Val: InstOnVF); |
1220 | if (Itr == WideningDecisions.end()) |
1221 | return CM_Unknown; |
1222 | return Itr->second.first; |
1223 | } |
1224 | |
1225 | /// Return the vectorization cost for the given instruction \p I and vector |
1226 | /// width \p VF. |
1227 | InstructionCost getWideningCost(Instruction *I, ElementCount VF) { |
1228 | assert(VF.isVector() && "Expected VF >=2" ); |
1229 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF); |
1230 | assert(WideningDecisions.contains(InstOnVF) && |
1231 | "The cost is not calculated" ); |
1232 | return WideningDecisions[InstOnVF].second; |
1233 | } |
1234 | |
1235 | struct CallWideningDecision { |
1236 | InstWidening Kind; |
1237 | Function *Variant; |
1238 | Intrinsic::ID IID; |
1239 | std::optional<unsigned> MaskPos; |
1240 | InstructionCost Cost; |
1241 | }; |
1242 | |
1243 | void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, |
1244 | Function *Variant, Intrinsic::ID IID, |
1245 | std::optional<unsigned> MaskPos, |
1246 | InstructionCost Cost) { |
1247 | assert(!VF.isScalar() && "Expected vector VF" ); |
1248 | CallWideningDecisions[std::make_pair(x&: CI, y&: VF)] = {.Kind: Kind, .Variant: Variant, .IID: IID, |
1249 | .MaskPos: MaskPos, .Cost: Cost}; |
1250 | } |
1251 | |
1252 | CallWideningDecision getCallWideningDecision(CallInst *CI, |
1253 | ElementCount VF) const { |
1254 | assert(!VF.isScalar() && "Expected vector VF" ); |
1255 | return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)); |
1256 | } |
1257 | |
1258 | /// Return True if instruction \p I is an optimizable truncate whose operand |
1259 | /// is an induction variable. Such a truncate will be removed by adding a new |
1260 | /// induction variable with the destination type. |
1261 | bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { |
1262 | // If the instruction is not a truncate, return false. |
1263 | auto *Trunc = dyn_cast<TruncInst>(Val: I); |
1264 | if (!Trunc) |
1265 | return false; |
1266 | |
1267 | // Get the source and destination types of the truncate. |
1268 | Type *SrcTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getSrcTy(), EC: VF); |
1269 | Type *DestTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getDestTy(), EC: VF); |
1270 | |
1271 | // If the truncate is free for the given types, return false. Replacing a |
1272 | // free truncate with an induction variable would add an induction variable |
1273 | // update instruction to each iteration of the loop. We exclude from this |
1274 | // check the primary induction variable since it will need an update |
1275 | // instruction regardless. |
1276 | Value *Op = Trunc->getOperand(i_nocapture: 0); |
1277 | if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy)) |
1278 | return false; |
1279 | |
1280 | // If the truncated value is not an induction variable, return false. |
1281 | return Legal->isInductionPhi(V: Op); |
1282 | } |
1283 | |
1284 | /// Collects the instructions to scalarize for each predicated instruction in |
1285 | /// the loop. |
1286 | void collectInstsToScalarize(ElementCount VF); |
1287 | |
1288 | /// Collect Uniform and Scalar values for the given \p VF. |
1289 | /// The sets depend on CM decision for Load/Store instructions |
1290 | /// that may be vectorized as interleave, gather-scatter or scalarized. |
1291 | /// Also make a decision on what to do about call instructions in the loop |
1292 | /// at that VF -- scalarize, call a known vector routine, or call a |
1293 | /// vector intrinsic. |
1294 | void collectUniformsAndScalars(ElementCount VF) { |
1295 | // Do the analysis once. |
1296 | if (VF.isScalar() || Uniforms.contains(Val: VF)) |
1297 | return; |
1298 | setCostBasedWideningDecision(VF); |
1299 | setVectorizedCallDecision(VF); |
1300 | collectLoopUniforms(VF); |
1301 | collectLoopScalars(VF); |
1302 | } |
1303 | |
1304 | /// Returns true if the target machine supports masked store operation |
1305 | /// for the given \p DataType and kind of access to \p Ptr. |
1306 | bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { |
1307 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1308 | TTI.isLegalMaskedStore(DataType, Alignment); |
1309 | } |
1310 | |
1311 | /// Returns true if the target machine supports masked load operation |
1312 | /// for the given \p DataType and kind of access to \p Ptr. |
1313 | bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { |
1314 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1315 | TTI.isLegalMaskedLoad(DataType, Alignment); |
1316 | } |
1317 | |
1318 | /// Returns true if the target machine can represent \p V as a masked gather |
1319 | /// or scatter operation. |
1320 | bool isLegalGatherOrScatter(Value *V, ElementCount VF) { |
1321 | bool LI = isa<LoadInst>(Val: V); |
1322 | bool SI = isa<StoreInst>(Val: V); |
1323 | if (!LI && !SI) |
1324 | return false; |
1325 | auto *Ty = getLoadStoreType(I: V); |
1326 | Align Align = getLoadStoreAlignment(I: V); |
1327 | if (VF.isVector()) |
1328 | Ty = VectorType::get(ElementType: Ty, EC: VF); |
1329 | return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) || |
1330 | (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align)); |
1331 | } |
1332 | |
1333 | /// Returns true if the target machine supports all of the reduction |
1334 | /// variables found for the given VF. |
1335 | bool canVectorizeReductions(ElementCount VF) const { |
1336 | return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
1337 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
1338 | return TTI.isLegalToVectorizeReduction(RdxDesc, VF); |
1339 | })); |
1340 | } |
1341 | |
1342 | /// Given costs for both strategies, return true if the scalar predication |
1343 | /// lowering should be used for div/rem. This incorporates an override |
1344 | /// option so it is not simply a cost comparison. |
1345 | bool isDivRemScalarWithPredication(InstructionCost ScalarCost, |
1346 | InstructionCost SafeDivisorCost) const { |
1347 | switch (ForceSafeDivisor) { |
1348 | case cl::BOU_UNSET: |
1349 | return ScalarCost < SafeDivisorCost; |
1350 | case cl::BOU_TRUE: |
1351 | return false; |
1352 | case cl::BOU_FALSE: |
1353 | return true; |
1354 | }; |
1355 | llvm_unreachable("impossible case value" ); |
1356 | } |
1357 | |
1358 | /// Returns true if \p I is an instruction which requires predication and |
1359 | /// for which our chosen predication strategy is scalarization (i.e. we |
1360 | /// don't have an alternate strategy such as masking available). |
1361 | /// \p VF is the vectorization factor that will be used to vectorize \p I. |
1362 | bool isScalarWithPredication(Instruction *I, ElementCount VF) const; |
1363 | |
1364 | /// Returns true if \p I is an instruction that needs to be predicated |
1365 | /// at runtime. The result is independent of the predication mechanism. |
1366 | /// Superset of instructions that return true for isScalarWithPredication. |
1367 | bool isPredicatedInst(Instruction *I) const; |
1368 | |
1369 | /// Return the costs for our two available strategies for lowering a |
1370 | /// div/rem operation which requires speculating at least one lane. |
1371 | /// First result is for scalarization (will be invalid for scalable |
1372 | /// vectors); second is for the safe-divisor strategy. |
1373 | std::pair<InstructionCost, InstructionCost> |
1374 | getDivRemSpeculationCost(Instruction *I, |
1375 | ElementCount VF) const; |
1376 | |
1377 | /// Returns true if \p I is a memory instruction with consecutive memory |
1378 | /// access that can be widened. |
1379 | bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); |
1380 | |
1381 | /// Returns true if \p I is a memory instruction in an interleaved-group |
1382 | /// of memory accesses that can be vectorized with wide vector loads/stores |
1383 | /// and shuffles. |
1384 | bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; |
1385 | |
1386 | /// Check if \p Instr belongs to any interleaved access group. |
1387 | bool isAccessInterleaved(Instruction *Instr) const { |
1388 | return InterleaveInfo.isInterleaved(Instr); |
1389 | } |
1390 | |
1391 | /// Get the interleaved access group that \p Instr belongs to. |
1392 | const InterleaveGroup<Instruction> * |
1393 | getInterleavedAccessGroup(Instruction *Instr) const { |
1394 | return InterleaveInfo.getInterleaveGroup(Instr); |
1395 | } |
1396 | |
1397 | /// Returns true if we're required to use a scalar epilogue for at least |
1398 | /// the final iteration of the original loop. |
1399 | bool requiresScalarEpilogue(bool IsVectorizing) const { |
1400 | if (!isScalarEpilogueAllowed()) { |
1401 | LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n" ); |
1402 | return false; |
1403 | } |
// If we might exit from anywhere but the latch, we must run the exiting
// iteration in scalar form.
1406 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { |
1407 | LLVM_DEBUG( |
1408 | dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n" ); |
1409 | return true; |
1410 | } |
1411 | if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { |
1412 | LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " |
1413 | "interleaved group requires scalar epilogue\n" ); |
1414 | return true; |
1415 | } |
1416 | LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n" ); |
1417 | return false; |
1418 | } |
1419 | |
1420 | /// Returns true if we're required to use a scalar epilogue for at least |
1421 | /// the final iteration of the original loop for all VFs in \p Range. |
1422 | /// A scalar epilogue must either be required for all VFs in \p Range or for |
1423 | /// none. |
1424 | bool requiresScalarEpilogue(VFRange Range) const { |
1425 | auto RequiresScalarEpilogue = [this](ElementCount VF) { |
1426 | return requiresScalarEpilogue(IsVectorizing: VF.isVector()); |
1427 | }; |
1428 | bool IsRequired = all_of(Range, P: RequiresScalarEpilogue); |
1429 | assert( |
1430 | (IsRequired || none_of(Range, RequiresScalarEpilogue)) && |
1431 | "all VFs in range must agree on whether a scalar epilogue is required" ); |
1432 | return IsRequired; |
1433 | } |
1434 | |
1435 | /// Returns true if a scalar epilogue is not allowed due to optsize or a |
1436 | /// loop hint annotation. |
1437 | bool isScalarEpilogueAllowed() const { |
1438 | return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; |
1439 | } |
1440 | |
1441 | /// Returns the TailFoldingStyle that is best for the current loop. |
1442 | TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { |
1443 | if (!ChosenTailFoldingStyle) |
1444 | return TailFoldingStyle::None; |
1445 | return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first |
1446 | : ChosenTailFoldingStyle->second; |
1447 | } |
1448 | |
/// Selects and saves the TailFoldingStyle for both cases: whether or not the
/// IV update may overflow.
/// \param IsScalableVF true if scalable vector factors are enabled.
/// \param UserIC User-specified interleave count.
1453 | void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { |
1454 | assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet." ); |
1455 | if (!Legal->canFoldTailByMasking()) { |
1456 | ChosenTailFoldingStyle = |
1457 | std::make_pair(x: TailFoldingStyle::None, y: TailFoldingStyle::None); |
1458 | return; |
1459 | } |
1460 | |
1461 | if (!ForceTailFoldingStyle.getNumOccurrences()) { |
1462 | ChosenTailFoldingStyle = std::make_pair( |
1463 | x: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), |
1464 | y: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); |
1465 | return; |
1466 | } |
1467 | |
1468 | // Set styles when forced. |
1469 | ChosenTailFoldingStyle = std::make_pair(x&: ForceTailFoldingStyle.getValue(), |
1470 | y&: ForceTailFoldingStyle.getValue()); |
1471 | if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) |
1472 | return; |
1473 | // Override forced styles if needed. |
1474 | // FIXME: use actual opcode/data type for analysis here. |
1475 | // FIXME: Investigate opportunity for fixed vector factor. |
1476 | bool EVLIsLegal = |
1477 | IsScalableVF && UserIC <= 1 && |
1478 | TTI.hasActiveVectorLength(Opcode: 0, DataType: nullptr, Alignment: Align()) && |
1479 | !EnableVPlanNativePath && |
1480 | // FIXME: implement support for max safe dependency distance. |
1481 | Legal->isSafeForAnyVectorWidth(); |
1482 | if (!EVLIsLegal) { |
// If for some reason EVL mode is unsupported, fall back to
// DataWithoutLaneMask to try to vectorize the loop with a folded tail
// in a generic way.
1486 | ChosenTailFoldingStyle = |
1487 | std::make_pair(x: TailFoldingStyle::DataWithoutLaneMask, |
1488 | y: TailFoldingStyle::DataWithoutLaneMask); |
1489 | LLVM_DEBUG( |
1490 | dbgs() |
1491 | << "LV: Preference for VP intrinsics indicated. Will " |
1492 | "not try to generate VP Intrinsics " |
1493 | << (UserIC > 1 |
1494 | ? "since interleave count specified is greater than 1.\n" |
1495 | : "due to non-interleaving reasons.\n" )); |
1496 | } |
1497 | } |
1498 | |
1499 | /// Returns true if all loop blocks should be masked to fold tail loop. |
1500 | bool foldTailByMasking() const { |
1501 | // TODO: check if it is possible to check for None style independent of |
1502 | // IVUpdateMayOverflow flag in getTailFoldingStyle. |
1503 | return getTailFoldingStyle() != TailFoldingStyle::None; |
1504 | } |
1505 | |
/// Returns true if the instructions in this block require predication
1507 | /// for any reason, e.g. because tail folding now requires a predicate |
1508 | /// or because the block in the original loop was predicated. |
1509 | bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { |
1510 | return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
1511 | } |
1512 | |
1513 | /// Returns true if VP intrinsics with explicit vector length support should |
1514 | /// be generated in the tail folded loop. |
1515 | bool foldTailWithEVL() const { |
1516 | return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL; |
1517 | } |
1518 | |
1519 | /// Returns true if the Phi is part of an inloop reduction. |
1520 | bool isInLoopReduction(PHINode *Phi) const { |
1521 | return InLoopReductions.contains(Ptr: Phi); |
1522 | } |
1523 | |
1524 | /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
1525 | /// with factor VF. Return the cost of the instruction, including |
1526 | /// scalarization overhead if it's needed. |
1527 | InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; |
1528 | |
1529 | /// Estimate cost of a call instruction CI if it were vectorized with factor |
1530 | /// VF. Return the cost of the instruction, including scalarization overhead |
1531 | /// if it's needed. |
1532 | InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; |
1533 | |
1534 | /// Invalidates decisions already taken by the cost model. |
1535 | void invalidateCostModelingDecisions() { |
1536 | WideningDecisions.clear(); |
1537 | CallWideningDecisions.clear(); |
1538 | Uniforms.clear(); |
1539 | Scalars.clear(); |
1540 | } |
1541 | |
1542 | /// Returns the expected execution cost. The unit of the cost does |
1543 | /// not matter because we use the 'cost' units to compare different |
1544 | /// vector widths. The cost that is returned is *not* normalized by |
1545 | /// the factor width. If \p Invalid is not nullptr, this function |
1546 | /// will add a pair(Instruction*, ElementCount) to \p Invalid for |
1547 | /// each instruction that has an Invalid cost for the given VF. |
1548 | InstructionCost |
1549 | expectedCost(ElementCount VF, |
1550 | SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); |
1551 | |
1552 | bool hasPredStores() const { return NumPredStores > 0; } |
1553 | |
1554 | /// Returns true if epilogue vectorization is considered profitable, and |
1555 | /// false otherwise. |
1556 | /// \p VF is the vectorization factor chosen for the original loop. |
1557 | bool isEpilogueVectorizationProfitable(const ElementCount VF) const; |
1558 | |
1559 | /// Returns the execution time cost of an instruction for a given vector |
1560 | /// width. Vector width of one means scalar. |
1561 | InstructionCost getInstructionCost(Instruction *I, ElementCount VF); |
1562 | |
1563 | /// Return the cost of instructions in an inloop reduction pattern, if I is |
1564 | /// part of that pattern. |
1565 | std::optional<InstructionCost> |
1566 | getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, |
1567 | TTI::TargetCostKind CostKind) const; |
1568 | |
1569 | private: |
1570 | unsigned NumPredStores = 0; |
1571 | |
1572 | /// \return An upper bound for the vectorization factors for both |
1573 | /// fixed and scalable vectorization, where the minimum-known number of |
1574 | /// elements is a power-of-2 larger than zero. If scalable vectorization is |
1575 | /// disabled or unsupported, then the scalable part will be equal to |
1576 | /// ElementCount::getScalable(0). |
1577 | FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, |
1578 | ElementCount UserVF, |
1579 | bool FoldTailByMasking); |
1580 | |
/// \return the maximized element count based on the target's vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
1583 | /// This is a helper function of computeFeasibleMaxVF. |
1584 | ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, |
1585 | unsigned SmallestType, |
1586 | unsigned WidestType, |
1587 | ElementCount MaxSafeVF, |
1588 | bool FoldTailByMasking); |
1589 | |
1590 | /// Checks if scalable vectorization is supported and enabled. Caches the |
1591 | /// result to avoid repeated debug dumps for repeated queries. |
1592 | bool isScalableVectorizationAllowed(); |
1593 | |
1594 | /// \return the maximum legal scalable VF, based on the safe max number |
1595 | /// of elements. |
1596 | ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); |
1597 | |
1598 | /// Calculate vectorization cost of memory instruction \p I. |
1599 | InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); |
1600 | |
1601 | /// The cost computation for scalarized memory instruction. |
1602 | InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); |
1603 | |
1604 | /// The cost computation for interleaving group of memory instructions. |
1605 | InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); |
1606 | |
1607 | /// The cost computation for Gather/Scatter instruction. |
1608 | InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); |
1609 | |
1610 | /// The cost computation for widening instruction \p I with consecutive |
1611 | /// memory access. |
1612 | InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); |
1613 | |
1614 | /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
1615 | /// Load: scalar load + broadcast. |
1616 | /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
1617 | /// element) |
1618 | InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); |
1619 | |
1620 | /// Estimate the overhead of scalarizing an instruction. This is a |
1621 | /// convenience wrapper for the type-based getScalarizationOverhead API. |
1622 | InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, |
1623 | TTI::TargetCostKind CostKind) const; |
1624 | |
1625 | /// Returns true if an artificially high cost for emulated masked memrefs |
1626 | /// should be used. |
1627 | bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); |
1628 | |
1629 | /// Map of scalar integer values to the smallest bitwidth they can be legally |
1630 | /// represented as. The vector equivalents of these values should be truncated |
1631 | /// to this type. |
1632 | MapVector<Instruction *, uint64_t> MinBWs; |
1633 | |
1634 | /// A type representing the costs for instructions if they were to be |
1635 | /// scalarized rather than vectorized. The entries are Instruction-Cost |
1636 | /// pairs. |
1637 | using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; |
1638 | |
/// A set containing all BasicBlocks that are known to be present after
/// vectorization as predicated blocks.
1641 | DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> |
1642 | PredicatedBBsAfterVectorization; |
1643 | |
1644 | /// Records whether it is allowed to have the original scalar loop execute at |
1645 | /// least once. This may be needed as a fallback loop in case runtime |
1646 | /// aliasing/dependence checks fail, or to handle the tail/remainder |
/// iterations when the trip count is unknown or is not divisible by the VF,
1648 | /// or as a peel-loop to handle gaps in interleave-groups. |
1649 | /// Under optsize and when the trip count is very small we don't allow any |
1650 | /// iterations to execute in the scalar loop. |
1651 | ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
1652 | |
/// The finally chosen tail folding style. The first element is used if the
/// IV update may overflow; the second if it does not.
1655 | std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> |
1656 | ChosenTailFoldingStyle; |
1657 | |
1658 | /// true if scalable vectorization is supported and enabled. |
1659 | std::optional<bool> IsScalableVectorizationAllowed; |
1660 | |
1661 | /// A map holding scalar costs for different vectorization factors. The |
1662 | /// presence of a cost for an instruction in the mapping indicates that the |
1663 | /// instruction will be scalarized when vectorizing with the associated |
1664 | /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
1665 | DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; |
1666 | |
1667 | /// Holds the instructions known to be uniform after vectorization. |
1668 | /// The data is collected per VF. |
1669 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; |
1670 | |
1671 | /// Holds the instructions known to be scalar after vectorization. |
1672 | /// The data is collected per VF. |
1673 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; |
1674 | |
1675 | /// Holds the instructions (address computations) that are forced to be |
1676 | /// scalarized. |
1677 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
1678 | |
1679 | /// PHINodes of the reductions that should be expanded in-loop. |
1680 | SmallPtrSet<PHINode *, 4> InLoopReductions; |
1681 | |
1682 | /// A Map of inloop reduction operations and their immediate chain operand. |
1683 | /// FIXME: This can be removed once reductions can be costed correctly in |
1684 | /// VPlan. This was added to allow quick lookup of the inloop operations. |
1685 | DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; |
1686 | |
1687 | /// Returns the expected difference in cost from scalarizing the expression |
1688 | /// feeding a predicated instruction \p PredInst. The instructions to |
1689 | /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
1690 | /// non-negative return value implies the expression will be scalarized. |
1691 | /// Currently, only single-use chains are considered for scalarization. |
1692 | InstructionCost computePredInstDiscount(Instruction *PredInst, |
1693 | ScalarCostsTy &ScalarCosts, |
1694 | ElementCount VF); |
1695 | |
1696 | /// Collect the instructions that are uniform after vectorization. An |
1697 | /// instruction is uniform if we represent it with a single scalar value in |
1698 | /// the vectorized loop corresponding to each vector iteration. Examples of |
1699 | /// uniform instructions include pointer operands of consecutive or |
1700 | /// interleaved memory accesses. Note that although uniformity implies an |
1701 | /// instruction will be scalar, the reverse is not true. In general, a |
1702 | /// scalarized instruction will be represented by VF scalar values in the |
1703 | /// vectorized loop, each corresponding to an iteration of the original |
1704 | /// scalar loop. |
1705 | void collectLoopUniforms(ElementCount VF); |
1706 | |
1707 | /// Collect the instructions that are scalar after vectorization. An |
1708 | /// instruction is scalar if it is known to be uniform or will be scalarized |
1709 | /// during vectorization. collectLoopScalars should only add non-uniform nodes |
1710 | /// to the list if they are used by a load/store instruction that is marked as |
1711 | /// CM_Scalarize. Non-uniform scalarized instructions will be represented by |
1712 | /// VF values in the vectorized loop, each corresponding to an iteration of |
1713 | /// the original scalar loop. |
1714 | void collectLoopScalars(ElementCount VF); |
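
// Illustrative distinction (hypothetical loop): for a consecutive store
// "A[i] = x", the address computation &A[i] is uniform, since a single scalar
// GEP per vector iteration suffices. An address feeding an access that was
// decided to be CM_Scalarize is merely scalar: it is replicated VF times,
// once per lane.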
1715 | |
1716 | /// Keeps cost model vectorization decision and cost for instructions. |
1717 | /// Right now it is used for memory instructions only. |
1718 | using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, |
1719 | std::pair<InstWidening, InstructionCost>>; |
1720 | |
1721 | DecisionList WideningDecisions; |
1722 | |
1723 | using CallDecisionList = |
1724 | DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; |
1725 | |
1726 | CallDecisionList CallWideningDecisions; |
1727 | |
1728 | /// Returns true if \p V is expected to be vectorized and it needs to be |
1729 | /// extracted. |
bool needsExtract(Value *V, ElementCount VF) const {
1731 | Instruction *I = dyn_cast<Instruction>(Val: V); |
1732 | if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) || |
1733 | TheLoop->isLoopInvariant(V: I)) |
1734 | return false; |
1735 | |
1736 | // Assume we can vectorize V (and hence we need extraction) if the |
// scalars are not computed yet. This can happen because this method is called
// via getScalarizationOverhead from setCostBasedWideningDecision, before
1739 | // the scalars are collected. That should be a safe assumption in most |
1740 | // cases, because we check if the operands have vectorizable types |
1741 | // beforehand in LoopVectorizationLegality. |
1742 | return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF); |
1743 | }; |
1744 | |
1745 | /// Returns a range containing only operands needing to be extracted. |
1746 | SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
1747 | ElementCount VF) const { |
1748 | return SmallVector<Value *, 4>(make_filter_range( |
1749 | Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
1750 | } |
1751 | |
1752 | public: |
1753 | /// The loop that we evaluate. |
1754 | Loop *TheLoop; |
1755 | |
1756 | /// Predicated scalar evolution analysis. |
1757 | PredicatedScalarEvolution &PSE; |
1758 | |
1759 | /// Loop Info analysis. |
1760 | LoopInfo *LI; |
1761 | |
1762 | /// Vectorization legality. |
1763 | LoopVectorizationLegality *Legal; |
1764 | |
1765 | /// Vector target information. |
1766 | const TargetTransformInfo &TTI; |
1767 | |
1768 | /// Target Library Info. |
1769 | const TargetLibraryInfo *TLI; |
1770 | |
1771 | /// Demanded bits analysis. |
1772 | DemandedBits *DB; |
1773 | |
1774 | /// Assumption cache. |
1775 | AssumptionCache *AC; |
1776 | |
1777 | /// Interface to emit optimization remarks. |
1778 | OptimizationRemarkEmitter *ORE; |
1779 | |
1780 | const Function *TheFunction; |
1781 | |
1782 | /// Loop Vectorize Hint. |
1783 | const LoopVectorizeHints *Hints; |
1784 | |
1785 | /// The interleave access information contains groups of interleaved accesses |
1786 | /// with the same stride and close to each other. |
1787 | InterleavedAccessInfo &InterleaveInfo; |
1788 | |
1789 | /// Values to ignore in the cost model. |
1790 | SmallPtrSet<const Value *, 16> ValuesToIgnore; |
1791 | |
1792 | /// Values to ignore in the cost model when VF > 1. |
1793 | SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
1794 | |
1795 | /// All element types found in the loop. |
1796 | SmallPtrSet<Type *, 16> ElementTypesInLoop; |
1797 | }; |
1798 | } // end namespace llvm |
1799 | |
1800 | namespace { |
1801 | /// Helper struct to manage generating runtime checks for vectorization. |
1802 | /// |
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimation of their cost, and are un-linked from the existing IR. After the
/// decision to vectorize, the checks are moved back. If the decision is not to
/// vectorize, the temporary blocks are removed completely.
1807 | class GeneratedRTChecks { |
1808 | /// Basic block which contains the generated SCEV checks, if any. |
1809 | BasicBlock *SCEVCheckBlock = nullptr; |
1810 | |
1811 | /// The value representing the result of the generated SCEV checks. If it is |
1812 | /// nullptr, either no SCEV checks have been generated or they have been used. |
1813 | Value *SCEVCheckCond = nullptr; |
1814 | |
1815 | /// Basic block which contains the generated memory runtime checks, if any. |
1816 | BasicBlock *MemCheckBlock = nullptr; |
1817 | |
1818 | /// The value representing the result of the generated memory runtime checks. |
1819 | /// If it is nullptr, either no memory runtime checks have been generated or |
1820 | /// they have been used. |
1821 | Value *MemRuntimeCheckCond = nullptr; |
1822 | |
1823 | DominatorTree *DT; |
1824 | LoopInfo *LI; |
1825 | TargetTransformInfo *TTI; |
1826 | |
1827 | SCEVExpander SCEVExp; |
1828 | SCEVExpander MemCheckExp; |
1829 | |
1830 | bool CostTooHigh = false; |
1831 | const bool AddBranchWeights; |
1832 | |
1833 | Loop *OuterLoop = nullptr; |
1834 | |
1835 | public: |
1836 | GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, |
1837 | TargetTransformInfo *TTI, const DataLayout &DL, |
1838 | bool AddBranchWeights) |
1839 | : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check" ), |
1840 | MemCheckExp(SE, DL, "scev.check" ), AddBranchWeights(AddBranchWeights) {} |
1841 | |
1842 | /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can |
1843 | /// accurately estimate the cost of the runtime checks. The blocks are |
/// un-linked from the IR and are added back during vector code generation. If
1845 | /// there is no vector code generation, the check blocks are removed |
1846 | /// completely. |
1847 | void Create(Loop *L, const LoopAccessInfo &LAI, |
1848 | const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { |
1849 | |
1850 | // Hard cutoff to limit compile-time increase in case a very large number of |
1851 | // runtime checks needs to be generated. |
1852 | // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to |
1853 | // profile info. |
1854 | CostTooHigh = |
1855 | LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; |
1856 | if (CostTooHigh) |
1857 | return; |
1858 | |
BasicBlock *LoopHeader = L->getHeader();
BasicBlock *Preheader = L->getLoopPreheader();
1861 | |
1862 | // Use SplitBlock to create blocks for SCEV & memory runtime checks to |
1863 | // ensure the blocks are properly added to LoopInfo & DominatorTree. Those |
1864 | // may be used by SCEVExpander. The blocks will be un-linked from their |
1865 | // predecessors and removed from LI & DT at the end of the function. |
1866 | if (!UnionPred.isAlwaysTrue()) { |
1867 | SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI, |
1868 | MSSAU: nullptr, BBName: "vector.scevcheck" ); |
1869 | |
1870 | SCEVCheckCond = SCEVExp.expandCodeForPredicate( |
1871 | Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator()); |
1872 | } |
1873 | |
1874 | const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); |
1875 | if (RtPtrChecking.Need) { |
1876 | auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; |
1877 | MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr, |
1878 | BBName: "vector.memcheck" ); |
1879 | |
1880 | auto DiffChecks = RtPtrChecking.getDiffChecks(); |
1881 | if (DiffChecks) { |
1882 | Value *RuntimeVF = nullptr; |
1883 | MemRuntimeCheckCond = addDiffRuntimeChecks( |
1884 | Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp, |
1885 | GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { |
1886 | if (!RuntimeVF) |
1887 | RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF); |
1888 | return RuntimeVF; |
1889 | }, |
1890 | IC); |
1891 | } else { |
1892 | MemRuntimeCheckCond = addRuntimeChecks( |
1893 | Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(), |
1894 | Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks); |
1895 | } |
1896 | assert(MemRuntimeCheckCond && |
1897 | "no RT checks generated although RtPtrChecking " |
1898 | "claimed checks are required" ); |
1899 | } |
1900 | |
1901 | if (!MemCheckBlock && !SCEVCheckBlock) |
1902 | return; |
1903 | |
// Unhook the temporary blocks containing the checks and update the various
// places accordingly.
1906 | if (SCEVCheckBlock) |
1907 | SCEVCheckBlock->replaceAllUsesWith(V: Preheader); |
1908 | if (MemCheckBlock) |
1909 | MemCheckBlock->replaceAllUsesWith(V: Preheader); |
1910 | |
1911 | if (SCEVCheckBlock) { |
1912 | SCEVCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator()); |
1913 | new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); |
1914 | Preheader->getTerminator()->eraseFromParent(); |
1915 | } |
1916 | if (MemCheckBlock) { |
1917 | MemCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator()); |
1918 | new UnreachableInst(Preheader->getContext(), MemCheckBlock); |
1919 | Preheader->getTerminator()->eraseFromParent(); |
1920 | } |
1921 | |
1922 | DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader); |
1923 | if (MemCheckBlock) { |
1924 | DT->eraseNode(BB: MemCheckBlock); |
1925 | LI->removeBlock(BB: MemCheckBlock); |
1926 | } |
1927 | if (SCEVCheckBlock) { |
1928 | DT->eraseNode(BB: SCEVCheckBlock); |
1929 | LI->removeBlock(BB: SCEVCheckBlock); |
1930 | } |
1931 | |
1932 | // Outer loop is used as part of the later cost calculations. |
1933 | OuterLoop = L->getParentLoop(); |
1934 | } |
1935 | |
1936 | InstructionCost getCost() { |
1937 | if (SCEVCheckBlock || MemCheckBlock) |
1938 | LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n" ); |
1939 | |
1940 | if (CostTooHigh) { |
1941 | InstructionCost Cost; |
1942 | Cost.setInvalid(); |
1943 | LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n" ); |
1944 | return Cost; |
1945 | } |
1946 | |
1947 | InstructionCost RTCheckCost = 0; |
1948 | if (SCEVCheckBlock) |
1949 | for (Instruction &I : *SCEVCheckBlock) { |
1950 | if (SCEVCheckBlock->getTerminator() == &I) |
1951 | continue; |
1952 | InstructionCost C = |
1953 | TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput); |
1954 | LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" ); |
1955 | RTCheckCost += C; |
1956 | } |
1957 | if (MemCheckBlock) { |
1958 | InstructionCost MemCheckCost = 0; |
1959 | for (Instruction &I : *MemCheckBlock) { |
1960 | if (MemCheckBlock->getTerminator() == &I) |
1961 | continue; |
1962 | InstructionCost C = |
1963 | TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput); |
1964 | LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" ); |
1965 | MemCheckCost += C; |
1966 | } |
1967 | |
1968 | // If the runtime memory checks are being created inside an outer loop |
1969 | // we should find out if these checks are outer loop invariant. If so, |
// the checks will likely be hoisted out and so the effective cost will be
// reduced according to the outer loop trip count.
1972 | if (OuterLoop) { |
1973 | ScalarEvolution *SE = MemCheckExp.getSE(); |
1974 | // TODO: If profitable, we could refine this further by analysing every |
1975 | // individual memory check, since there could be a mixture of loop |
1976 | // variant and invariant checks that mean the final condition is |
1977 | // variant. |
1978 | const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond); |
1979 | if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) { |
1980 | // It seems reasonable to assume that we can reduce the effective |
1981 | // cost of the checks even when we know nothing about the trip |
1982 | // count. Assume that the outer loop executes at least twice. |
1983 | unsigned BestTripCount = 2; |
1984 | |
1985 | // If exact trip count is known use that. |
1986 | if (unsigned SmallTC = SE->getSmallConstantTripCount(L: OuterLoop)) |
1987 | BestTripCount = SmallTC; |
1988 | else if (LoopVectorizeWithBlockFrequency) { |
1989 | // Else use profile data if available. |
1990 | if (auto EstimatedTC = getLoopEstimatedTripCount(L: OuterLoop)) |
1991 | BestTripCount = *EstimatedTC; |
1992 | } |
1993 | |
1994 | BestTripCount = std::max(a: BestTripCount, b: 1U); |
1995 | InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; |
1996 | |
1997 | // Let's ensure the cost is always at least 1. |
1998 | NewMemCheckCost = std::max(a: *NewMemCheckCost.getValue(), |
1999 | b: (InstructionCost::CostType)1); |
2000 | |
2001 | if (BestTripCount > 1) |
2002 | LLVM_DEBUG(dbgs() |
2003 | << "We expect runtime memory checks to be hoisted " |
2004 | << "out of the outer loop. Cost reduced from " |
2005 | << MemCheckCost << " to " << NewMemCheckCost << '\n'); |
2006 | |
2007 | MemCheckCost = NewMemCheckCost; |
2008 | } |
2009 | } |
2010 | |
2011 | RTCheckCost += MemCheckCost; |
2012 | } |
2013 | |
2014 | if (SCEVCheckBlock || MemCheckBlock) |
2015 | LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost |
2016 | << "\n" ); |
2017 | |
2018 | return RTCheckCost; |
2019 | } |
2020 | |
2021 | /// Remove the created SCEV & memory runtime check blocks & instructions, if |
2022 | /// unused. |
2023 | ~GeneratedRTChecks() { |
2024 | SCEVExpanderCleaner SCEVCleaner(SCEVExp); |
2025 | SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); |
2026 | if (!SCEVCheckCond) |
2027 | SCEVCleaner.markResultUsed(); |
2028 | |
2029 | if (!MemRuntimeCheckCond) |
2030 | MemCheckCleaner.markResultUsed(); |
2031 | |
2032 | if (MemRuntimeCheckCond) { |
2033 | auto &SE = *MemCheckExp.getSE(); |
2034 | // Memory runtime check generation creates compares that use expanded |
2035 | // values. Remove them before running the SCEVExpanderCleaners. |
2036 | for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) { |
2037 | if (MemCheckExp.isInsertedInstruction(I: &I)) |
2038 | continue; |
2039 | SE.forgetValue(V: &I); |
2040 | I.eraseFromParent(); |
2041 | } |
2042 | } |
2043 | MemCheckCleaner.cleanup(); |
2044 | SCEVCleaner.cleanup(); |
2045 | |
2046 | if (SCEVCheckCond) |
2047 | SCEVCheckBlock->eraseFromParent(); |
2048 | if (MemRuntimeCheckCond) |
2049 | MemCheckBlock->eraseFromParent(); |
2050 | } |
2051 | |
2052 | /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and |
2053 | /// adjusts the branches to branch to the vector preheader or \p Bypass, |
2054 | /// depending on the generated condition. |
2055 | BasicBlock *emitSCEVChecks(BasicBlock *Bypass, |
BasicBlock *LoopVectorPreHeader,
2057 | BasicBlock *LoopExitBlock) { |
2058 | if (!SCEVCheckCond) |
2059 | return nullptr; |
2060 | |
2061 | Value *Cond = SCEVCheckCond; |
2062 | // Mark the check as used, to prevent it from being removed during cleanup. |
2063 | SCEVCheckCond = nullptr; |
2064 | if (auto *C = dyn_cast<ConstantInt>(Val: Cond)) |
2065 | if (C->isZero()) |
2066 | return nullptr; |
2067 | |
2068 | auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
2069 | |
2070 | BranchInst::Create(IfTrue: LoopVectorPreHeader, InsertBefore: SCEVCheckBlock); |
2071 | // Create new preheader for vector loop. |
2072 | if (OuterLoop) |
2073 | OuterLoop->addBasicBlockToLoop(NewBB: SCEVCheckBlock, LI&: *LI); |
2074 | |
2075 | SCEVCheckBlock->getTerminator()->eraseFromParent(); |
2076 | SCEVCheckBlock->moveBefore(MovePos: LoopVectorPreHeader); |
2077 | Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader, |
2078 | NewBB: SCEVCheckBlock); |
2079 | |
2080 | DT->addNewBlock(BB: SCEVCheckBlock, DomBB: Pred); |
2081 | DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: SCEVCheckBlock); |
2082 | |
2083 | BranchInst &BI = *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond); |
2084 | if (AddBranchWeights) |
2085 | setBranchWeights(I&: BI, Weights: SCEVCheckBypassWeights, /*IsExpected=*/false); |
2086 | ReplaceInstWithInst(From: SCEVCheckBlock->getTerminator(), To: &BI); |
2087 | return SCEVCheckBlock; |
2088 | } |
2089 | |
2090 | /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts |
2091 | /// the branches to branch to the vector preheader or \p Bypass, depending on |
2092 | /// the generated condition. |
2093 | BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, |
BasicBlock *LoopVectorPreHeader) {
2095 | // Check if we generated code that checks in runtime if arrays overlap. |
2096 | if (!MemRuntimeCheckCond) |
2097 | return nullptr; |
2098 | |
2099 | auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
2100 | Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader, |
2101 | NewBB: MemCheckBlock); |
2102 | |
2103 | DT->addNewBlock(BB: MemCheckBlock, DomBB: Pred); |
2104 | DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: MemCheckBlock); |
2105 | MemCheckBlock->moveBefore(MovePos: LoopVectorPreHeader); |
2106 | |
2107 | if (OuterLoop) |
2108 | OuterLoop->addBasicBlockToLoop(NewBB: MemCheckBlock, LI&: *LI); |
2109 | |
2110 | BranchInst &BI = |
2111 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: MemRuntimeCheckCond); |
2112 | if (AddBranchWeights) { |
2113 | setBranchWeights(I&: BI, Weights: MemCheckBypassWeights, /*IsExpected=*/false); |
2114 | } |
2115 | ReplaceInstWithInst(From: MemCheckBlock->getTerminator(), To: &BI); |
2116 | MemCheckBlock->getTerminator()->setDebugLoc( |
2117 | Pred->getTerminator()->getDebugLoc()); |
2118 | |
2119 | // Mark the check as used, to prevent it from being removed during cleanup. |
2120 | MemRuntimeCheckCond = nullptr; |
2121 | return MemCheckBlock; |
2122 | } |
2123 | }; |
2124 | } // namespace |
2125 | |
2126 | static bool useActiveLaneMask(TailFoldingStyle Style) { |
2127 | return Style == TailFoldingStyle::Data || |
2128 | Style == TailFoldingStyle::DataAndControlFlow || |
2129 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2130 | } |
2131 | |
2132 | static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { |
2133 | return Style == TailFoldingStyle::DataAndControlFlow || |
2134 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2135 | } |
2136 | |
2137 | // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
2138 | // vectorization. The loop needs to be annotated with #pragma omp simd |
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2140 | // vector length information is not provided, vectorization is not considered |
2141 | // explicit. Interleave hints are not allowed either. These limitations will be |
2142 | // relaxed in the future. |
// Please note that we are currently forced to abuse the pragma 'clang
2144 | // vectorize' semantics. This pragma provides *auto-vectorization hints* |
2145 | // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
2146 | // provides *explicit vectorization hints* (LV can bypass legal checks and |
2147 | // assume that vectorization is legal). However, both hints are implemented |
2148 | // using the same metadata (llvm.loop.vectorize, processed by |
2149 | // LoopVectorizeHints). This will be fixed in the future when the native IR |
2150 | // representation for pragma 'omp simd' is introduced. |
static bool isExplicitVecOuterLoop(Loop *OuterLp,
2152 | OptimizationRemarkEmitter *ORE) { |
2153 | assert(!OuterLp->isInnermost() && "This is not an outer loop" ); |
2154 | LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
2155 | |
2156 | // Only outer loops with an explicit vectorization hint are supported. |
2157 | // Unannotated outer loops are ignored. |
2158 | if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
2159 | return false; |
2160 | |
2161 | Function *Fn = OuterLp->getHeader()->getParent(); |
2162 | if (!Hints.allowVectorization(F: Fn, L: OuterLp, |
2163 | VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) { |
2164 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n" ); |
2165 | return false; |
2166 | } |
2167 | |
2168 | if (Hints.getInterleave() > 1) { |
2169 | // TODO: Interleave support is future work. |
2170 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
2171 | "outer loops.\n" ); |
2172 | Hints.emitRemarkWithHints(); |
2173 | return false; |
2174 | } |
2175 | |
2176 | return true; |
2177 | } |
2178 | |
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2180 | OptimizationRemarkEmitter *ORE, |
2181 | SmallVectorImpl<Loop *> &V) { |
2182 | // Collect inner loops and outer loops without irreducible control flow. For |
2183 | // now, only collect outer loops that have explicit vectorization hints. If we |
2184 | // are stress testing the VPlan H-CFG construction, we collect the outermost |
2185 | // loop of every loop nest. |
2186 | if (L.isInnermost() || VPlanBuildStressTest || |
2187 | (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) { |
2188 | LoopBlocksRPO RPOT(&L); |
2189 | RPOT.perform(LI); |
2190 | if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) { |
2191 | V.push_back(Elt: &L); |
2192 | // TODO: Collect inner loops inside marked outer loops in case |
2193 | // vectorization fails for the outer loop. Do not invoke |
2194 | // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
2195 | // already known to be reducible. We can use an inherited attribute for |
2196 | // that. |
2197 | return; |
2198 | } |
2199 | } |
2200 | for (Loop *InnerL : L) |
2201 | collectSupportedLoops(L&: *InnerL, LI, ORE, V); |
2202 | } |
2203 | |
2204 | //===----------------------------------------------------------------------===// |
2205 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
2206 | // LoopVectorizationCostModel and LoopVectorizationPlanner. |
2207 | //===----------------------------------------------------------------------===// |
2208 | |
2209 | /// Compute the transformed value of Index at offset StartValue using step |
2210 | /// StepValue. |
2211 | /// For integer induction, returns StartValue + Index * StepValue. |
2212 | /// For pointer induction, returns StartValue[Index * StepValue]. |
2213 | /// FIXME: The newly created binary instructions should contain nsw/nuw |
2214 | /// flags, which can be found from the original scalar operations. |
2215 | static Value * |
2216 | emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, |
2217 | Value *Step, |
2218 | InductionDescriptor::InductionKind InductionKind, |
2219 | const BinaryOperator *InductionBinOp) { |
2220 | Type *StepTy = Step->getType(); |
2221 | Value *CastedIndex = StepTy->isIntegerTy() |
2222 | ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy) |
2223 | : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy); |
2224 | if (CastedIndex != Index) { |
2225 | CastedIndex->setName(CastedIndex->getName() + ".cast" ); |
2226 | Index = CastedIndex; |
2227 | } |
2228 | |
2229 | // Note: the IR at this point is broken. We cannot use SE to create any new |
2230 | // SCEV and then expand it, hoping that SCEV's simplification will give us |
// more optimal code. Unfortunately, attempting to do so on invalid IR may
// lead to various SCEV crashes. So all we can do is use the builder and rely
// on InstCombine for future simplifications. Here we handle some trivial
2234 | // cases only. |
2235 | auto CreateAdd = [&B](Value *X, Value *Y) { |
2236 | assert(X->getType() == Y->getType() && "Types don't match!" ); |
2237 | if (auto *CX = dyn_cast<ConstantInt>(Val: X)) |
2238 | if (CX->isZero()) |
2239 | return Y; |
2240 | if (auto *CY = dyn_cast<ConstantInt>(Val: Y)) |
2241 | if (CY->isZero()) |
2242 | return X; |
2243 | return B.CreateAdd(LHS: X, RHS: Y); |
2244 | }; |
2245 | |
2246 | // We allow X to be a vector type, in which case Y will potentially be |
2247 | // splatted into a vector with the same element count. |
2248 | auto CreateMul = [&B](Value *X, Value *Y) { |
2249 | assert(X->getType()->getScalarType() == Y->getType() && |
2250 | "Types don't match!" ); |
2251 | if (auto *CX = dyn_cast<ConstantInt>(Val: X)) |
2252 | if (CX->isOne()) |
2253 | return Y; |
2254 | if (auto *CY = dyn_cast<ConstantInt>(Val: Y)) |
2255 | if (CY->isOne()) |
2256 | return X; |
2257 | VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType()); |
2258 | if (XVTy && !isa<VectorType>(Val: Y->getType())) |
2259 | Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y); |
2260 | return B.CreateMul(LHS: X, RHS: Y); |
2261 | }; |
2262 | |
2263 | switch (InductionKind) { |
2264 | case InductionDescriptor::IK_IntInduction: { |
2265 | assert(!isa<VectorType>(Index->getType()) && |
2266 | "Vector indices not supported for integer inductions yet" ); |
2267 | assert(Index->getType() == StartValue->getType() && |
2268 | "Index type does not match StartValue type" ); |
2269 | if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne()) |
2270 | return B.CreateSub(LHS: StartValue, RHS: Index); |
2271 | auto *Offset = CreateMul(Index, Step); |
2272 | return CreateAdd(StartValue, Offset); |
2273 | } |
2274 | case InductionDescriptor::IK_PtrInduction: |
2275 | return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step)); |
2276 | case InductionDescriptor::IK_FpInduction: { |
2277 | assert(!isa<VectorType>(Index->getType()) && |
2278 | "Vector indices not supported for FP inductions yet" ); |
2279 | assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value" ); |
2280 | assert(InductionBinOp && |
2281 | (InductionBinOp->getOpcode() == Instruction::FAdd || |
2282 | InductionBinOp->getOpcode() == Instruction::FSub) && |
2283 | "Original bin op should be defined for FP induction" ); |
2284 | |
2285 | Value *MulExp = B.CreateFMul(L: Step, R: Index); |
2286 | return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp, |
2287 | Name: "induction" ); |
2288 | } |
2289 | case InductionDescriptor::IK_NoInduction: |
2290 | return nullptr; |
2291 | } |
2292 | llvm_unreachable("invalid enum" ); |
2293 | } |
2294 | |
2295 | std::optional<unsigned> getMaxVScale(const Function &F, |
2296 | const TargetTransformInfo &TTI) { |
2297 | if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) |
2298 | return MaxVScale; |
2299 | |
2300 | if (F.hasFnAttribute(Kind: Attribute::VScaleRange)) |
2301 | return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax(); |
2302 | |
2303 | return std::nullopt; |
2304 | } |
2305 | |
2306 | /// For the given VF and UF and maximum trip count computed for the loop, return |
2307 | /// whether the induction variable might overflow in the vectorized loop. If not, |
2308 | /// then we know a runtime overflow check always evaluates to false and can be |
2309 | /// removed. |
2310 | static bool isIndvarOverflowCheckKnownFalse( |
2311 | const LoopVectorizationCostModel *Cost, |
2312 | ElementCount VF, std::optional<unsigned> UF = std::nullopt) { |
2313 | // Always be conservative if we don't know the exact unroll factor. |
2314 | unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); |
2315 | |
2316 | Type *IdxTy = Cost->Legal->getWidestInductionType(); |
2317 | APInt MaxUIntTripCount = cast<IntegerType>(Val: IdxTy)->getMask(); |
2318 | |
// The runtime overflow check is known to be false iff the (max) trip-count
2320 | // is known and (max) trip-count + (VF * UF) does not overflow in the type of |
2321 | // the vector loop induction variable. |
2322 | if (unsigned TC = |
2323 | Cost->PSE.getSE()->getSmallConstantMaxTripCount(L: Cost->TheLoop)) { |
2324 | uint64_t MaxVF = VF.getKnownMinValue(); |
2325 | if (VF.isScalable()) { |
2326 | std::optional<unsigned> MaxVScale = |
2327 | getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI); |
2328 | if (!MaxVScale) |
2329 | return false; |
2330 | MaxVF *= *MaxVScale; |
2331 | } |
2332 | |
2333 | return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF); |
2334 | } |
2335 | |
2336 | return false; |
2337 | } |
2338 | |
2339 | // Return whether we allow using masked interleave-groups (for dealing with |
2340 | // strided loads/stores that reside in predicated blocks, or for dealing |
2341 | // with gaps). |
2342 | static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
2343 | // If an override option has been passed in for interleaved accesses, use it. |
2344 | if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
2345 | return EnableMaskedInterleavedMemAccesses; |
2346 | |
2347 | return TTI.enableMaskedInterleavedAccessVectorization(); |
2348 | } |
2349 | |
2350 | void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, |
2351 | VPReplicateRecipe *RepRecipe, |
2352 | const VPIteration &Instance, |
2353 | VPTransformState &State) { |
2354 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors" ); |
2355 | |
2356 | // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for |
2357 | // the first lane and part. |
2358 | if (isa<NoAliasScopeDeclInst>(Val: Instr)) |
2359 | if (!Instance.isFirstIteration()) |
2360 | return; |
2361 | |
// Does this instruction return a value?
2363 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
2364 | |
2365 | Instruction *Cloned = Instr->clone(); |
2366 | if (!IsVoidRetTy) { |
2367 | Cloned->setName(Instr->getName() + ".cloned" ); |
2368 | #if !defined(NDEBUG) |
2369 | // Verify that VPlan type inference results agree with the type of the |
2370 | // generated values. |
2371 | assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && |
2372 | "inferred type and type from generated instructions do not match" ); |
2373 | #endif |
2374 | } |
2375 | |
2376 | RepRecipe->setFlags(Cloned); |
2377 | |
2378 | if (auto DL = Instr->getDebugLoc()) |
2379 | State.setDebugLocFrom(DL); |
2380 | |
2381 | // Replace the operands of the cloned instructions with their scalar |
2382 | // equivalents in the new loop. |
2383 | for (const auto &I : enumerate(First: RepRecipe->operands())) { |
2384 | auto InputInstance = Instance; |
2385 | VPValue *Operand = I.value(); |
2386 | if (vputils::isUniformAfterVectorization(VPV: Operand)) |
2387 | InputInstance.Lane = VPLane::getFirstLane(); |
2388 | Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Instance: InputInstance)); |
2389 | } |
2390 | State.addNewMetadata(To: Cloned, Orig: Instr); |
2391 | |
2392 | // Place the cloned scalar in the new loop. |
2393 | State.Builder.Insert(I: Cloned); |
2394 | |
2395 | State.set(Def: RepRecipe, V: Cloned, Instance); |
2396 | |
  // If we just cloned a new assumption, add it to the assumption cache.
2398 | if (auto *II = dyn_cast<AssumeInst>(Val: Cloned)) |
2399 | AC->registerAssumption(CI: II); |
2400 | |
2401 | // End if-block. |
2402 | bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); |
2403 | if (IfPredicateInstr) |
2404 | PredicatedInstructions.push_back(Elt: Cloned); |
2405 | } |
2406 | |
2407 | Value * |
2408 | InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { |
2409 | if (VectorTripCount) |
2410 | return VectorTripCount; |
2411 | |
2412 | Value *TC = getTripCount(); |
2413 | IRBuilder<> Builder(InsertBlock->getTerminator()); |
2414 | |
2415 | Type *Ty = TC->getType(); |
2416 | // This is where we can make the step a runtime constant. |
2417 | Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF); |
2418 | |
2419 | // If the tail is to be folded by masking, round the number of iterations N |
2420 | // up to a multiple of Step instead of rounding down. This is done by first |
2421 | // adding Step-1 and then rounding down. Note that it's ok if this addition |
2422 | // overflows: the vector induction variable will eventually wrap to zero given |
2423 | // that it starts at zero and its Step is a power of two; the loop will then |
2424 | // exit, with the last early-exit vector comparison also producing all-true. |
2425 | // For scalable vectors the VF is not guaranteed to be a power of 2, but this |
2426 | // is accounted for in emitIterationCountCheck that adds an overflow check. |
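  // Purely illustrative example (hypothetical VF=4, UF=2, i.e. Step=8): a trip
  // count of 10 is rounded up to n.rnd.up = 17 below, which the URem/Sub
  // further down turn into n.mod.vf = 1 and n.vec = 16, i.e. two masked vector
  // iterations covering all 10 original iterations.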
2427 | if (Cost->foldTailByMasking()) { |
2428 | assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && |
2429 | "VF*UF must be a power of 2 when folding tail by masking" ); |
2430 | TC = Builder.CreateAdd(LHS: TC, RHS: Builder.CreateSub(LHS: Step, RHS: ConstantInt::get(Ty, V: 1)), |
2431 | Name: "n.rnd.up" ); |
2432 | } |
2433 | |
2434 | // Now we need to generate the expression for the part of the loop that the |
2435 | // vectorized body will execute. This is equal to N - (N % Step) if scalar |
2436 | // iterations are not required for correctness, or N - Step, otherwise. Step |
2437 | // is equal to the vectorization factor (number of SIMD elements) times the |
2438 | // unroll factor (number of SIMD instructions). |
2439 | Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf" ); |
2440 | |
2441 | // There are cases where we *must* run at least one iteration in the remainder |
2442 | // loop. See the cost model for when this can happen. If the step evenly |
2443 | // divides the trip count, we set the remainder to be equal to the step. If |
2444 | // the step does not evenly divide the trip count, no adjustment is necessary |
2445 | // since there will already be scalar iterations. Note that the minimum |
2446 | // iterations check ensures that N >= Step. |
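  // Illustration with hypothetical numbers (VF=4, UF=2, Step=8): for TC=19,
  // n.mod.vf = 3 already leaves scalar iterations, so no adjustment is needed
  // and n.vec = 16; for TC=16, n.mod.vf = 0, so the select below bumps the
  // remainder to Step and n.vec = 8, guaranteeing the epilogue executes.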
2447 | if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) { |
2448 | auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0)); |
2449 | R = Builder.CreateSelect(C: IsZero, True: Step, False: R); |
2450 | } |
2451 | |
2452 | VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec" ); |
2453 | |
2454 | return VectorTripCount; |
2455 | } |
2456 | |
2457 | void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { |
2458 | Value *Count = getTripCount(); |
2459 | // Reuse existing vector loop preheader for TC checks. |
2460 | // Note that new preheader block is generated for vector loop. |
2461 | BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
2462 | IRBuilder<> Builder(TCCheckBlock->getTerminator()); |
2463 | |
2464 | // Generate code to check if the loop's trip count is less than VF * UF, or |
2465 | // equal to it in case a scalar epilogue is required; this implies that the |
2466 | // vector trip count is zero. This check also covers the case where adding one |
2467 | // to the backedge-taken count overflowed leading to an incorrect trip count |
2468 | // of zero. In this case we will also jump to the scalar loop. |
2469 | auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE |
2470 | : ICmpInst::ICMP_ULT; |
2471 | |
2472 | // If tail is to be folded, vector loop takes care of all iterations. |
2473 | Type *CountTy = Count->getType(); |
2474 | Value *CheckMinIters = Builder.getFalse(); |
2475 | auto CreateStep = [&]() -> Value * { |
    // Create step with max(MinProfitableTripCount, UF * VF).
2477 | if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) |
2478 | return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF); |
2479 | |
2480 | Value *MinProfTC = |
2481 | createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1); |
2482 | if (!VF.isScalable()) |
2483 | return MinProfTC; |
2484 | return Builder.CreateBinaryIntrinsic( |
2485 | ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF)); |
2486 | }; |
2487 | |
2488 | TailFoldingStyle Style = Cost->getTailFoldingStyle(); |
2489 | if (Style == TailFoldingStyle::None) |
2490 | CheckMinIters = |
2491 | Builder.CreateICmp(P, LHS: Count, RHS: CreateStep(), Name: "min.iters.check" ); |
2492 | else if (VF.isScalable() && |
2493 | !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && |
2494 | Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { |
2495 | // vscale is not necessarily a power-of-2, which means we cannot guarantee |
2496 | // an overflow to zero when updating induction variables and so an |
2497 | // additional overflow check is required before entering the vector loop. |
2498 | |
2499 | // Get the maximum unsigned value for the type. |
2500 | Value *MaxUIntTripCount = |
2501 | ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask()); |
2502 | Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count); |
2503 | |
2504 | // Don't execute the vector loop if (UMax - n) < (VF * UF). |
2505 | CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep()); |
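    // E.g., with a hypothetical i32 trip count n = 0xFFFFFFFD and a runtime
    // step (VF * UF * vscale) of 8: UMax - n = 2 < 8, so CheckMinIters is true
    // and we branch to the scalar loop, because stepping the rounded-up count
    // by 8 would wrap the induction variable.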
2506 | } |
2507 | |
2508 | // Create new preheader for vector loop. |
2509 | LoopVectorPreHeader = |
2510 | SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), DT, LI, MSSAU: nullptr, |
2511 | BBName: "vector.ph" ); |
2512 | |
2513 | assert(DT->properlyDominates(DT->getNode(TCCheckBlock), |
2514 | DT->getNode(Bypass)->getIDom()) && |
2515 | "TC check is expected to dominate Bypass" ); |
2516 | |
2517 | // Update dominator for Bypass & LoopExit (if needed). |
2518 | DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock); |
2519 | BranchInst &BI = |
2520 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
2521 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) |
2522 | setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false); |
2523 | ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI); |
2524 | LoopBypassBlocks.push_back(Elt: TCCheckBlock); |
2525 | } |
2526 | |
2527 | BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { |
2528 | BasicBlock *const SCEVCheckBlock = |
2529 | RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); |
2530 | if (!SCEVCheckBlock) |
2531 | return nullptr; |
2532 | |
2533 | assert(!(SCEVCheckBlock->getParent()->hasOptSize() || |
2534 | (OptForSizeBasedOnProfile && |
2535 | Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && |
2536 | "Cannot SCEV check stride or overflow when optimizing for size" ); |
2537 | |
2538 | |
  // Update dominator only if this is the first RT check.
2540 | if (LoopBypassBlocks.empty()) { |
2541 | DT->changeImmediateDominator(BB: Bypass, NewBB: SCEVCheckBlock); |
2542 | if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) |
2543 | // If there is an epilogue which must run, there's no edge from the |
2544 | // middle block to exit blocks and thus no need to update the immediate |
2545 | // dominator of the exit blocks. |
2546 | DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: SCEVCheckBlock); |
2547 | } |
2548 | |
2549 | LoopBypassBlocks.push_back(Elt: SCEVCheckBlock); |
2550 | AddedSafetyChecks = true; |
2551 | return SCEVCheckBlock; |
2552 | } |
2553 | |
2554 | BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { |
2555 | // VPlan-native path does not do any analysis for runtime checks currently. |
2556 | if (EnableVPlanNativePath) |
2557 | return nullptr; |
2558 | |
2559 | BasicBlock *const MemCheckBlock = |
2560 | RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); |
2561 | |
  // Check if we generated code that checks at runtime if arrays overlap. We put
2563 | // the checks into a separate block to make the more common case of few |
2564 | // elements faster. |
2565 | if (!MemCheckBlock) |
2566 | return nullptr; |
2567 | |
2568 | if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { |
2569 | assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && |
2570 | "Cannot emit memory checks when optimizing for size, unless forced " |
2571 | "to vectorize." ); |
2572 | ORE->emit(RemarkBuilder: [&]() { |
2573 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize" , |
2574 | OrigLoop->getStartLoc(), |
2575 | OrigLoop->getHeader()) |
2576 | << "Code-size may be reduced by not forcing " |
2577 | "vectorization, or by source-code modifications " |
2578 | "eliminating the need for runtime checks " |
2579 | "(e.g., adding 'restrict')." ; |
2580 | }); |
2581 | } |
2582 | |
2583 | LoopBypassBlocks.push_back(Elt: MemCheckBlock); |
2584 | |
2585 | AddedSafetyChecks = true; |
2586 | |
2587 | return MemCheckBlock; |
2588 | } |
2589 | |
2590 | void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { |
2591 | LoopScalarBody = OrigLoop->getHeader(); |
2592 | LoopVectorPreHeader = OrigLoop->getLoopPreheader(); |
2593 | assert(LoopVectorPreHeader && "Invalid loop structure" ); |
2594 | LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr |
2595 | assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && |
2596 | "multiple exit loop without required epilogue?" ); |
2597 | |
2598 | LoopMiddleBlock = |
2599 | SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT, |
2600 | LI, MSSAU: nullptr, BBName: Twine(Prefix) + "middle.block" ); |
2601 | LoopScalarPreHeader = |
2602 | SplitBlock(Old: LoopMiddleBlock, SplitPt: LoopMiddleBlock->getTerminator(), DT, LI, |
2603 | MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph" ); |
2604 | } |
2605 | |
2606 | PHINode *InnerLoopVectorizer::createInductionResumeValue( |
2607 | PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, |
2608 | ArrayRef<BasicBlock *> BypassBlocks, |
2609 | std::pair<BasicBlock *, Value *> AdditionalBypass) { |
2610 | Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader); |
2611 | assert(VectorTripCount && "Expected valid arguments" ); |
2612 | |
2613 | Instruction *OldInduction = Legal->getPrimaryInduction(); |
2614 | Value *&EndValue = IVEndValues[OrigPhi]; |
2615 | Value *EndValueFromAdditionalBypass = AdditionalBypass.second; |
2616 | if (OrigPhi == OldInduction) { |
2617 | // We know what the end value is. |
2618 | EndValue = VectorTripCount; |
2619 | } else { |
2620 | IRBuilder<> B(LoopVectorPreHeader->getTerminator()); |
2621 | |
2622 | // Fast-math-flags propagate from the original induction instruction. |
2623 | if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp())) |
2624 | B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); |
2625 | |
2626 | EndValue = emitTransformedIndex(B, Index: VectorTripCount, StartValue: II.getStartValue(), |
2627 | Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp()); |
2628 | EndValue->setName("ind.end" ); |
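    // E.g., for an integer induction j = J0 + k * S the scalar epilogue
    // resumes at k = VectorTripCount, so the end value computed above is
    // J0 + VectorTripCount * S (the primary induction, handled above, simply
    // resumes at VectorTripCount itself).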
2629 | |
2630 | // Compute the end value for the additional bypass (if applicable). |
2631 | if (AdditionalBypass.first) { |
2632 | B.SetInsertPoint(TheBB: AdditionalBypass.first, |
2633 | IP: AdditionalBypass.first->getFirstInsertionPt()); |
2634 | EndValueFromAdditionalBypass = |
2635 | emitTransformedIndex(B, Index: AdditionalBypass.second, StartValue: II.getStartValue(), |
2636 | Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp()); |
2637 | EndValueFromAdditionalBypass->setName("ind.end" ); |
2638 | } |
2639 | } |
2640 | |
2641 | // Create phi nodes to merge from the backedge-taken check block. |
2642 | PHINode *BCResumeVal = PHINode::Create(Ty: OrigPhi->getType(), NumReservedValues: 3, NameStr: "bc.resume.val" , |
2643 | InsertBefore: LoopScalarPreHeader->getFirstNonPHI()); |
2644 | // Copy original phi DL over to the new one. |
2645 | BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); |
2646 | |
2647 | // The new PHI merges the original incoming value, in case of a bypass, |
2648 | // or the value at the end of the vectorized loop. |
2649 | BCResumeVal->addIncoming(V: EndValue, BB: LoopMiddleBlock); |
2650 | |
2651 | // Fix the scalar body counter (PHI node). |
2652 | // The old induction's phi node in the scalar body needs the truncated |
2653 | // value. |
2654 | for (BasicBlock *BB : BypassBlocks) |
2655 | BCResumeVal->addIncoming(V: II.getStartValue(), BB); |
2656 | |
2657 | if (AdditionalBypass.first) |
2658 | BCResumeVal->setIncomingValueForBlock(BB: AdditionalBypass.first, |
2659 | V: EndValueFromAdditionalBypass); |
2660 | return BCResumeVal; |
2661 | } |
2662 | |
2663 | /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV |
2664 | /// expansion results. |
2665 | static Value *getExpandedStep(const InductionDescriptor &ID, |
2666 | const SCEV2ValueTy &ExpandedSCEVs) { |
2667 | const SCEV *Step = ID.getStep(); |
2668 | if (auto *C = dyn_cast<SCEVConstant>(Val: Step)) |
2669 | return C->getValue(); |
2670 | if (auto *U = dyn_cast<SCEVUnknown>(Val: Step)) |
2671 | return U->getValue(); |
2672 | auto I = ExpandedSCEVs.find(Val: Step); |
2673 | assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point" ); |
2674 | return I->second; |
2675 | } |
2676 | |
2677 | void InnerLoopVectorizer::createInductionResumeValues( |
2678 | const SCEV2ValueTy &ExpandedSCEVs, |
2679 | std::pair<BasicBlock *, Value *> AdditionalBypass) { |
2680 | assert(((AdditionalBypass.first && AdditionalBypass.second) || |
2681 | (!AdditionalBypass.first && !AdditionalBypass.second)) && |
2682 | "Inconsistent information about additional bypass." ); |
2683 | // We are going to resume the execution of the scalar loop. |
2684 | // Go over all of the induction variables that we found and fix the |
2685 | // PHIs that are left in the scalar version of the loop. |
2686 | // The starting values of PHI nodes depend on the counter of the last |
2687 | // iteration in the vectorized loop. |
2688 | // If we come from a bypass edge then we need to start from the original |
2689 | // start value. |
2690 | for (const auto &InductionEntry : Legal->getInductionVars()) { |
2691 | PHINode *OrigPhi = InductionEntry.first; |
2692 | const InductionDescriptor &II = InductionEntry.second; |
2693 | PHINode *BCResumeVal = createInductionResumeValue( |
2694 | OrigPhi, II, Step: getExpandedStep(ID: II, ExpandedSCEVs), BypassBlocks: LoopBypassBlocks, |
2695 | AdditionalBypass); |
2696 | OrigPhi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: BCResumeVal); |
2697 | } |
2698 | } |
2699 | |
2700 | std::pair<BasicBlock *, Value *> |
2701 | InnerLoopVectorizer::createVectorizedLoopSkeleton( |
2702 | const SCEV2ValueTy &ExpandedSCEVs) { |
2703 | /* |
2704 | In this function we generate a new loop. The new loop will contain |
2705 | the vectorized instructions while the old loop will continue to run the |
2706 | scalar remainder. |
2707 | |
2708 | [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's |
2709 | / | preheader are expanded here. Eventually all required SCEV |
2710 | / | expansion should happen here. |
2711 | / v |
2712 | | [ ] <-- vector loop bypass (may consist of multiple blocks). |
2713 | | / | |
2714 | | / v |
2715 | || [ ] <-- vector pre header. |
2716 | |/ | |
2717 | | v |
2718 | | [ ] \ |
2719 | | [ ]_| <-- vector loop (created during VPlan execution). |
2720 | | | |
2721 | | v |
2722 | \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to |
2723 | | | successors created during VPlan execution) |
2724 | \/ | |
2725 | /\ v |
2726 | | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). |
2727 | | | |
2728 | (opt) v <-- edge from middle to exit iff epilogue is not required. |
2729 | | [ ] \ |
2730 | | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). |
2731 | \ | |
2732 | \ v |
2733 | >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) |
2734 | ... |
2735 | */ |
2736 | |
2737 | // Create an empty vector loop, and prepare basic blocks for the runtime |
2738 | // checks. |
2739 | createVectorLoopSkeleton(Prefix: "" ); |
2740 | |
2741 | // Now, compare the new count to zero. If it is zero skip the vector loop and |
2742 | // jump to the scalar loop. This check also covers the case where the |
2743 | // backedge-taken count is uint##_max: adding one to it will overflow leading |
2744 | // to an incorrect trip count of zero. In this (rare) case we will also jump |
2745 | // to the scalar loop. |
2746 | emitIterationCountCheck(Bypass: LoopScalarPreHeader); |
2747 | |
2748 | // Generate the code to check any assumptions that we've made for SCEV |
2749 | // expressions. |
2750 | emitSCEVChecks(Bypass: LoopScalarPreHeader); |
2751 | |
  // Generate the code that checks at runtime if arrays overlap. We put the
2753 | // checks into a separate block to make the more common case of few elements |
2754 | // faster. |
2755 | emitMemRuntimeChecks(Bypass: LoopScalarPreHeader); |
2756 | |
2757 | // Emit phis for the new starting index of the scalar loop. |
2758 | createInductionResumeValues(ExpandedSCEVs); |
2759 | |
2760 | return {LoopVectorPreHeader, nullptr}; |
2761 | } |
2762 | |
2763 | // Fix up external users of the induction variable. At this point, we are |
2764 | // in LCSSA form, with all external PHIs that use the IV having one input value, |
2765 | // coming from the remainder loop. We need those PHIs to also have a correct |
2766 | // value for the IV when arriving directly from the middle block. |
2767 | void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, |
2768 | const InductionDescriptor &II, |
2769 | Value *VectorTripCount, Value *EndValue, |
2770 | BasicBlock *MiddleBlock, |
2771 | BasicBlock *, VPlan &Plan, |
2772 | VPTransformState &State) { |
2773 | // There are two kinds of external IV usages - those that use the value |
2774 | // computed in the last iteration (the PHI) and those that use the penultimate |
2775 | // value (the value that feeds into the phi from the loop latch). |
2776 | // We allow both, but they, obviously, have different values. |
2777 | |
2778 | assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block" ); |
2779 | |
2780 | DenseMap<Value *, Value *> MissingVals; |
2781 | |
2782 | // An external user of the last iteration's value should see the value that |
2783 | // the remainder loop uses to initialize its own IV. |
2784 | Value *PostInc = OrigPhi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()); |
2785 | for (User *U : PostInc->users()) { |
2786 | Instruction *UI = cast<Instruction>(Val: U); |
2787 | if (!OrigLoop->contains(Inst: UI)) { |
2788 | assert(isa<PHINode>(UI) && "Expected LCSSA form" ); |
2789 | MissingVals[UI] = EndValue; |
2790 | } |
2791 | } |
2792 | |
  // An external user of the penultimate value needs to see EndValue - Step.
2794 | // The simplest way to get this is to recompute it from the constituent SCEVs, |
2795 | // that is Start + (Step * (CRD - 1)). |
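  // E.g., for a hypothetical induction starting at Start with step Step, an
  // LCSSA phi using %i (rather than %i.next) must see
  // Start + Step * (VectorTripCount - 1) when reached via the middle block,
  // which is exactly what the code below recomputes.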
2796 | for (User *U : OrigPhi->users()) { |
2797 | auto *UI = cast<Instruction>(Val: U); |
2798 | if (!OrigLoop->contains(Inst: UI)) { |
2799 | assert(isa<PHINode>(UI) && "Expected LCSSA form" ); |
2800 | IRBuilder<> B(MiddleBlock->getTerminator()); |
2801 | |
2802 | // Fast-math-flags propagate from the original induction instruction. |
2803 | if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp())) |
2804 | B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); |
2805 | |
2806 | Value *CountMinusOne = B.CreateSub( |
2807 | LHS: VectorTripCount, RHS: ConstantInt::get(Ty: VectorTripCount->getType(), V: 1)); |
2808 | CountMinusOne->setName("cmo" ); |
2809 | |
2810 | VPValue *StepVPV = Plan.getSCEVExpansion(S: II.getStep()); |
2811 | assert(StepVPV && "step must have been expanded during VPlan execution" ); |
2812 | Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() |
2813 | : State.get(Def: StepVPV, Instance: {0, 0}); |
2814 | Value *Escape = |
2815 | emitTransformedIndex(B, Index: CountMinusOne, StartValue: II.getStartValue(), Step, |
2816 | InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp()); |
2817 | Escape->setName("ind.escape" ); |
2818 | MissingVals[UI] = Escape; |
2819 | } |
2820 | } |
2821 | |
2822 | for (auto &I : MissingVals) { |
2823 | PHINode *PHI = cast<PHINode>(Val: I.first); |
    // One corner case we have to handle is two IVs "chasing" each other,
2825 | // that is %IV2 = phi [...], [ %IV1, %latch ] |
2826 | // In this case, if IV1 has an external use, we need to avoid adding both |
2827 | // "last value of IV1" and "penultimate value of IV2". So, verify that we |
2828 | // don't already have an incoming value for the middle block. |
2829 | if (PHI->getBasicBlockIndex(BB: MiddleBlock) == -1) { |
2830 | PHI->addIncoming(V: I.second, BB: MiddleBlock); |
2831 | Plan.removeLiveOut(PN: PHI); |
2832 | } |
2833 | } |
2834 | } |
2835 | |
2836 | namespace { |
2837 | |
2838 | struct CSEDenseMapInfo { |
2839 | static bool canHandle(const Instruction *I) { |
2840 | return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) || |
2841 | isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I); |
2842 | } |
2843 | |
2844 | static inline Instruction *getEmptyKey() { |
2845 | return DenseMapInfo<Instruction *>::getEmptyKey(); |
2846 | } |
2847 | |
2848 | static inline Instruction *getTombstoneKey() { |
2849 | return DenseMapInfo<Instruction *>::getTombstoneKey(); |
2850 | } |
2851 | |
2852 | static unsigned getHashValue(const Instruction *I) { |
2853 | assert(canHandle(I) && "Unknown instruction!" ); |
2854 | return hash_combine(args: I->getOpcode(), args: hash_combine_range(first: I->value_op_begin(), |
2855 | last: I->value_op_end())); |
2856 | } |
2857 | |
2858 | static bool isEqual(const Instruction *LHS, const Instruction *RHS) { |
2859 | if (LHS == getEmptyKey() || RHS == getEmptyKey() || |
2860 | LHS == getTombstoneKey() || RHS == getTombstoneKey()) |
2861 | return LHS == RHS; |
2862 | return LHS->isIdenticalTo(I: RHS); |
2863 | } |
2864 | }; |
2865 | |
2866 | } // end anonymous namespace |
2867 | |
/// Perform CSE of induction variable instructions.
2869 | static void cse(BasicBlock *BB) { |
2870 | // Perform simple cse. |
2871 | SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; |
2872 | for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) { |
2873 | if (!CSEDenseMapInfo::canHandle(I: &In)) |
2874 | continue; |
2875 | |
2876 | // Check if we can replace this instruction with any of the |
2877 | // visited instructions. |
2878 | if (Instruction *V = CSEMap.lookup(Val: &In)) { |
2879 | In.replaceAllUsesWith(V); |
2880 | In.eraseFromParent(); |
2881 | continue; |
2882 | } |
2883 | |
2884 | CSEMap[&In] = &In; |
2885 | } |
2886 | } |
2887 | |
2888 | InstructionCost |
2889 | LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, |
2890 | ElementCount VF) const { |
2891 | // We only need to calculate a cost if the VF is scalar; for actual vectors |
2892 | // we should already have a pre-calculated cost at each VF. |
2893 | if (!VF.isScalar()) |
2894 | return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)).Cost; |
2895 | |
2896 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2897 | Type *RetTy = CI->getType(); |
2898 | if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI)) |
2899 | if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy, CostKind)) |
2900 | return *RedCost; |
2901 | |
2902 | SmallVector<Type *, 4> Tys; |
2903 | for (auto &ArgOp : CI->args()) |
2904 | Tys.push_back(Elt: ArgOp->getType()); |
2905 | |
2906 | InstructionCost ScalarCallCost = |
2907 | TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind); |
2908 | |
2909 | // If this is an intrinsic we may have a lower cost for it. |
2910 | if (getVectorIntrinsicIDForCall(CI, TLI)) { |
2911 | InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); |
2912 | return std::min(a: ScalarCallCost, b: IntrinsicCost); |
2913 | } |
2914 | return ScalarCallCost; |
2915 | } |
2916 | |
2917 | static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { |
2918 | if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) |
2919 | return Elt; |
2920 | return VectorType::get(ElementType: Elt, EC: VF); |
2921 | } |
2922 | |
2923 | InstructionCost |
2924 | LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, |
2925 | ElementCount VF) const { |
2926 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
2927 | assert(ID && "Expected intrinsic call!" ); |
2928 | Type *RetTy = MaybeVectorizeType(Elt: CI->getType(), VF); |
2929 | FastMathFlags FMF; |
2930 | if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI)) |
2931 | FMF = FPMO->getFastMathFlags(); |
2932 | |
2933 | SmallVector<const Value *> Arguments(CI->args()); |
2934 | FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); |
2935 | SmallVector<Type *> ParamTys; |
2936 | std::transform(first: FTy->param_begin(), last: FTy->param_end(), |
2937 | result: std::back_inserter(x&: ParamTys), |
2938 | unary_op: [&](Type *Ty) { return MaybeVectorizeType(Elt: Ty, VF); }); |
2939 | |
2940 | IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, |
2941 | dyn_cast<IntrinsicInst>(Val: CI)); |
2942 | return TTI.getIntrinsicInstrCost(ICA: CostAttrs, |
2943 | CostKind: TargetTransformInfo::TCK_RecipThroughput); |
2944 | } |
2945 | |
2946 | void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, |
2947 | VPlan &Plan) { |
2948 | // Fix widened non-induction PHIs by setting up the PHI operands. |
2949 | if (EnableVPlanNativePath) |
2950 | fixNonInductionPHIs(Plan, State); |
2951 | |
2952 | // Forget the original basic block. |
2953 | PSE.getSE()->forgetLoop(L: OrigLoop); |
2954 | PSE.getSE()->forgetBlockAndLoopDispositions(); |
2955 | |
2956 | // After vectorization, the exit blocks of the original loop will have |
2957 | // additional predecessors. Invalidate SCEVs for the exit phis in case SE |
2958 | // looked through single-entry phis. |
2959 | SmallVector<BasicBlock *> ExitBlocks; |
2960 | OrigLoop->getExitBlocks(ExitBlocks); |
2961 | for (BasicBlock *Exit : ExitBlocks) |
2962 | for (PHINode &PN : Exit->phis()) |
2963 | PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN); |
2964 | |
2965 | VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); |
2966 | VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); |
2967 | Loop *VectorLoop = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[LatchVPBB]); |
2968 | if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) { |
    // No edge from the middle block to the unique exit block has been inserted
    // and there is nothing to fix from the vector loop; phis should have
    // incoming values from the scalar loop only.
2972 | } else { |
2973 | // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking |
2974 | // the cost model. |
2975 | |
2976 | // If we inserted an edge from the middle block to the unique exit block, |
2977 | // update uses outside the loop (phis) to account for the newly inserted |
2978 | // edge. |
2979 | |
2980 | // Fix-up external users of the induction variables. |
2981 | for (const auto &Entry : Legal->getInductionVars()) |
2982 | fixupIVUsers(OrigPhi: Entry.first, II: Entry.second, |
2983 | VectorTripCount: getOrCreateVectorTripCount(InsertBlock: VectorLoop->getLoopPreheader()), |
2984 | EndValue: IVEndValues[Entry.first], MiddleBlock: LoopMiddleBlock, |
2985 | VectorHeader: VectorLoop->getHeader(), Plan, State); |
2986 | } |
2987 | |
2988 | // Fix live-out phis not already fixed earlier. |
2989 | for (const auto &KV : Plan.getLiveOuts()) |
2990 | KV.second->fixPhi(Plan, State); |
2991 | |
2992 | for (Instruction *PI : PredicatedInstructions) |
2993 | sinkScalarOperands(PredInst: &*PI); |
2994 | |
2995 | // Remove redundant induction instructions. |
2996 | cse(BB: VectorLoop->getHeader()); |
2997 | |
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3011 | setProfileInfoAfterUnrolling(OrigLoop: LI->getLoopFor(BB: LoopScalarBody), UnrolledLoop: VectorLoop, |
3012 | RemainderLoop: LI->getLoopFor(BB: LoopScalarBody), |
3013 | UF: VF.getKnownMinValue() * UF); |
3014 | } |
3015 | |
3016 | void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { |
3017 | // The basic block and loop containing the predicated instruction. |
3018 | auto *PredBB = PredInst->getParent(); |
3019 | auto *VectorLoop = LI->getLoopFor(BB: PredBB); |
3020 | |
3021 | // Initialize a worklist with the operands of the predicated instruction. |
3022 | SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); |
3023 | |
3024 | // Holds instructions that we need to analyze again. An instruction may be |
3025 | // reanalyzed if we don't yet know if we can sink it or not. |
3026 | SmallVector<Instruction *, 8> InstsToReanalyze; |
3027 | |
3028 | // Returns true if a given use occurs in the predicated block. Phi nodes use |
3029 | // their operands in their corresponding predecessor blocks. |
3030 | auto isBlockOfUsePredicated = [&](Use &U) -> bool { |
3031 | auto *I = cast<Instruction>(Val: U.getUser()); |
3032 | BasicBlock *BB = I->getParent(); |
3033 | if (auto *Phi = dyn_cast<PHINode>(Val: I)) |
3034 | BB = Phi->getIncomingBlock( |
3035 | i: PHINode::getIncomingValueNumForOperand(i: U.getOperandNo())); |
3036 | return BB == PredBB; |
3037 | }; |
3038 | |
3039 | // Iteratively sink the scalarized operands of the predicated instruction |
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends once a full
  // pass over the worklist fails to sink a single instruction.
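  // Hypothetical sketch: if VPlan already sank a scalarized store into its
  // predicated block, the GEP computing its address (used only by that store)
  // can be sunk here as well, and sinking the GEP may in turn allow sinking
  // the instruction computing its index, hence the fixed-point loop below.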
3043 | bool Changed; |
3044 | do { |
3045 | // Add the instructions that need to be reanalyzed to the worklist, and |
3046 | // reset the changed indicator. |
3047 | Worklist.insert(Start: InstsToReanalyze.begin(), End: InstsToReanalyze.end()); |
3048 | InstsToReanalyze.clear(); |
3049 | Changed = false; |
3050 | |
3051 | while (!Worklist.empty()) { |
3052 | auto *I = dyn_cast<Instruction>(Val: Worklist.pop_back_val()); |
3053 | |
3054 | // We can't sink an instruction if it is a phi node, is not in the loop, |
3055 | // may have side effects or may read from memory. |
      // TODO: Could do more granular checking to allow sinking a load past
      // non-store instructions.
3057 | if (!I || isa<PHINode>(Val: I) || !VectorLoop->contains(Inst: I) || |
3058 | I->mayHaveSideEffects() || I->mayReadFromMemory()) |
3059 | continue; |
3060 | |
3061 | // If the instruction is already in PredBB, check if we can sink its |
3062 | // operands. In that case, VPlan's sinkScalarOperands() succeeded in |
3063 | // sinking the scalar instruction I, hence it appears in PredBB; but it |
3064 | // may have failed to sink I's operands (recursively), which we try |
3065 | // (again) here. |
3066 | if (I->getParent() == PredBB) { |
3067 | Worklist.insert(Start: I->op_begin(), End: I->op_end()); |
3068 | continue; |
3069 | } |
3070 | |
3071 | // It's legal to sink the instruction if all its uses occur in the |
3072 | // predicated block. Otherwise, there's nothing to do yet, and we may |
3073 | // need to reanalyze the instruction. |
3074 | if (!llvm::all_of(Range: I->uses(), P: isBlockOfUsePredicated)) { |
3075 | InstsToReanalyze.push_back(Elt: I); |
3076 | continue; |
3077 | } |
3078 | |
3079 | // Move the instruction to the beginning of the predicated block, and add |
      // its operands to the worklist.
3081 | I->moveBefore(MovePos: &*PredBB->getFirstInsertionPt()); |
3082 | Worklist.insert(Start: I->op_begin(), End: I->op_end()); |
3083 | |
3084 | // The sinking may have enabled other instructions to be sunk, so we will |
3085 | // need to iterate. |
3086 | Changed = true; |
3087 | } |
3088 | } while (Changed); |
3089 | } |
3090 | |
3091 | void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, |
3092 | VPTransformState &State) { |
3093 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
3094 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { |
3095 | for (VPRecipeBase &P : VPBB->phis()) { |
3096 | VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P); |
3097 | if (!VPPhi) |
3098 | continue; |
3099 | PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi, Part: 0)); |
3100 | // Make sure the builder has a valid insert point. |
3101 | Builder.SetInsertPoint(NewPhi); |
3102 | for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { |
3103 | VPValue *Inc = VPPhi->getIncomingValue(I: i); |
3104 | VPBasicBlock *VPBB = VPPhi->getIncomingBlock(I: i); |
3105 | NewPhi->addIncoming(V: State.get(Def: Inc, Part: 0), BB: State.CFG.VPBB2IRBB[VPBB]); |
3106 | } |
3107 | } |
3108 | } |
3109 | } |
3110 | |
3111 | void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { |
3112 | // We should not collect Scalars more than once per VF. Right now, this |
3113 | // function is called from collectUniformsAndScalars(), which already does |
3114 | // this check. Collecting Scalars for VF=1 does not make any sense. |
3115 | assert(VF.isVector() && !Scalars.contains(VF) && |
3116 | "This function should not be visited twice for the same VF" ); |
3117 | |
3118 | // This avoids any chances of creating a REPLICATE recipe during planning |
3119 | // since that would result in generation of scalarized code during execution, |
3120 | // which is not supported for scalable vectors. |
3121 | if (VF.isScalable()) { |
3122 | Scalars[VF].insert(I: Uniforms[VF].begin(), E: Uniforms[VF].end()); |
3123 | return; |
3124 | } |
3125 | |
3126 | SmallSetVector<Instruction *, 8> Worklist; |
3127 | |
3128 | // These sets are used to seed the analysis with pointers used by memory |
3129 | // accesses that will remain scalar. |
3130 | SmallSetVector<Instruction *, 8> ScalarPtrs; |
3131 | SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; |
3132 | auto *Latch = TheLoop->getLoopLatch(); |
3133 | |
3134 | // A helper that returns true if the use of Ptr by MemAccess will be scalar. |
3135 | // The pointer operands of loads and stores will be scalar as long as the |
3136 | // memory access is not a gather or scatter operation. The value operand of a |
3137 | // store will remain scalar if the store is scalarized. |
3138 | auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { |
3139 | InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF); |
3140 | assert(WideningDecision != CM_Unknown && |
3141 | "Widening decision should be ready at this moment" ); |
3142 | if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess)) |
3143 | if (Ptr == Store->getValueOperand()) |
3144 | return WideningDecision == CM_Scalarize; |
3145 | assert(Ptr == getLoadStorePointerOperand(MemAccess) && |
3146 | "Ptr is neither a value or pointer operand" ); |
3147 | return WideningDecision != CM_GatherScatter; |
3148 | }; |
3149 | |
3150 | // A helper that returns true if the given value is a bitcast or |
3151 | // getelementptr instruction contained in the loop. |
3152 | auto isLoopVaryingBitCastOrGEP = [&](Value *V) { |
3153 | return ((isa<BitCastInst>(Val: V) && V->getType()->isPointerTy()) || |
3154 | isa<GetElementPtrInst>(Val: V)) && |
3155 | !TheLoop->isLoopInvariant(V); |
3156 | }; |
3157 | |
3158 | // A helper that evaluates a memory access's use of a pointer. If the use will |
3159 | // be a scalar use and the pointer is only used by memory accesses, we place |
3160 | // the pointer in ScalarPtrs. Otherwise, the pointer is placed in |
3161 | // PossibleNonScalarPtrs. |
3162 | auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { |
3163 | // We only care about bitcast and getelementptr instructions contained in |
3164 | // the loop. |
3165 | if (!isLoopVaryingBitCastOrGEP(Ptr)) |
3166 | return; |
3167 | |
3168 | // If the pointer has already been identified as scalar (e.g., if it was |
3169 | // also identified as uniform), there's nothing to do. |
3170 | auto *I = cast<Instruction>(Val: Ptr); |
3171 | if (Worklist.count(key: I)) |
3172 | return; |
3173 | |
3174 | // If the use of the pointer will be a scalar use, and all users of the |
3175 | // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, |
3176 | // place the pointer in PossibleNonScalarPtrs. |
3177 | if (isScalarUse(MemAccess, Ptr) && llvm::all_of(Range: I->users(), P: [&](User *U) { |
3178 | return isa<LoadInst>(Val: U) || isa<StoreInst>(Val: U); |
3179 | })) |
3180 | ScalarPtrs.insert(X: I); |
3181 | else |
3182 | PossibleNonScalarPtrs.insert(Ptr: I); |
3183 | }; |
3184 | |
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
3189 | // |
3190 | // (1) Add to the worklist all instructions that have been identified as |
3191 | // uniform-after-vectorization. |
3192 | Worklist.insert(Start: Uniforms[VF].begin(), End: Uniforms[VF].end()); |
3193 | |
3194 | // (2) Add to the worklist all bitcast and getelementptr instructions used by |
3195 | // memory accesses requiring a scalar use. The pointer operands of loads and |
  // stores will be scalar as long as the memory access is not a gather or
3197 | // scatter operation. The value operand of a store will remain scalar if the |
3198 | // store is scalarized. |
3199 | for (auto *BB : TheLoop->blocks()) |
3200 | for (auto &I : *BB) { |
3201 | if (auto *Load = dyn_cast<LoadInst>(Val: &I)) { |
3202 | evaluatePtrUse(Load, Load->getPointerOperand()); |
3203 | } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) { |
3204 | evaluatePtrUse(Store, Store->getPointerOperand()); |
3205 | evaluatePtrUse(Store, Store->getValueOperand()); |
3206 | } |
3207 | } |
3208 | for (auto *I : ScalarPtrs) |
3209 | if (!PossibleNonScalarPtrs.count(Ptr: I)) { |
3210 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n" ); |
3211 | Worklist.insert(X: I); |
3212 | } |
3213 | |
3214 | // Insert the forced scalars. |
3215 | // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector |
3216 | // induction variable when the PHI user is scalarized. |
3217 | auto ForcedScalar = ForcedScalars.find(Val: VF); |
3218 | if (ForcedScalar != ForcedScalars.end()) |
3219 | for (auto *I : ForcedScalar->second) { |
3220 | LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n" ); |
3221 | Worklist.insert(X: I); |
3222 | } |
3223 | |
3224 | // Expand the worklist by looking through any bitcasts and getelementptr |
3225 | // instructions we've already identified as scalar. This is similar to the |
3226 | // expansion step in collectLoopUniforms(); however, here we're only |
3227 | // expanding to include additional bitcasts and getelementptr instructions. |
3228 | unsigned Idx = 0; |
3229 | while (Idx != Worklist.size()) { |
3230 | Instruction *Dst = Worklist[Idx++]; |
3231 | if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(i: 0))) |
3232 | continue; |
3233 | auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0)); |
3234 | if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool { |
3235 | auto *J = cast<Instruction>(Val: U); |
3236 | return !TheLoop->contains(Inst: J) || Worklist.count(key: J) || |
3237 | ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) && |
3238 | isScalarUse(J, Src)); |
3239 | })) { |
3240 | Worklist.insert(X: Src); |
3241 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n" ); |
3242 | } |
3243 | } |
3244 | |
3245 | // An induction variable will remain scalar if all users of the induction |
3246 | // variable and induction variable update remain scalar. |
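  // Hypothetical example:
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // If every user of %iv and %iv.next is the other one of the pair, lives
  // outside the loop, is already known to be scalar (e.g. a scalarized GEP in
  // the worklist), or is a direct load/store use of a pointer induction, both
  // remain scalar; any other vectorized user keeps them off the list.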
3247 | for (const auto &Induction : Legal->getInductionVars()) { |
3248 | auto *Ind = Induction.first; |
3249 | auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch)); |
3250 | |
3251 | // If tail-folding is applied, the primary induction variable will be used |
3252 | // to feed a vector compare. |
3253 | if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) |
3254 | continue; |
3255 | |
3256 | // Returns true if \p Indvar is a pointer induction that is used directly by |
3257 | // load/store instruction \p I. |
3258 | auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, |
3259 | Instruction *I) { |
3260 | return Induction.second.getKind() == |
3261 | InductionDescriptor::IK_PtrInduction && |
3262 | (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) && |
3263 | Indvar == getLoadStorePointerOperand(V: I) && isScalarUse(I, Indvar); |
3264 | }; |
3265 | |
3266 | // Determine if all users of the induction variable are scalar after |
3267 | // vectorization. |
3268 | auto ScalarInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool { |
3269 | auto *I = cast<Instruction>(Val: U); |
3270 | return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) || |
3271 | IsDirectLoadStoreFromPtrIndvar(Ind, I); |
3272 | }); |
3273 | if (!ScalarInd) |
3274 | continue; |
3275 | |
    // If the induction variable update is a fixed-order recurrence, neither
    // the induction variable nor its update should be marked scalar after
    // vectorization.
3279 | auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate); |
3280 | if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi)) |
3281 | continue; |
3282 | |
3283 | // Determine if all users of the induction variable update instruction are |
3284 | // scalar after vectorization. |
3285 | auto ScalarIndUpdate = |
3286 | llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool { |
3287 | auto *I = cast<Instruction>(Val: U); |
3288 | return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) || |
3289 | IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); |
3290 | }); |
3291 | if (!ScalarIndUpdate) |
3292 | continue; |
3293 | |
3294 | // The induction variable and its update instruction will remain scalar. |
3295 | Worklist.insert(X: Ind); |
3296 | Worklist.insert(X: IndUpdate); |
3297 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n" ); |
3298 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate |
3299 | << "\n" ); |
3300 | } |
3301 | |
3302 | Scalars[VF].insert(I: Worklist.begin(), E: Worklist.end()); |
3303 | } |
3304 | |
3305 | bool LoopVectorizationCostModel::isScalarWithPredication( |
3306 | Instruction *I, ElementCount VF) const { |
3307 | if (!isPredicatedInst(I)) |
3308 | return false; |
3309 | |
3310 | // Do we have a non-scalar lowering for this predicated |
3311 | // instruction? No - it is scalar with predication. |
3312 | switch(I->getOpcode()) { |
3313 | default: |
3314 | return true; |
3315 | case Instruction::Call: |
3316 | if (VF.isScalar()) |
3317 | return true; |
3318 | return CallWideningDecisions.at(Val: std::make_pair(x: cast<CallInst>(Val: I), y&: VF)) |
3319 | .Kind == CM_Scalarize; |
3320 | case Instruction::Load: |
3321 | case Instruction::Store: { |
3322 | auto *Ptr = getLoadStorePointerOperand(V: I); |
3323 | auto *Ty = getLoadStoreType(I); |
3324 | Type *VTy = Ty; |
3325 | if (VF.isVector()) |
3326 | VTy = VectorType::get(ElementType: Ty, EC: VF); |
3327 | const Align Alignment = getLoadStoreAlignment(I); |
3328 | return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment) || |
3329 | TTI.isLegalMaskedGather(DataType: VTy, Alignment)) |
3330 | : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment) || |
3331 | TTI.isLegalMaskedScatter(DataType: VTy, Alignment)); |
3332 | } |
3333 | case Instruction::UDiv: |
3334 | case Instruction::SDiv: |
3335 | case Instruction::SRem: |
3336 | case Instruction::URem: { |
3337 | // We have the option to use the safe-divisor idiom to avoid predication. |
3338 | // The cost based decision here will always select safe-divisor for |
3339 | // scalable vectors as scalarization isn't legal. |
3340 | const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); |
3341 | return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); |
3342 | } |
3343 | } |
3344 | } |
3345 | |
3346 | bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { |
3347 | if (!blockNeedsPredicationForAnyReason(BB: I->getParent())) |
3348 | return false; |
3349 | |
3350 | // Can we prove this instruction is safe to unconditionally execute? |
3351 | // If not, we must use some form of predication. |
3352 | switch(I->getOpcode()) { |
3353 | default: |
3354 | return false; |
3355 | case Instruction::Load: |
3356 | case Instruction::Store: { |
3357 | if (!Legal->isMaskRequired(I)) |
3358 | return false; |
3359 | // When we know the load's address is loop invariant and the instruction |
3360 | // in the original scalar loop was unconditionally executed then we |
3361 | // don't need to mark it as a predicated instruction. Tail folding may |
3362 | // introduce additional predication, but we're guaranteed to always have |
3363 | // at least one active lane. We call Legal->blockNeedsPredication here |
    // because it doesn't query tail-folding. For stores, we need to prove not
    // only speculation safety (which follows from the same argument as loads),
    // but also that the value being stored is correct. The easiest form of the
    // latter is to require that all values stored are the same.
3368 | if (Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) && |
3369 | (isa<LoadInst>(Val: I) || |
3370 | (isa<StoreInst>(Val: I) && |
3371 | TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()))) && |
3372 | !Legal->blockNeedsPredication(BB: I->getParent())) |
3373 | return false; |
3374 | return true; |
3375 | } |
3376 | case Instruction::UDiv: |
3377 | case Instruction::SDiv: |
3378 | case Instruction::SRem: |
3379 | case Instruction::URem: |
    // TODO: We can use the loop-preheader as a context point here and get
    // context-sensitive reasoning.
3382 | return !isSafeToSpeculativelyExecute(I); |
3383 | case Instruction::Call: |
3384 | return Legal->isMaskRequired(I); |
3385 | } |
3386 | } |
3387 | |
3388 | std::pair<InstructionCost, InstructionCost> |
3389 | LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, |
3390 | ElementCount VF) const { |
3391 | assert(I->getOpcode() == Instruction::UDiv || |
3392 | I->getOpcode() == Instruction::SDiv || |
3393 | I->getOpcode() == Instruction::SRem || |
3394 | I->getOpcode() == Instruction::URem); |
3395 | assert(!isSafeToSpeculativelyExecute(I)); |
3396 | |
3397 | const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
3398 | |
3399 | // Scalarization isn't legal for scalable vector types |
3400 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); |
3401 | if (!VF.isScalable()) { |
3402 | // Get the scalarization cost and scale this amount by the probability of |
3403 | // executing the predicated block. If the instruction is not predicated, |
3404 | // we fall through to the next case. |
3405 | ScalarizationCost = 0; |
3406 | |
3407 | // These instructions have a non-void type, so account for the phi nodes |
3408 | // that we will create. This cost is likely to be zero. The phi node |
3409 | // cost, if any, should be scaled by the block probability because it |
3410 | // models a copy at the end of each predicated block. |
3411 | ScalarizationCost += VF.getKnownMinValue() * |
3412 | TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind); |
3413 | |
3414 | // The cost of the non-predicated instruction. |
3415 | ScalarizationCost += VF.getKnownMinValue() * |
3416 | TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind); |
3417 | |
3418 | // The cost of insertelement and extractelement instructions needed for |
3419 | // scalarization. |
3420 | ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); |
3421 | |
3422 | // Scale the cost by the probability of executing the predicated blocks. |
3423 | // This assumes the predicated block for each vector lane is equally |
3424 | // likely. |
3425 | ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); |
3426 | } |
3427 | InstructionCost SafeDivisorCost = 0; |
3428 | |
3429 | auto *VecTy = ToVectorTy(Scalar: I->getType(), EC: VF); |
3430 | |
3431 | // The cost of the select guard to ensure all lanes are well defined |
3432 | // after we speculate above any internal control flow. |
3433 | SafeDivisorCost += TTI.getCmpSelInstrCost( |
3434 | Opcode: Instruction::Select, ValTy: VecTy, |
3435 | CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF), |
3436 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
3437 | |
3438 | // Certain instructions can be cheaper to vectorize if they have a constant |
3439 | // second vector operand. One example of this are shifts on x86. |
3440 | Value *Op2 = I->getOperand(i: 1); |
3441 | auto Op2Info = TTI.getOperandInfo(V: Op2); |
3442 | if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && |
3443 | Legal->isInvariant(V: Op2)) |
3444 | Op2Info.Kind = TargetTransformInfo::OK_UniformValue; |
3445 | |
3446 | SmallVector<const Value *, 4> Operands(I->operand_values()); |
3447 | SafeDivisorCost += TTI.getArithmeticInstrCost( |
3448 | Opcode: I->getOpcode(), Ty: VecTy, CostKind, |
3449 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
3450 | Opd2Info: Op2Info, Args: Operands, CxtI: I); |
3451 | return {ScalarizationCost, SafeDivisorCost}; |
3452 | } |
3453 | |
3454 | bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( |
3455 | Instruction *I, ElementCount VF) const { |
3456 | assert(isAccessInterleaved(I) && "Expecting interleaved access." ); |
3457 | assert(getWideningDecision(I, VF) == CM_Unknown && |
3458 | "Decision should not be set yet." ); |
3459 | auto *Group = getInterleavedAccessGroup(Instr: I); |
3460 | assert(Group && "Must have a group." ); |
3461 | |
  // If the instruction's allocated size doesn't equal its type size, it
3463 | // requires padding and will be scalarized. |
3464 | auto &DL = I->getDataLayout(); |
3465 | auto *ScalarTy = getLoadStoreType(I); |
3466 | if (hasIrregularType(Ty: ScalarTy, DL)) |
3467 | return false; |
3468 | |
3469 | // If the group involves a non-integral pointer, we may not be able to |
3470 | // losslessly cast all values to a common type. |
3471 | unsigned InterleaveFactor = Group->getFactor(); |
3472 | bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy); |
3473 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
3474 | Instruction *Member = Group->getMember(Index: i); |
3475 | if (!Member) |
3476 | continue; |
3477 | auto *MemberTy = getLoadStoreType(I: Member); |
3478 | bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy); |
3479 | // Don't coerce non-integral pointers to integers or vice versa. |
3480 | if (MemberNI != ScalarNI) { |
3481 | // TODO: Consider adding special nullptr value case here |
3482 | return false; |
3483 | } else if (MemberNI && ScalarNI && |
3484 | ScalarTy->getPointerAddressSpace() != |
3485 | MemberTy->getPointerAddressSpace()) { |
3486 | return false; |
3487 | } |
3488 | } |
3489 | |
3490 | // Check if masking is required. |
3491 | // A Group may need masking for one of two reasons: it resides in a block that |
3492 | // needs predication, or it was decided to use masking to deal with gaps |
3493 | // (either a gap at the end of a load-access that may result in a speculative |
3494 | // load, or any gaps in a store-access). |
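  // For example, a store group with factor 2 but only one member (a gap) falls
  // under the last case below, while a load group with a gap at its end needs
  // epilogue masking when no scalar epilogue is allowed, to avoid speculatively
  // loading past the accessed data.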
3495 | bool PredicatedAccessRequiresMasking = |
3496 | blockNeedsPredicationForAnyReason(BB: I->getParent()) && |
3497 | Legal->isMaskRequired(I); |
3498 | bool LoadAccessWithGapsRequiresEpilogMasking = |
3499 | isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() && |
3500 | !isScalarEpilogueAllowed(); |
3501 | bool StoreAccessWithGapsRequiresMasking = |
3502 | isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()); |
3503 | if (!PredicatedAccessRequiresMasking && |
3504 | !LoadAccessWithGapsRequiresEpilogMasking && |
3505 | !StoreAccessWithGapsRequiresMasking) |
3506 | return true; |
3507 | |
3508 | // If masked interleaving is required, we expect that the user/target had |
3509 | // enabled it, because otherwise it either wouldn't have been created or |
3510 | // it should have been invalidated by the CostModel. |
3511 | assert(useMaskedInterleavedAccesses(TTI) && |
3512 | "Masked interleave-groups for predicated accesses are not enabled." ); |
3513 | |
3514 | if (Group->isReverse()) |
3515 | return false; |
3516 | |
3517 | auto *Ty = getLoadStoreType(I); |
3518 | const Align Alignment = getLoadStoreAlignment(I); |
3519 | return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment) |
3520 | : TTI.isLegalMaskedStore(DataType: Ty, Alignment); |
3521 | } |
3522 | |
3523 | bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( |
3524 | Instruction *I, ElementCount VF) { |
3525 | // Get and ensure we have a valid memory instruction. |
3526 | assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction" ); |
3527 | |
3528 | auto *Ptr = getLoadStorePointerOperand(V: I); |
3529 | auto *ScalarTy = getLoadStoreType(I); |
3530 | |
3531 | // In order to be widened, the pointer should be consecutive, first of all. |
3532 | if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr)) |
3533 | return false; |
3534 | |
3535 | // If the instruction is a store located in a predicated block, it will be |
3536 | // scalarized. |
3537 | if (isScalarWithPredication(I, VF)) |
3538 | return false; |
3539 | |
  // If the instruction's allocated size doesn't equal its type size, it
3541 | // requires padding and will be scalarized. |
3542 | auto &DL = I->getDataLayout(); |
3543 | if (hasIrregularType(Ty: ScalarTy, DL)) |
3544 | return false; |
3545 | |
3546 | return true; |
3547 | } |
3548 | |
3549 | void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { |
3550 | // We should not collect Uniforms more than once per VF. Right now, |
3551 | // this function is called from collectUniformsAndScalars(), which |
3552 | // already does this check. Collecting Uniforms for VF=1 does not make any |
3553 | // sense. |
3554 | |
3555 | assert(VF.isVector() && !Uniforms.contains(VF) && |
3556 | "This function should not be visited twice for the same VF" ); |
3557 | |
// Visit the list of Uniforms. Even if we do not find any uniform value,
// creating the entry means Uniforms.count(VF) will return 1, so this VF is
// not analyzed again.
3560 | Uniforms[VF].clear(); |
3561 | |
3562 | // We now know that the loop is vectorizable! |
3563 | // Collect instructions inside the loop that will remain uniform after |
3564 | // vectorization. |
3565 | |
// Global values, params and instructions outside of the current loop are out
// of scope.
3568 | auto isOutOfScope = [&](Value *V) -> bool { |
3569 | Instruction *I = dyn_cast<Instruction>(Val: V); |
3570 | return (!I || !TheLoop->contains(Inst: I)); |
3571 | }; |
3572 | |
3573 | // Worklist containing uniform instructions demanding lane 0. |
3574 | SetVector<Instruction *> Worklist; |
3575 | |
3576 | // Add uniform instructions demanding lane 0 to the worklist. Instructions |
3577 | // that require predication must not be considered uniform after |
3578 | // vectorization, because that would create an erroneous replicating region |
3579 | // where only a single instance out of VF should be formed. |
3580 | auto addToWorklistIfAllowed = [&](Instruction *I) -> void { |
3581 | if (isOutOfScope(I)) { |
3582 | LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " |
3583 | << *I << "\n" ); |
3584 | return; |
3585 | } |
3586 | if (isPredicatedInst(I)) { |
3587 | LLVM_DEBUG( |
3588 | dbgs() << "LV: Found not uniform due to requiring predication: " << *I |
3589 | << "\n" ); |
3590 | return; |
3591 | } |
3592 | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n" ); |
3593 | Worklist.insert(X: I); |
3594 | }; |
3595 | |
3596 | // Start with the conditional branches exiting the loop. If the branch |
3597 | // condition is an instruction contained in the loop that is only used by the |
3598 | // branch, it is uniform. |
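// For example (hypothetical IR names): a compare like
//   %exit.cond = icmp eq i64 %iv.next, %n
// whose only use is the latch's conditional branch only needs its lane-0
// value after vectorization, so it can stay uniform.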
3599 | SmallVector<BasicBlock *> Exiting; |
3600 | TheLoop->getExitingBlocks(ExitingBlocks&: Exiting); |
3601 | for (BasicBlock *E : Exiting) { |
3602 | auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0)); |
3603 | if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse()) |
3604 | addToWorklistIfAllowed(Cmp); |
3605 | } |
3606 | |
3607 | auto PrevVF = VF.divideCoefficientBy(RHS: 2); |
// Return true if all lanes perform the same memory operation, and we can
// thus choose to execute only one.
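// For example (illustrative): a load from a loop-invariant address such as
//   %v = load i32, ptr %p        ; %p defined outside the loop
// produces the same value in every lane, as does a store of a loop-invariant
// value to such an address, so a single scalar instance suffices.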
3610 | auto isUniformMemOpUse = [&](Instruction *I) { |
3611 | // If the value was already known to not be uniform for the previous |
3612 | // (smaller VF), it cannot be uniform for the larger VF. |
3613 | if (PrevVF.isVector()) { |
3614 | auto Iter = Uniforms.find(Val: PrevVF); |
3615 | if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I)) |
3616 | return false; |
3617 | } |
3618 | if (!Legal->isUniformMemOp(I&: *I, VF)) |
3619 | return false; |
3620 | if (isa<LoadInst>(Val: I)) |
3621 | // Loading the same address always produces the same result - at least |
3622 | // assuming aliasing and ordering which have already been checked. |
3623 | return true; |
3624 | // Storing the same value on every iteration. |
3625 | return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()); |
3626 | }; |
3627 | |
3628 | auto isUniformDecision = [&](Instruction *I, ElementCount VF) { |
3629 | InstWidening WideningDecision = getWideningDecision(I, VF); |
3630 | assert(WideningDecision != CM_Unknown && |
3631 | "Widening decision should be ready at this moment" ); |
3632 | |
3633 | if (isUniformMemOpUse(I)) |
3634 | return true; |
3635 | |
3636 | return (WideningDecision == CM_Widen || |
3637 | WideningDecision == CM_Widen_Reverse || |
3638 | WideningDecision == CM_Interleave); |
3639 | }; |
3640 | |
3641 | // Returns true if Ptr is the pointer operand of a memory access instruction |
3642 | // I, I is known to not require scalarization, and the pointer is not also |
3643 | // stored. |
3644 | auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { |
3645 | if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr) |
3646 | return false; |
3647 | return getLoadStorePointerOperand(V: I) == Ptr && |
3648 | (isUniformDecision(I, VF) || Legal->isInvariant(V: Ptr)); |
3649 | }; |
3650 | |
3651 | // Holds a list of values which are known to have at least one uniform use. |
3652 | // Note that there may be other uses which aren't uniform. A "uniform use" |
3653 | // here is something which only demands lane 0 of the unrolled iterations; |
// it does not imply that all lanes produce the same value (i.e. this is not
// the usual meaning of uniform).
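// For example (illustrative): the address operand of a consecutive widened
// load has a different value per lane, but the widened load only consumes the
// lane-0 address (the remaining lanes are implied by the consecutive access),
// so that use is "uniform" in the sense used here.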
3656 | SetVector<Value *> HasUniformUse; |
3657 | |
3658 | // Scan the loop for instructions which are either a) known to have only |
3659 | // lane 0 demanded or b) are uses which demand only lane 0 of their operand. |
3660 | for (auto *BB : TheLoop->blocks()) |
3661 | for (auto &I : *BB) { |
3662 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) { |
3663 | switch (II->getIntrinsicID()) { |
3664 | case Intrinsic::sideeffect: |
3665 | case Intrinsic::experimental_noalias_scope_decl: |
3666 | case Intrinsic::assume: |
3667 | case Intrinsic::lifetime_start: |
3668 | case Intrinsic::lifetime_end: |
3669 | if (TheLoop->hasLoopInvariantOperands(I: &I)) |
3670 | addToWorklistIfAllowed(&I); |
3671 | break; |
3672 | default: |
3673 | break; |
3674 | } |
3675 | } |
3676 | |
3677 | // ExtractValue instructions must be uniform, because the operands are |
3678 | // known to be loop-invariant. |
3679 | if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) { |
3680 | assert(isOutOfScope(EVI->getAggregateOperand()) && |
3681 | "Expected aggregate value to be loop invariant" ); |
3682 | addToWorklistIfAllowed(EVI); |
3683 | continue; |
3684 | } |
3685 | |
3686 | // If there's no pointer operand, there's nothing to do. |
3687 | auto *Ptr = getLoadStorePointerOperand(V: &I); |
3688 | if (!Ptr) |
3689 | continue; |
3690 | |
3691 | if (isUniformMemOpUse(&I)) |
3692 | addToWorklistIfAllowed(&I); |
3693 | |
3694 | if (isVectorizedMemAccessUse(&I, Ptr)) |
3695 | HasUniformUse.insert(X: Ptr); |
3696 | } |
3697 | |
3698 | // Add to the worklist any operands which have *only* uniform (e.g. lane 0 |
3699 | // demanding) users. Since loops are assumed to be in LCSSA form, this |
3700 | // disallows uses outside the loop as well. |
3701 | for (auto *V : HasUniformUse) { |
3702 | if (isOutOfScope(V)) |
3703 | continue; |
3704 | auto *I = cast<Instruction>(Val: V); |
3705 | auto UsersAreMemAccesses = |
3706 | llvm::all_of(Range: I->users(), P: [&](User *U) -> bool { |
3707 | return isVectorizedMemAccessUse(cast<Instruction>(Val: U), V); |
3708 | }); |
3709 | if (UsersAreMemAccesses) |
3710 | addToWorklistIfAllowed(I); |
3711 | } |
3712 | |
// Expand Worklist in topological order: whenever a new instruction
// is added, its users should already be inside the Worklist. This ensures
// that a uniform instruction will only be used by uniform instructions.
3716 | unsigned idx = 0; |
3717 | while (idx != Worklist.size()) { |
3718 | Instruction *I = Worklist[idx++]; |
3719 | |
3720 | for (auto *OV : I->operand_values()) { |
3721 | // isOutOfScope operands cannot be uniform instructions. |
3722 | if (isOutOfScope(OV)) |
3723 | continue; |
// First-order recurrence phis should typically be considered
// non-uniform.
3726 | auto *OP = dyn_cast<PHINode>(Val: OV); |
3727 | if (OP && Legal->isFixedOrderRecurrence(Phi: OP)) |
3728 | continue; |
3729 | // If all the users of the operand are uniform, then add the |
3730 | // operand into the uniform worklist. |
3731 | auto *OI = cast<Instruction>(Val: OV); |
3732 | if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool { |
3733 | auto *J = cast<Instruction>(Val: U); |
3734 | return Worklist.count(key: J) || isVectorizedMemAccessUse(J, OI); |
3735 | })) |
3736 | addToWorklistIfAllowed(OI); |
3737 | } |
3738 | } |
3739 | |
3740 | // For an instruction to be added into Worklist above, all its users inside |
3741 | // the loop should also be in Worklist. However, this condition cannot be |
3742 | // true for phi nodes that form a cyclic dependence. We must process phi |
3743 | // nodes separately. An induction variable will remain uniform if all users |
3744 | // of the induction variable and induction variable update remain uniform. |
3745 | // The code below handles both pointer and non-pointer induction variables. |
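// For example (hypothetical IR): if %iv and its update
//   %iv.next = add nuw i64 %iv, 1
// are used only by each other, the latch compare, and addresses of widened
// consecutive accesses, then only their lane-0 values are demanded and both
// remain uniform.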
3746 | BasicBlock *Latch = TheLoop->getLoopLatch(); |
3747 | for (const auto &Induction : Legal->getInductionVars()) { |
3748 | auto *Ind = Induction.first; |
3749 | auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch)); |
3750 | |
3751 | // Determine if all users of the induction variable are uniform after |
3752 | // vectorization. |
3753 | auto UniformInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool { |
3754 | auto *I = cast<Instruction>(Val: U); |
3755 | return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) || |
3756 | isVectorizedMemAccessUse(I, Ind); |
3757 | }); |
3758 | if (!UniformInd) |
3759 | continue; |
3760 | |
3761 | // Determine if all users of the induction variable update instruction are |
3762 | // uniform after vectorization. |
3763 | auto UniformIndUpdate = |
3764 | llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool { |
3765 | auto *I = cast<Instruction>(Val: U); |
3766 | return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) || |
3767 | isVectorizedMemAccessUse(I, IndUpdate); |
3768 | }); |
3769 | if (!UniformIndUpdate) |
3770 | continue; |
3771 | |
3772 | // The induction variable and its update instruction will remain uniform. |
3773 | addToWorklistIfAllowed(Ind); |
3774 | addToWorklistIfAllowed(IndUpdate); |
3775 | } |
3776 | |
3777 | Uniforms[VF].insert(I: Worklist.begin(), E: Worklist.end()); |
3778 | } |
3779 | |
3780 | bool LoopVectorizationCostModel::runtimeChecksRequired() { |
3781 | LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n" ); |
3782 | |
3783 | if (Legal->getRuntimePointerChecking()->Need) { |
3784 | reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz" , |
3785 | OREMsg: "runtime pointer checks needed. Enable vectorization of this " |
3786 | "loop with '#pragma clang loop vectorize(enable)' when " |
3787 | "compiling with -Os/-Oz" , |
3788 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
3789 | return true; |
3790 | } |
3791 | |
3792 | if (!PSE.getPredicate().isAlwaysTrue()) { |
3793 | reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz" , |
3794 | OREMsg: "runtime SCEV checks needed. Enable vectorization of this " |
3795 | "loop with '#pragma clang loop vectorize(enable)' when " |
3796 | "compiling with -Os/-Oz" , |
3797 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
3798 | return true; |
3799 | } |
3800 | |
3801 | // FIXME: Avoid specializing for stride==1 instead of bailing out. |
3802 | if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
3803 | reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count" , |
3804 | OREMsg: "runtime stride == 1 checks needed. Enable vectorization of " |
3805 | "this loop without such check by compiling with -Os/-Oz" , |
3806 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
3807 | return true; |
3808 | } |
3809 | |
3810 | return false; |
3811 | } |
3812 | |
3813 | bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { |
3814 | if (IsScalableVectorizationAllowed) |
3815 | return *IsScalableVectorizationAllowed; |
3816 | |
3817 | IsScalableVectorizationAllowed = false; |
3818 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) |
3819 | return false; |
3820 | |
3821 | if (Hints->isScalableVectorizationDisabled()) { |
3822 | reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled" , |
3823 | ORETag: "ScalableVectorizationDisabled" , ORE, TheLoop); |
3824 | return false; |
3825 | } |
3826 | |
3827 | LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n" ); |
3828 | |
3829 | auto MaxScalableVF = ElementCount::getScalable( |
3830 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
3831 | |
3832 | // Test that the loop-vectorizer can legalize all operations for this MaxVF. |
3833 | // FIXME: While for scalable vectors this is currently sufficient, this should |
3834 | // be replaced by a more detailed mechanism that filters out specific VFs, |
3835 | // instead of invalidating vectorization for a whole set of VFs based on the |
3836 | // MaxVF. |
3837 | |
3838 | // Disable scalable vectorization if the loop contains unsupported reductions. |
3839 | if (!canVectorizeReductions(VF: MaxScalableVF)) { |
3840 | reportVectorizationInfo( |
3841 | Msg: "Scalable vectorization not supported for the reduction " |
3842 | "operations found in this loop." , |
3843 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3844 | return false; |
3845 | } |
3846 | |
3847 | // Disable scalable vectorization if the loop contains any instructions |
3848 | // with element types not supported for scalable vectors. |
3849 | if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) { |
3850 | return !Ty->isVoidTy() && |
3851 | !this->TTI.isElementTypeLegalForScalableVector(Ty); |
3852 | })) { |
3853 | reportVectorizationInfo(Msg: "Scalable vectorization is not supported " |
3854 | "for all element types found in this loop." , |
3855 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3856 | return false; |
3857 | } |
3858 | |
3859 | if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) { |
3860 | reportVectorizationInfo(Msg: "The target does not provide maximum vscale value " |
3861 | "for safe distance analysis." , |
3862 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3863 | return false; |
3864 | } |
3865 | |
3866 | IsScalableVectorizationAllowed = true; |
3867 | return true; |
3868 | } |
3869 | |
3870 | ElementCount |
3871 | LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { |
3872 | if (!isScalableVectorizationAllowed()) |
3873 | return ElementCount::getScalable(MinVal: 0); |
3874 | |
3875 | auto MaxScalableVF = ElementCount::getScalable( |
3876 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
3877 | if (Legal->isSafeForAnyVectorWidth()) |
3878 | return MaxScalableVF; |
3879 | |
3880 | std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI); |
3881 | // Limit MaxScalableVF by the maximum safe dependence distance. |
3882 | MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale); |
3883 | |
3884 | if (!MaxScalableVF) |
3885 | reportVectorizationInfo( |
3886 | Msg: "Max legal vector width too small, scalable vectorization " |
3887 | "unfeasible." , |
3888 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3889 | |
3890 | return MaxScalableVF; |
3891 | } |
3892 | |
3893 | FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( |
3894 | unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { |
3895 | MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI); |
3896 | unsigned SmallestType, WidestType; |
3897 | std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes(); |
3898 | |
3899 | // Get the maximum safe dependence distance in bits computed by LAA. |
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory access that is most restrictive (involved in the smallest
// dependence distance).
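// For illustration (hypothetical numbers): with a max safe vector width of
// 384 bits and a widest type of 32 bits, 384 / 32 = 12 elements, rounded
// down to the next power of two gives MaxSafeElements = 8.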
3903 | unsigned MaxSafeElements = |
3904 | llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType); |
3905 | |
3906 | auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElements); |
3907 | auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); |
3908 | |
3909 | LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF |
3910 | << ".\n" ); |
3911 | LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF |
3912 | << ".\n" ); |
3913 | |
3914 | // First analyze the UserVF, fall back if the UserVF should be ignored. |
3915 | if (UserVF) { |
3916 | auto MaxSafeUserVF = |
3917 | UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; |
3918 | |
3919 | if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) { |
3920 | // If `VF=vscale x N` is safe, then so is `VF=N` |
3921 | if (UserVF.isScalable()) |
3922 | return FixedScalableVFPair( |
3923 | ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF); |
3924 | else |
3925 | return UserVF; |
3926 | } |
3927 | |
3928 | assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); |
3929 | |
3930 | // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it |
3931 | // is better to ignore the hint and let the compiler choose a suitable VF. |
3932 | if (!UserVF.isScalable()) { |
3933 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
3934 | << " is unsafe, clamping to max safe VF=" |
3935 | << MaxSafeFixedVF << ".\n" ); |
3936 | ORE->emit(RemarkBuilder: [&]() { |
3937 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
3938 | TheLoop->getStartLoc(), |
3939 | TheLoop->getHeader()) |
3940 | << "User-specified vectorization factor " |
3941 | << ore::NV("UserVectorizationFactor" , UserVF) |
3942 | << " is unsafe, clamping to maximum safe vectorization factor " |
3943 | << ore::NV("VectorizationFactor" , MaxSafeFixedVF); |
3944 | }); |
3945 | return MaxSafeFixedVF; |
3946 | } |
3947 | |
3948 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { |
3949 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
3950 | << " is ignored because scalable vectors are not " |
3951 | "available.\n" ); |
3952 | ORE->emit(RemarkBuilder: [&]() { |
3953 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
3954 | TheLoop->getStartLoc(), |
3955 | TheLoop->getHeader()) |
3956 | << "User-specified vectorization factor " |
3957 | << ore::NV("UserVectorizationFactor" , UserVF) |
3958 | << " is ignored because the target does not support scalable " |
3959 | "vectors. The compiler will pick a more suitable value." ; |
3960 | }); |
3961 | } else { |
3962 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
3963 | << " is unsafe. Ignoring scalable UserVF.\n" ); |
3964 | ORE->emit(RemarkBuilder: [&]() { |
3965 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
3966 | TheLoop->getStartLoc(), |
3967 | TheLoop->getHeader()) |
3968 | << "User-specified vectorization factor " |
3969 | << ore::NV("UserVectorizationFactor" , UserVF) |
3970 | << " is unsafe. Ignoring the hint to let the compiler pick a " |
3971 | "more suitable value." ; |
3972 | }); |
3973 | } |
3974 | } |
3975 | |
3976 | LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType |
3977 | << " / " << WidestType << " bits.\n" ); |
3978 | |
3979 | FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1), |
3980 | ElementCount::getScalable(MinVal: 0)); |
3981 | if (auto MaxVF = |
3982 | getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, |
3983 | MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking)) |
3984 | Result.FixedVF = MaxVF; |
3985 | |
3986 | if (auto MaxVF = |
3987 | getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, |
3988 | MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking)) |
3989 | if (MaxVF.isScalable()) { |
3990 | Result.ScalableVF = MaxVF; |
3991 | LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF |
3992 | << "\n" ); |
3993 | } |
3994 | |
3995 | return Result; |
3996 | } |
3997 | |
3998 | FixedScalableVFPair |
3999 | LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { |
4000 | if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { |
// TODO: It may be useful to do so, since it's still likely to be dynamically
// uniform if the target can skip.
4003 | reportVectorizationFailure( |
4004 | DebugMsg: "Not inserting runtime ptr check for divergent target" , |
4005 | OREMsg: "runtime pointer checks needed. Not enabled for divergent target" , |
4006 | ORETag: "CantVersionLoopWithDivergentTarget" , ORE, TheLoop); |
4007 | return FixedScalableVFPair::getNone(); |
4008 | } |
4009 | |
4010 | unsigned TC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop); |
4011 | unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(L: TheLoop); |
4012 | LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); |
4013 | if (TC == 1) { |
4014 | reportVectorizationFailure(DebugMsg: "Single iteration (non) loop" , |
4015 | OREMsg: "loop trip count is one, irrelevant for vectorization" , |
4016 | ORETag: "SingleIterationLoop" , ORE, TheLoop); |
4017 | return FixedScalableVFPair::getNone(); |
4018 | } |
4019 | |
4020 | switch (ScalarEpilogueStatus) { |
4021 | case CM_ScalarEpilogueAllowed: |
4022 | return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false); |
4023 | case CM_ScalarEpilogueNotAllowedUsePredicate: |
4024 | [[fallthrough]]; |
4025 | case CM_ScalarEpilogueNotNeededUsePredicate: |
4026 | LLVM_DEBUG( |
4027 | dbgs() << "LV: vector predicate hint/switch found.\n" |
4028 | << "LV: Not allowing scalar epilogue, creating predicated " |
4029 | << "vector loop.\n" ); |
4030 | break; |
4031 | case CM_ScalarEpilogueNotAllowedLowTripLoop: |
4032 | // fallthrough as a special case of OptForSize |
4033 | case CM_ScalarEpilogueNotAllowedOptSize: |
4034 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) |
4035 | LLVM_DEBUG( |
4036 | dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n" ); |
4037 | else |
4038 | LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " |
4039 | << "count.\n" ); |
4040 | |
4041 | // Bail if runtime checks are required, which are not good when optimising |
4042 | // for size. |
4043 | if (runtimeChecksRequired()) |
4044 | return FixedScalableVFPair::getNone(); |
4045 | |
4046 | break; |
4047 | } |
4048 | |
// The only loops we can vectorize without a scalar epilogue are loops with
// a bottom-test and a single exiting block. We'd have to handle the fact
4051 | // that not every instruction executes on the last iteration. This will |
4052 | // require a lane mask which varies through the vector loop body. (TODO) |
4053 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { |
4054 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
4055 | // masking, fallback to a vectorization with a scalar epilogue. |
4056 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
4057 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " |
4058 | "scalar epilogue instead.\n" ); |
4059 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
4060 | return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false); |
4061 | } |
4062 | return FixedScalableVFPair::getNone(); |
4063 | } |
4064 | |
// Now try folding the tail by masking.
4066 | |
4067 | // Invalidate interleave groups that require an epilogue if we can't mask |
4068 | // the interleave-group. |
4069 | if (!useMaskedInterleavedAccesses(TTI)) { |
4070 | assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && |
4071 | "No decisions should have been taken at this point" ); |
// Note: There is no need to invalidate any cost modeling decisions here, as
// none were taken so far.
4074 | InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); |
4075 | } |
4076 | |
4077 | FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true); |
4078 | |
4079 | // Avoid tail folding if the trip count is known to be a multiple of any VF |
4080 | // we choose. |
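// For illustration (hypothetical numbers): with a known trip count of 128, a
// maximum power-of-2 VF of 8 and a user interleave count of 2,
// 128 % (8 * 2) == 0, so no tail remains and tail folding can be skipped.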
4081 | std::optional<unsigned> MaxPowerOf2RuntimeVF = |
4082 | MaxFactors.FixedVF.getFixedValue(); |
4083 | if (MaxFactors.ScalableVF) { |
4084 | std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI); |
4085 | if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { |
4086 | MaxPowerOf2RuntimeVF = std::max<unsigned>( |
4087 | a: *MaxPowerOf2RuntimeVF, |
4088 | b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); |
4089 | } else |
4090 | MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. |
4091 | } |
4092 | |
4093 | if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { |
4094 | assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && |
4095 | "MaxFixedVF must be a power of 2" ); |
4096 | unsigned MaxVFtimesIC = |
4097 | UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; |
4098 | ScalarEvolution *SE = PSE.getSE(); |
4099 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
4100 | const SCEV *ExitCount = SE->getAddExpr( |
4101 | LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType())); |
4102 | const SCEV *Rem = SE->getURemExpr( |
4103 | LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop), |
4104 | RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC)); |
4105 | if (Rem->isZero()) { |
4106 | // Accept MaxFixedVF if we do not have a tail. |
4107 | LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n" ); |
4108 | return MaxFactors; |
4109 | } |
4110 | } |
4111 | |
4112 | // If we don't know the precise trip count, or if the trip count that we |
4113 | // found modulo the vectorization factor is not zero, try to fold the tail |
4114 | // by masking. |
4115 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking. |
4116 | setTailFoldingStyles(IsScalableVF: MaxFactors.ScalableVF.isScalable(), UserIC); |
4117 | if (foldTailByMasking()) { |
4118 | if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { |
4119 | LLVM_DEBUG( |
4120 | dbgs() |
4121 | << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " |
4122 | "try to generate VP Intrinsics with scalable vector " |
4123 | "factors only.\n" ); |
4124 | // Tail folded loop using VP intrinsics restricts the VF to be scalable |
4125 | // for now. |
4126 | // TODO: extend it for fixed vectors, if required. |
4127 | assert(MaxFactors.ScalableVF.isScalable() && |
4128 | "Expected scalable vector factor." ); |
4129 | |
4130 | MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1); |
4131 | } |
4132 | return MaxFactors; |
4133 | } |
4134 | |
4135 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
4136 | // masking, fallback to a vectorization with a scalar epilogue. |
4137 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
4138 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " |
4139 | "scalar epilogue instead.\n" ); |
4140 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
4141 | return MaxFactors; |
4142 | } |
4143 | |
4144 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { |
4145 | LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n" ); |
4146 | return FixedScalableVFPair::getNone(); |
4147 | } |
4148 | |
4149 | if (TC == 0) { |
4150 | reportVectorizationFailure( |
4151 | DebugMsg: "Unable to calculate the loop count due to complex control flow" , |
4152 | OREMsg: "unable to calculate the loop count due to complex control flow" , |
4153 | ORETag: "UnknownLoopCountComplexCFG" , ORE, TheLoop); |
4154 | return FixedScalableVFPair::getNone(); |
4155 | } |
4156 | |
4157 | reportVectorizationFailure( |
4158 | DebugMsg: "Cannot optimize for size and vectorize at the same time." , |
4159 | OREMsg: "cannot optimize for size and vectorize at the same time. " |
4160 | "Enable vectorization of this loop with '#pragma clang loop " |
4161 | "vectorize(enable)' when compiling with -Os/-Oz" , |
4162 | ORETag: "NoTailLoopWithOptForSize" , ORE, TheLoop); |
4163 | return FixedScalableVFPair::getNone(); |
4164 | } |
4165 | |
4166 | ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( |
4167 | unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, |
4168 | ElementCount MaxSafeVF, bool FoldTailByMasking) { |
4169 | bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); |
4170 | const TypeSize WidestRegister = TTI.getRegisterBitWidth( |
4171 | K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
4172 | : TargetTransformInfo::RGK_FixedWidthVector); |
4173 | |
4174 | // Convenience function to return the minimum of two ElementCounts. |
4175 | auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { |
4176 | assert((LHS.isScalable() == RHS.isScalable()) && |
4177 | "Scalable flags must match" ); |
4178 | return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; |
4179 | }; |
4180 | |
4181 | // Ensure MaxVF is a power of 2; the dependence distance bound may not be. |
// Note that both WidestRegister and WidestType may not be powers of 2.
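// For illustration (hypothetical numbers): 256-bit registers and a widest
// type of 32 bits give bit_floor(256 / 32) = 8 lanes; for scalable vectors
// the same computation applies to the known minimum register size, e.g. a
// 128-bit minimum with 32-bit elements yields vscale x 4.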
4183 | auto MaxVectorElementCount = ElementCount::get( |
4184 | MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType), |
4185 | Scalable: ComputeScalableMaxVF); |
4186 | MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); |
4187 | LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " |
4188 | << (MaxVectorElementCount * WidestType) << " bits.\n" ); |
4189 | |
4190 | if (!MaxVectorElementCount) { |
4191 | LLVM_DEBUG(dbgs() << "LV: The target has no " |
4192 | << (ComputeScalableMaxVF ? "scalable" : "fixed" ) |
4193 | << " vector registers.\n" ); |
4194 | return ElementCount::getFixed(MinVal: 1); |
4195 | } |
4196 | |
4197 | unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); |
4198 | if (MaxVectorElementCount.isScalable() && |
4199 | TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) { |
4200 | auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange); |
4201 | auto Min = Attr.getVScaleRangeMin(); |
4202 | WidestRegisterMinEC *= Min; |
4203 | } |
4204 | |
4205 | // When a scalar epilogue is required, at least one iteration of the scalar |
4206 | // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a |
4207 | // max VF that results in a dead vector loop. |
4208 | if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true)) |
4209 | MaxTripCount -= 1; |
4210 | |
4211 | if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && |
4212 | (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) { |
// If the upper bound of the loop trip count (TC) is known at compile time,
// there is no point in choosing a VF greater than TC (as done in the loop
// below). Select the maximum power of two which doesn't exceed TC. If
// MaxVectorElementCount is scalable, we only fall back on a fixed VF when
// the TC is less than or equal to the known number of lanes.
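// For illustration (hypothetical numbers): with MaxTripCount = 6, at least
// 8 lanes available and no tail folding, clamp to bit_floor(6) = 4 lanes so
// the vector loop executes at least one full iteration.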
4218 | auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount); |
4219 | LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " |
4220 | "exceeding the constant trip count: " |
4221 | << ClampedUpperTripCount << "\n" ); |
4222 | return ElementCount::get( |
4223 | MinVal: ClampedUpperTripCount, |
4224 | Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); |
4225 | } |
4226 | |
4227 | TargetTransformInfo::RegisterKind RegKind = |
4228 | ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
4229 | : TargetTransformInfo::RGK_FixedWidthVector; |
4230 | ElementCount MaxVF = MaxVectorElementCount; |
4231 | if (MaximizeBandwidth || |
4232 | (MaximizeBandwidth.getNumOccurrences() == 0 && |
4233 | (TTI.shouldMaximizeVectorBandwidth(K: RegKind) || |
4234 | (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { |
4235 | auto MaxVectorElementCountMaxBW = ElementCount::get( |
4236 | MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType), |
4237 | Scalable: ComputeScalableMaxVF); |
4238 | MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); |
4239 | |
4240 | // Collect all viable vectorization factors larger than the default MaxVF |
4241 | // (i.e. MaxVectorElementCount). |
4242 | SmallVector<ElementCount, 8> VFs; |
4243 | for (ElementCount VS = MaxVectorElementCount * 2; |
4244 | ElementCount::isKnownLE(LHS: VS, RHS: MaxVectorElementCountMaxBW); VS *= 2) |
4245 | VFs.push_back(Elt: VS); |
4246 | |
4247 | // For each VF calculate its register usage. |
4248 | auto RUs = calculateRegisterUsage(VFs); |
4249 | |
4250 | // Select the largest VF which doesn't require more registers than existing |
4251 | // ones. |
4252 | for (int I = RUs.size() - 1; I >= 0; --I) { |
4253 | const auto &MLU = RUs[I].MaxLocalUsers; |
4254 | if (all_of(Range: MLU, P: [&](decltype(MLU.front()) &LU) { |
4255 | return LU.second <= TTI.getNumberOfRegisters(ClassID: LU.first); |
4256 | })) { |
4257 | MaxVF = VFs[I]; |
4258 | break; |
4259 | } |
4260 | } |
4261 | if (ElementCount MinVF = |
4262 | TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) { |
4263 | if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) { |
4264 | LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF |
4265 | << ") with target's minimum: " << MinVF << '\n'); |
4266 | MaxVF = MinVF; |
4267 | } |
4268 | } |
4269 | |
4270 | // Invalidate any widening decisions we might have made, in case the loop |
4271 | // requires prediction (decided later), but we have already made some |
4272 | // load/store widening decisions. |
4273 | invalidateCostModelingDecisions(); |
4274 | } |
4275 | return MaxVF; |
4276 | } |
4277 | |
/// Convenience function that returns the value of vscale_range if
/// vscale_range.min == vscale_range.max, and otherwise returns the value
/// returned by the corresponding TTI method.
4281 | static std::optional<unsigned> |
4282 | getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { |
4283 | const Function *Fn = L->getHeader()->getParent(); |
4284 | if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) { |
4285 | auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange); |
4286 | auto Min = Attr.getVScaleRangeMin(); |
4287 | auto Max = Attr.getVScaleRangeMax(); |
4288 | if (Max && Min == Max) |
4289 | return Max; |
4290 | } |
4291 | |
4292 | return TTI.getVScaleForTuning(); |
4293 | } |
4294 | |
4295 | bool LoopVectorizationPlanner::isMoreProfitable( |
4296 | const VectorizationFactor &A, const VectorizationFactor &B) const { |
4297 | InstructionCost CostA = A.Cost; |
4298 | InstructionCost CostB = B.Cost; |
4299 | |
4300 | unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(L: OrigLoop); |
4301 | |
4302 | // Improve estimate for the vector width if it is scalable. |
4303 | unsigned EstimatedWidthA = A.Width.getKnownMinValue(); |
4304 | unsigned EstimatedWidthB = B.Width.getKnownMinValue(); |
4305 | if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI)) { |
4306 | if (A.Width.isScalable()) |
4307 | EstimatedWidthA *= *VScale; |
4308 | if (B.Width.isScalable()) |
4309 | EstimatedWidthB *= *VScale; |
4310 | } |
4311 | |
4312 | // Assume vscale may be larger than 1 (or the value being tuned for), |
4313 | // so that scalable vectorization is slightly favorable over fixed-width |
4314 | // vectorization. |
4315 | bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && |
4316 | A.Width.isScalable() && !B.Width.isScalable(); |
4317 | |
4318 | auto CmpFn = [PreferScalable](const InstructionCost &LHS, |
4319 | const InstructionCost &RHS) { |
4320 | return PreferScalable ? LHS <= RHS : LHS < RHS; |
4321 | }; |
4322 | |
4323 | // To avoid the need for FP division: |
4324 | // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) |
4325 | // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) |
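// For illustration (hypothetical costs): CostA = 8 at estimated width 4
// versus CostB = 6 at width 2 compares 8 * 2 = 16 against 6 * 4 = 24, so A
// (2 per lane) is preferred over B (3 per lane).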
4326 | if (!MaxTripCount) |
4327 | return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); |
4328 | |
4329 | auto GetCostForTC = [MaxTripCount, this](unsigned VF, |
4330 | InstructionCost VectorCost, |
4331 | InstructionCost ScalarCost) { |
4332 | // If the trip count is a known (possibly small) constant, the trip count |
4333 | // will be rounded up to an integer number of iterations under |
4334 | // FoldTailByMasking. The total cost in that case will be |
4335 | // VecCost*ceil(TripCount/VF). When not folding the tail, the total |
4336 | // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be |
4337 | // some extra overheads, but for the purpose of comparing the costs of |
4338 | // different VFs we can use this to compare the total loop-body cost |
4339 | // expected after vectorization. |
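// For illustration (hypothetical numbers): with MaxTripCount = 10, VF = 4,
// VectorCost = 20 and ScalarCost = 4, folding the tail estimates
// 20 * ceil(10 / 4) = 60, while not folding estimates
// 20 * (10 / 4) + 4 * (10 % 4) = 48.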
4340 | if (CM.foldTailByMasking()) |
4341 | return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF); |
4342 | return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); |
4343 | }; |
4344 | |
4345 | auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); |
4346 | auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); |
4347 | return CmpFn(RTCostA, RTCostB); |
4348 | } |
4349 | |
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4351 | OptimizationRemarkEmitter *ORE, |
4352 | Loop *TheLoop) { |
4353 | if (InvalidCosts.empty()) |
4354 | return; |
4355 | |
4356 | // Emit a report of VFs with invalid costs in the loop. |
4357 | |
4358 | // Group the remarks per instruction, keeping the instruction order from |
4359 | // InvalidCosts. |
4360 | std::map<Instruction *, unsigned> Numbering; |
4361 | unsigned I = 0; |
4362 | for (auto &Pair : InvalidCosts) |
4363 | if (!Numbering.count(x: Pair.first)) |
4364 | Numbering[Pair.first] = I++; |
4365 | |
4366 | // Sort the list, first on instruction(number) then on VF. |
4367 | sort(C&: InvalidCosts, Comp: [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { |
4368 | if (Numbering[A.first] != Numbering[B.first]) |
4369 | return Numbering[A.first] < Numbering[B.first]; |
4370 | const auto &LHS = A.second; |
4371 | const auto &RHS = B.second; |
4372 | return std::make_tuple(args: LHS.isScalable(), args: LHS.getKnownMinValue()) < |
4373 | std::make_tuple(args: RHS.isScalable(), args: RHS.getKnownMinValue()); |
4374 | }); |
4375 | |
4376 | // For a list of ordered instruction-vf pairs: |
4377 | // [(load, vf1), (load, vf2), (store, vf1)] |
4378 | // Group the instructions together to emit separate remarks for: |
4379 | // load (vf1, vf2) |
4380 | // store (vf1) |
4381 | auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); |
4382 | auto Subset = ArrayRef<InstructionVFPair>(); |
4383 | do { |
4384 | if (Subset.empty()) |
4385 | Subset = Tail.take_front(N: 1); |
4386 | |
4387 | Instruction *I = Subset.front().first; |
4388 | |
4389 | // If the next instruction is different, or if there are no other pairs, |
4390 | // emit a remark for the collated subset. e.g. |
4391 | // [(load, vf1), (load, vf2))] |
4392 | // to emit: |
// remark: invalid costs for 'load' at VF=(vf1, vf2)
4394 | if (Subset == Tail || Tail[Subset.size()].first != I) { |
4395 | std::string OutString; |
4396 | raw_string_ostream OS(OutString); |
4397 | assert(!Subset.empty() && "Unexpected empty range" ); |
4398 | OS << "Instruction with invalid costs prevented vectorization at VF=(" ; |
4399 | for (const auto &Pair : Subset) |
4400 | OS << (Pair.second == Subset.front().second ? "" : ", " ) << Pair.second; |
4401 | OS << "):" ; |
4402 | if (auto *CI = dyn_cast<CallInst>(Val: I)) |
4403 | OS << " call to " << CI->getCalledFunction()->getName(); |
4404 | else |
4405 | OS << " " << I->getOpcodeName(); |
4406 | OS.flush(); |
4407 | reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost" , ORE, TheLoop, I); |
4408 | Tail = Tail.drop_front(N: Subset.size()); |
4409 | Subset = {}; |
4410 | } else |
4411 | // Grow the subset by one element |
4412 | Subset = Tail.take_front(N: Subset.size() + 1); |
4413 | } while (!Tail.empty()); |
4414 | } |
4415 | |
4416 | /// Check if any recipe of \p Plan will generate a vector value, which will be |
4417 | /// assigned a vector register. |
4418 | static bool willGenerateVectors(VPlan &Plan, ElementCount VF, |
4419 | const TargetTransformInfo &TTI) { |
4420 | assert(VF.isVector() && "Checking a scalar VF?" ); |
4421 | VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), |
4422 | Plan.getCanonicalIV()->getScalarType()->getContext()); |
4423 | DenseSet<VPRecipeBase *> EphemeralRecipes; |
4424 | collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes); |
4425 | // Set of already visited types. |
4426 | DenseSet<Type *> Visited; |
4427 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
4428 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) { |
4429 | for (VPRecipeBase &R : *VPBB) { |
4430 | if (EphemeralRecipes.contains(V: &R)) |
4431 | continue; |
4432 | // Continue early if the recipe is considered to not produce a vector |
4433 | // result. Note that this includes VPInstruction where some opcodes may |
4434 | // produce a vector, to preserve existing behavior as VPInstructions model |
4435 | // aspects not directly mapped to existing IR instructions. |
4436 | switch (R.getVPDefID()) { |
4437 | case VPDef::VPDerivedIVSC: |
4438 | case VPDef::VPScalarIVStepsSC: |
4439 | case VPDef::VPScalarCastSC: |
4440 | case VPDef::VPReplicateSC: |
4441 | case VPDef::VPInstructionSC: |
4442 | case VPDef::VPCanonicalIVPHISC: |
4443 | case VPDef::VPVectorPointerSC: |
4444 | case VPDef::VPExpandSCEVSC: |
4445 | case VPDef::VPEVLBasedIVPHISC: |
4446 | case VPDef::VPPredInstPHISC: |
4447 | case VPDef::VPBranchOnMaskSC: |
4448 | continue; |
4449 | case VPDef::VPReductionSC: |
4450 | case VPDef::VPActiveLaneMaskPHISC: |
4451 | case VPDef::VPWidenCallSC: |
4452 | case VPDef::VPWidenCanonicalIVSC: |
4453 | case VPDef::VPWidenCastSC: |
4454 | case VPDef::VPWidenGEPSC: |
4455 | case VPDef::VPWidenSC: |
4456 | case VPDef::VPWidenSelectSC: |
4457 | case VPDef::VPBlendSC: |
4458 | case VPDef::VPFirstOrderRecurrencePHISC: |
4459 | case VPDef::VPWidenPHISC: |
4460 | case VPDef::VPWidenIntOrFpInductionSC: |
4461 | case VPDef::VPWidenPointerInductionSC: |
4462 | case VPDef::VPReductionPHISC: |
4463 | case VPDef::VPInterleaveSC: |
4464 | case VPDef::VPWidenLoadEVLSC: |
4465 | case VPDef::VPWidenLoadSC: |
4466 | case VPDef::VPWidenStoreEVLSC: |
4467 | case VPDef::VPWidenStoreSC: |
4468 | break; |
4469 | default: |
4470 | llvm_unreachable("unhandled recipe" ); |
4471 | } |
4472 | |
4473 | auto WillWiden = [&TTI, VF](Type *ScalarTy) { |
4474 | Type *VectorTy = ToVectorTy(Scalar: ScalarTy, EC: VF); |
4475 | unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy); |
4476 | if (!NumLegalParts) |
4477 | return false; |
4478 | if (VF.isScalable()) { |
4479 | // <vscale x 1 x iN> is assumed to be profitable over iN because |
4480 | // scalable registers are a distinct register class from scalar |
4481 | // ones. If we ever find a target which wants to lower scalable |
4482 | // vectors back to scalars, we'll need to update this code to |
4483 | // explicitly ask TTI about the register class uses for each part. |
4484 | return NumLegalParts <= VF.getKnownMinValue(); |
4485 | } |
// Two or more parts that share a register are considered vectorized.
4487 | return NumLegalParts < VF.getKnownMinValue(); |
4488 | }; |
4489 | |
// If the recipe has no defs and is not a store (e.g. a branch), continue -
// there is no value to check.
4491 | if (R.getNumDefinedValues() == 0 && |
4492 | !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( |
4493 | Val: &R)) |
4494 | continue; |
// For multi-def recipes (currently only interleaved loads), it suffices to
// check only the first def. For stores, check their stored value; for
// interleaved stores it suffices to check only the first stored value. In
// all cases this is the second operand.
4500 | VPValue *ToCheck = |
4501 | R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1); |
4502 | Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck); |
4503 | if (!Visited.insert(V: {ScalarTy}).second) |
4504 | continue; |
4505 | if (WillWiden(ScalarTy)) |
4506 | return true; |
4507 | } |
4508 | } |
4509 | |
4510 | return false; |
4511 | } |
4512 | |
4513 | VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { |
4514 | InstructionCost ExpectedCost = CM.expectedCost(VF: ElementCount::getFixed(MinVal: 1)); |
4515 | LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n" ); |
4516 | assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop" ); |
4517 | assert(any_of(VPlans, |
4518 | [](std::unique_ptr<VPlan> &P) { |
4519 | return P->hasVF(ElementCount::getFixed(1)); |
4520 | }) && |
4521 | "Expected Scalar VF to be a candidate" ); |
4522 | |
4523 | const VectorizationFactor ScalarCost(ElementCount::getFixed(MinVal: 1), ExpectedCost, |
4524 | ExpectedCost); |
4525 | VectorizationFactor ChosenFactor = ScalarCost; |
4526 | |
4527 | bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
4528 | if (ForceVectorization && |
4529 | (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { |
4530 | // Ignore scalar width, because the user explicitly wants vectorization. |
4531 | // Initialize cost to max so that VF = 2 is, at least, chosen during cost |
4532 | // evaluation. |
4533 | ChosenFactor.Cost = InstructionCost::getMax(); |
4534 | } |
4535 | |
4536 | SmallVector<InstructionVFPair> InvalidCosts; |
4537 | for (auto &P : VPlans) { |
4538 | for (ElementCount VF : P->vectorFactors()) { |
4539 | // The cost for scalar VF=1 is already calculated, so ignore it. |
4540 | if (VF.isScalar()) |
4541 | continue; |
4542 | |
4543 | InstructionCost C = CM.expectedCost(VF, Invalid: &InvalidCosts); |
4544 | VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); |
4545 | |
4546 | #ifndef NDEBUG |
4547 | unsigned AssumedMinimumVscale = |
4548 | getVScaleForTuning(OrigLoop, TTI).value_or(1); |
4549 | unsigned Width = |
4550 | Candidate.Width.isScalable() |
4551 | ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale |
4552 | : Candidate.Width.getFixedValue(); |
4553 | LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF |
4554 | << " costs: " << (Candidate.Cost / Width)); |
4555 | if (VF.isScalable()) |
4556 | LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " |
4557 | << AssumedMinimumVscale << ")" ); |
4558 | LLVM_DEBUG(dbgs() << ".\n" ); |
4559 | #endif |
4560 | |
4561 | if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) { |
4562 | LLVM_DEBUG( |
4563 | dbgs() |
4564 | << "LV: Not considering vector loop of width " << VF |
4565 | << " because it will not generate any vector instructions.\n" ); |
4566 | continue; |
4567 | } |
4568 | |
4569 | // If profitable add it to ProfitableVF list. |
4570 | if (isMoreProfitable(A: Candidate, B: ScalarCost)) |
4571 | ProfitableVFs.push_back(Elt: Candidate); |
4572 | |
4573 | if (isMoreProfitable(A: Candidate, B: ChosenFactor)) |
4574 | ChosenFactor = Candidate; |
4575 | } |
4576 | } |
4577 | |
4578 | emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop: OrigLoop); |
4579 | |
4580 | if (!EnableCondStoresVectorization && CM.hasPredStores()) { |
4581 | reportVectorizationFailure( |
4582 | DebugMsg: "There are conditional stores." , |
4583 | OREMsg: "store that is conditionally executed prevents vectorization" , |
4584 | ORETag: "ConditionalStore" , ORE, TheLoop: OrigLoop); |
4585 | ChosenFactor = ScalarCost; |
4586 | } |
4587 | |
4588 | LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && |
4589 | !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() |
4590 | << "LV: Vectorization seems to be not beneficial, " |
4591 | << "but was forced by a user.\n" ); |
4592 | LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n" ); |
4593 | return ChosenFactor; |
4594 | } |
4595 | |
4596 | bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( |
4597 | ElementCount VF) const { |
4598 | // Cross iteration phis such as reductions need special handling and are |
4599 | // currently unsupported. |
4600 | if (any_of(Range: OrigLoop->getHeader()->phis(), |
4601 | P: [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(Phi: &Phi); })) |
4602 | return false; |
4603 | |
4604 | // Phis with uses outside of the loop require special handling and are |
4605 | // currently unsupported. |
4606 | for (const auto &Entry : Legal->getInductionVars()) { |
4607 | // Look for uses of the value of the induction at the last iteration. |
4608 | Value *PostInc = |
4609 | Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()); |
4610 | for (User *U : PostInc->users()) |
4611 | if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U))) |
4612 | return false; |
4613 | // Look for uses of penultimate value of the induction. |
4614 | for (User *U : Entry.first->users()) |
4615 | if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U))) |
4616 | return false; |
4617 | } |
4618 | |
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
4622 | if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) |
4623 | return false; |
4624 | |
4625 | return true; |
4626 | } |
4627 | |
4628 | bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( |
4629 | const ElementCount VF) const { |
4630 | // FIXME: We need a much better cost-model to take different parameters such |
4631 | // as register pressure, code size increase and cost of extra branches into |
4632 | // account. For now we apply a very crude heuristic and only consider loops |
4633 | // with vectorization factors larger than a certain value. |
4634 | |
4635 | // Allow the target to opt out entirely. |
4636 | if (!TTI.preferEpilogueVectorization()) |
4637 | return false; |
4638 | |
// We also consider epilogue vectorization unprofitable for targets that don't
// consider interleaving beneficial (e.g. MVE).
4641 | if (TTI.getMaxInterleaveFactor(VF) <= 1) |
4642 | return false; |
4643 | |
4644 | unsigned Multiplier = 1; |
4645 | if (VF.isScalable()) |
4646 | Multiplier = getVScaleForTuning(L: TheLoop, TTI).value_or(u: 1); |
4647 | if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) |
4648 | return true; |
4649 | return false; |
4650 | } |
4651 | |
4652 | VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( |
4653 | const ElementCount MainLoopVF, unsigned IC) { |
4654 | VectorizationFactor Result = VectorizationFactor::Disabled(); |
4655 | if (!EnableEpilogueVectorization) { |
4656 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n" ); |
4657 | return Result; |
4658 | } |
4659 | |
4660 | if (!CM.isScalarEpilogueAllowed()) { |
4661 | LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " |
4662 | "epilogue is allowed.\n" ); |
4663 | return Result; |
4664 | } |
4665 | |
4666 | // Not really a cost consideration, but check for unsupported cases here to |
4667 | // simplify the logic. |
4668 | if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) { |
4669 | LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " |
4670 | "is not a supported candidate.\n" ); |
4671 | return Result; |
4672 | } |
4673 | |
4674 | if (EpilogueVectorizationForceVF > 1) { |
4675 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n" ); |
4676 | ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF); |
4677 | if (hasPlanWithVF(VF: ForcedEC)) |
4678 | return {ForcedEC, 0, 0}; |
4679 | else { |
4680 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " |
4681 | "viable.\n" ); |
4682 | return Result; |
4683 | } |
4684 | } |
4685 | |
4686 | if (OrigLoop->getHeader()->getParent()->hasOptSize() || |
4687 | OrigLoop->getHeader()->getParent()->hasMinSize()) { |
4688 | LLVM_DEBUG( |
4689 | dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n" ); |
4690 | return Result; |
4691 | } |
4692 | |
4693 | if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF)) { |
4694 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " |
4695 | "this loop\n" ); |
4696 | return Result; |
4697 | } |
4698 | |
4699 | // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know |
4700 | // the main loop handles 8 lanes per iteration. We could still benefit from |
4701 | // vectorizing the epilogue loop with VF=4. |
4702 | ElementCount EstimatedRuntimeVF = MainLoopVF; |
4703 | if (MainLoopVF.isScalable()) { |
4704 | EstimatedRuntimeVF = ElementCount::getFixed(MinVal: MainLoopVF.getKnownMinValue()); |
4705 | if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI)) |
4706 | EstimatedRuntimeVF *= *VScale; |
4707 | } |
4708 | |
4709 | ScalarEvolution &SE = *PSE.getSE(); |
4710 | Type *TCType = Legal->getWidestInductionType(); |
4711 | const SCEV *RemainingIterations = nullptr; |
4712 | for (auto &NextVF : ProfitableVFs) { |
4713 | // Skip candidate VFs without a corresponding VPlan. |
4714 | if (!hasPlanWithVF(VF: NextVF.Width)) |
4715 | continue; |
4716 | |
// Skip candidate VFs with widths >= the estimated runtime VF (scalable
4718 | // vectors) or the VF of the main loop (fixed vectors). |
4719 | if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && |
4720 | ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) || |
4721 | ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) |
4722 | continue; |
4723 | |
4724 | // If NextVF is greater than the number of remaining iterations, the |
4725 | // epilogue loop would be dead. Skip such factors. |
4726 | if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { |
4727 | // TODO: extend to support scalable VFs. |
4728 | if (!RemainingIterations) { |
4729 | const SCEV *TC = createTripCountSCEV(IdxTy: TCType, PSE, OrigLoop); |
4730 | RemainingIterations = SE.getURemExpr( |
4731 | LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getKnownMinValue() * IC)); |
4732 | } |
4733 | if (SE.isKnownPredicate( |
4734 | Pred: CmpInst::ICMP_UGT, |
4735 | LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getKnownMinValue()), |
4736 | RHS: RemainingIterations)) |
4737 | continue; |
4738 | } |
4739 | |
4740 | if (Result.Width.isScalar() || isMoreProfitable(A: NextVF, B: Result)) |
4741 | Result = NextVF; |
4742 | } |
4743 | |
4744 | if (Result != VectorizationFactor::Disabled()) |
4745 | LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " |
4746 | << Result.Width << "\n" ); |
4747 | return Result; |
4748 | } |
4749 | |
4750 | std::pair<unsigned, unsigned> |
4751 | LoopVectorizationCostModel::getSmallestAndWidestTypes() { |
4752 | unsigned MinWidth = -1U; |
4753 | unsigned MaxWidth = 8; |
4754 | const DataLayout &DL = TheFunction->getDataLayout(); |
4755 | // For in-loop reductions, no element types are added to ElementTypesInLoop |
4756 | // if there are no loads/stores in the loop. In this case, check through the |
4757 | // reduction variables to determine the maximum width. |
4758 | if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { |
4759 | // Reset MaxWidth so that we can find the smallest type used by recurrences |
4760 | // in the loop. |
4761 | MaxWidth = -1U; |
4762 | for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { |
4763 | const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; |
4764 | // When finding the min width used by the recurrence we need to account |
4765 | // for casts on the input operands of the recurrence. |
4766 | MaxWidth = std::min<unsigned>( |
4767 | a: MaxWidth, b: std::min<unsigned>( |
4768 | a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), |
4769 | b: RdxDesc.getRecurrenceType()->getScalarSizeInBits())); |
4770 | } |
4771 | } else { |
4772 | for (Type *T : ElementTypesInLoop) { |
4773 | MinWidth = std::min<unsigned>( |
4774 | a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue()); |
4775 | MaxWidth = std::max<unsigned>( |
4776 | a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue()); |
4777 | } |
4778 | } |
4779 | return {MinWidth, MaxWidth}; |
4780 | } |
4781 | |
4782 | void LoopVectorizationCostModel::collectElementTypesForWidening() { |
4783 | ElementTypesInLoop.clear(); |
4784 | // For each block. |
4785 | for (BasicBlock *BB : TheLoop->blocks()) { |
4786 | // For each instruction in the loop. |
4787 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
4788 | Type *T = I.getType(); |
4789 | |
4790 | // Skip ignored values. |
4791 | if (ValuesToIgnore.count(Ptr: &I)) |
4792 | continue; |
4793 | |
4794 | // Only examine Loads, Stores and PHINodes. |
4795 | if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I)) |
4796 | continue; |
4797 | |
4798 | // Examine PHI nodes that are reduction variables. Update the type to |
4799 | // account for the recurrence type. |
4800 | if (auto *PN = dyn_cast<PHINode>(Val: &I)) { |
4801 | if (!Legal->isReductionVariable(PN)) |
4802 | continue; |
4803 | const RecurrenceDescriptor &RdxDesc = |
4804 | Legal->getReductionVars().find(Key: PN)->second; |
4805 | if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || |
4806 | TTI.preferInLoopReduction(Opcode: RdxDesc.getOpcode(), |
4807 | Ty: RdxDesc.getRecurrenceType(), |
4808 | Flags: TargetTransformInfo::ReductionFlags())) |
4809 | continue; |
4810 | T = RdxDesc.getRecurrenceType(); |
4811 | } |
4812 | |
4813 | // Examine the stored values. |
4814 | if (auto *ST = dyn_cast<StoreInst>(Val: &I)) |
4815 | T = ST->getValueOperand()->getType(); |
4816 | |
4817 | assert(T->isSized() && |
4818 | "Expected the load/store/recurrence type to be sized" ); |
4819 | |
4820 | ElementTypesInLoop.insert(Ptr: T); |
4821 | } |
4822 | } |
4823 | } |
4824 | |
4825 | unsigned |
4826 | LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, |
4827 | InstructionCost LoopCost) { |
4828 | // -- The interleave heuristics -- |
4829 | // We interleave the loop in order to expose ILP and reduce the loop overhead. |
4830 | // There are many micro-architectural considerations that we can't predict |
4831 | // at this level. For example, frontend pressure (on decode or fetch) due to |
4832 | // code size, or the number and capabilities of the execution ports. |
4833 | // |
4834 | // We use the following heuristics to select the interleave count: |
4835 | // 1. If the code has reductions, then we interleave to break the cross |
4836 | // iteration dependency. |
4837 | // 2. If the loop is really small, then we interleave to reduce the loop |
4838 | // overhead. |
4839 | // 3. We don't interleave if we think that we will spill registers to memory |
4840 | // due to the increased register pressure. |
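// As a minimal sketch of heuristic 1 (hypothetical source loop, not taken from
// this pass), interleaving a sum reduction by 2 splits the cross-iteration
// dependency into two independent partial sums:
//   for (i = 0; i < n; i += 2) { s0 += a[i]; s1 += a[i + 1]; } // n assumed even
//   s = s0 + s1; // final reduction after the loop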
4841 | |
4842 | if (!isScalarEpilogueAllowed()) |
4843 | return 1; |
4844 | |
4845 | // Do not interleave if EVL is preferred and no User IC is specified. |
4846 | if (foldTailWithEVL()) { |
4847 | LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " |
4848 | "Unroll factor forced to be 1.\n" ); |
4849 | return 1; |
4850 | } |
4851 | |
4852 | // The max safe dependence distance already limits VF; do not interleave further. |
4853 | if (!Legal->isSafeForAnyVectorWidth()) |
4854 | return 1; |
4855 | |
4856 | auto BestKnownTC = getSmallBestKnownTC(SE&: *PSE.getSE(), L: TheLoop); |
4857 | const bool HasReductions = !Legal->getReductionVars().empty(); |
4858 | |
4859 | // If we did not calculate the cost for VF (because the user selected the VF) |
4860 | // then we calculate the cost of VF here. |
4861 | if (LoopCost == 0) { |
4862 | LoopCost = expectedCost(VF); |
4863 | assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost" ); |
4864 | |
4865 | // Loop body is free and there is no need for interleaving. |
4866 | if (LoopCost == 0) |
4867 | return 1; |
4868 | } |
4869 | |
4870 | RegisterUsage R = calculateRegisterUsage(VFs: {VF})[0]; |
4871 | // We divide by these constants so assume that we have at least one |
4872 | // instruction that uses at least one register. |
4873 | for (auto& pair : R.MaxLocalUsers) { |
4874 | pair.second = std::max(a: pair.second, b: 1U); |
4875 | } |
4876 | |
4877 | // We calculate the interleave count using the following formula. |
4878 | // Subtract the number of loop invariants from the number of available |
4879 | // registers. These registers are used by all of the interleaved instances. |
4880 | // Next, divide the remaining registers by the number of registers that is |
4881 | // required by the loop, in order to estimate how many parallel instances |
4882 | // fit without causing spills. All of this is rounded down if necessary to be |
4883 | // a power of two. We want a power-of-two interleave count to simplify any |
4884 | // addressing operations or alignment considerations. |
4885 | // We also want power-of-two interleave counts to ensure that the induction |
4886 | // variable of the vector loop wraps to zero, when tail is folded by masking; |
4887 | // this currently happens when OptForSize, in which case IC is set to 1 above. |
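// Worked example of the formula above (hypothetical register counts, ignoring
// the induction-variable adjustment below): with 32 registers in a class,
// 2 loop-invariant values and at most 6 simultaneously live in-loop values,
// the estimate is bit_floor((32 - 2) / 6) = bit_floor(5) = 4.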
4888 | unsigned IC = UINT_MAX; |
4889 | |
4890 | for (auto& pair : R.MaxLocalUsers) { |
4891 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first); |
4892 | LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters |
4893 | << " registers of " |
4894 | << TTI.getRegisterClassName(pair.first) << " register class\n" ); |
4895 | if (VF.isScalar()) { |
4896 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
4897 | TargetNumRegisters = ForceTargetNumScalarRegs; |
4898 | } else { |
4899 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
4900 | TargetNumRegisters = ForceTargetNumVectorRegs; |
4901 | } |
4902 | unsigned MaxLocalUsers = pair.second; |
4903 | unsigned LoopInvariantRegs = 0; |
4904 | if (R.LoopInvariantRegs.find(Key: pair.first) != R.LoopInvariantRegs.end()) |
4905 | LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; |
4906 | |
4907 | unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) / |
4908 | MaxLocalUsers); |
4909 | // Don't count the induction variable as interleaved. |
4910 | if (EnableIndVarRegisterHeur) { |
4911 | TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) / |
4912 | std::max(a: 1U, b: (MaxLocalUsers - 1))); |
4913 | } |
4914 | |
4915 | IC = std::min(a: IC, b: TmpIC); |
4916 | } |
4917 | |
4918 | // Clamp the interleave ranges to reasonable counts. |
4919 | unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); |
4920 | |
4921 | // Check if the user has overridden the max. |
4922 | if (VF.isScalar()) { |
4923 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
4924 | MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; |
4925 | } else { |
4926 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
4927 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; |
4928 | } |
4929 | |
4930 | unsigned EstimatedVF = VF.getKnownMinValue(); |
4931 | if (VF.isScalable()) { |
4932 | if (std::optional<unsigned> VScale = getVScaleForTuning(L: TheLoop, TTI)) |
4933 | EstimatedVF *= *VScale; |
4934 | } |
4935 | assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1" ); |
4936 | |
4937 | unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop); |
4938 | if (KnownTC > 0) { |
4939 | // If a scalar epilogue is required, at least one iteration must remain |
4940 | // scalar, so the maximum iterations available for interleaving is one less. |
4941 | unsigned AvailableTC = |
4942 | requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? KnownTC - 1 : KnownTC; |
4943 | |
4944 | // If trip count is known we select between two prospective ICs, where |
4945 | // 1) the aggressive IC is capped by the trip count divided by VF |
4946 | // 2) the conservative IC is capped by the trip count divided by (VF * 2) |
4947 | // The final IC is selected in a way that the epilogue loop trip count is |
4948 | // minimized while maximizing the IC itself, so that we either run the |
4949 | // vector loop at least once if it generates a small epilogue loop, or else |
4950 | // we run the vector loop at least twice. |
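// Worked example (hypothetical values): AvailableTC = 32, EstimatedVF = 4 and
// MaxInterleaveCount = 8 give InterleaveCountUB = bit_floor(min(32 / 4, 8)) = 8
// and InterleaveCountLB = bit_floor(min(32 / 8, 8)) = 4. Both leave a scalar
// tail of 0 iterations, so the larger count (8) is selected below.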
4951 | |
4952 | unsigned InterleaveCountUB = bit_floor( |
4953 | Value: std::max(a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount))); |
4954 | unsigned InterleaveCountLB = bit_floor(Value: std::max( |
4955 | a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount))); |
4956 | MaxInterleaveCount = InterleaveCountLB; |
4957 | |
4958 | if (InterleaveCountUB != InterleaveCountLB) { |
4959 | unsigned TailTripCountUB = |
4960 | (AvailableTC % (EstimatedVF * InterleaveCountUB)); |
4961 | unsigned TailTripCountLB = |
4962 | (AvailableTC % (EstimatedVF * InterleaveCountLB)); |
4963 | // If both produce the same scalar tail, maximize the IC to do the same |
4964 | // work in fewer vector loop iterations. |
4965 | if (TailTripCountUB == TailTripCountLB) |
4966 | MaxInterleaveCount = InterleaveCountUB; |
4967 | } |
4968 | } else if (BestKnownTC && *BestKnownTC > 0) { |
4969 | // If a scalar epilogue is required, at least one iteration must remain |
4970 | // scalar, so the maximum iterations available for interleaving is one less. |
4971 | unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector()) |
4972 | ? (*BestKnownTC) - 1 |
4973 | : *BestKnownTC; |
4974 | |
4975 | // If the trip count is only an estimated compile-time constant, cap the |
4976 | // IC by the trip count divided by (VF * 2), such that the vector |
4977 | // loop runs at least twice to make interleaving seem profitable when there |
4978 | // is an epilogue loop present. Since the exact trip count is not known we |
4979 | // choose to be conservative in our IC estimate. |
4980 | MaxInterleaveCount = bit_floor(Value: std::max( |
4981 | a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount))); |
4982 | } |
4983 | |
4984 | assert(MaxInterleaveCount > 0 && |
4985 | "Maximum interleave count must be greater than 0" ); |
4986 | |
4987 | // Clamp the calculated IC to be between the 1 and the max interleave count |
4988 | // that the target and trip count allows. |
4989 | if (IC > MaxInterleaveCount) |
4990 | IC = MaxInterleaveCount; |
4991 | else |
4992 | // Make sure IC is greater than 0. |
4993 | IC = std::max(a: 1u, b: IC); |
4994 | |
4995 | assert(IC > 0 && "Interleave count must be greater than 0." ); |
4996 | |
4997 | // Interleave if we vectorized this loop and there is a reduction that could |
4998 | // benefit from interleaving. |
4999 | if (VF.isVector() && HasReductions) { |
5000 | LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n" ); |
5001 | return IC; |
5002 | } |
5003 | |
5004 | // For any scalar loop that either requires runtime checks or predication we |
5005 | // are better off leaving this to the unroller. Note that if we've already |
5006 | // vectorized the loop we will have done the runtime check and so interleaving |
5007 | // won't require further checks. |
5008 | bool ScalarInterleavingRequiresPredication = |
5009 | (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) { |
5010 | return Legal->blockNeedsPredication(BB); |
5011 | })); |
5012 | bool ScalarInterleavingRequiresRuntimePointerCheck = |
5013 | (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); |
5014 | |
5015 | // We want to interleave small loops in order to reduce the loop overhead and |
5016 | // potentially expose ILP opportunities. |
5017 | LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' |
5018 | << "LV: IC is " << IC << '\n' |
5019 | << "LV: VF is " << VF << '\n'); |
5020 | const bool AggressivelyInterleaveReductions = |
5021 | TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions); |
5022 | if (!ScalarInterleavingRequiresRuntimePointerCheck && |
5023 | !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { |
5024 | // We assume that the cost overhead is 1 and we use the cost model |
5025 | // to estimate the cost of the loop and interleave until the cost of the |
5026 | // loop overhead is about 5% of the cost of the loop. |
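// Worked example (hypothetical costs, assuming a SmallLoopCost of 20): a loop
// body cost of 5 gives SmallIC = min(IC, bit_floor(20 / 5)) = min(IC, 4).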
5027 | unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>( |
5028 | Value: SmallLoopCost / *LoopCost.getValue())); |
5029 | |
5030 | // Interleave until store/load ports (estimated by max interleave count) are |
5031 | // saturated. |
5032 | unsigned NumStores = Legal->getNumStores(); |
5033 | unsigned NumLoads = Legal->getNumLoads(); |
5034 | unsigned StoresIC = IC / (NumStores ? NumStores : 1); |
5035 | unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); |
5036 | |
5037 | // There is little point in interleaving for reductions containing selects |
5038 | // and compares when VF=1 since it may just create more overhead than it's |
5039 | // worth for loops with small trip counts. This is because we still have to |
5040 | // do the final reduction after the loop. |
5041 | bool HasSelectCmpReductions = |
5042 | HasReductions && |
5043 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
5044 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
5045 | return RecurrenceDescriptor::isAnyOfRecurrenceKind( |
5046 | Kind: RdxDesc.getRecurrenceKind()); |
5047 | }); |
5048 | if (HasSelectCmpReductions) { |
5049 | LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n" ); |
5050 | return 1; |
5051 | } |
5052 | |
5053 | // If we have a scalar reduction (vector reductions are already dealt with |
5054 | // by this point), we can increase the critical path length if the loop |
5055 | // we're interleaving is inside another loop. For tree-wise reductions |
5056 | // set the limit to 2, and for ordered reductions it's best to disable |
5057 | // interleaving entirely. |
5058 | if (HasReductions && TheLoop->getLoopDepth() > 1) { |
5059 | bool HasOrderedReductions = |
5060 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
5061 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
5062 | return RdxDesc.isOrdered(); |
5063 | }); |
5064 | if (HasOrderedReductions) { |
5065 | LLVM_DEBUG( |
5066 | dbgs() << "LV: Not interleaving scalar ordered reductions.\n" ); |
5067 | return 1; |
5068 | } |
5069 | |
5070 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); |
5071 | SmallIC = std::min(a: SmallIC, b: F); |
5072 | StoresIC = std::min(a: StoresIC, b: F); |
5073 | LoadsIC = std::min(a: LoadsIC, b: F); |
5074 | } |
5075 | |
5076 | if (EnableLoadStoreRuntimeInterleave && |
5077 | std::max(a: StoresIC, b: LoadsIC) > SmallIC) { |
5078 | LLVM_DEBUG( |
5079 | dbgs() << "LV: Interleaving to saturate store or load ports.\n" ); |
5080 | return std::max(a: StoresIC, b: LoadsIC); |
5081 | } |
5082 | |
5083 | // If there are scalar reductions and TTI has enabled aggressive |
5084 | // interleaving for reductions, we will interleave to expose ILP. |
5085 | if (VF.isScalar() && AggressivelyInterleaveReductions) { |
5086 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
5087 | // Interleave no less than SmallIC but not as aggressive as the normal IC |
5088 | // to satisfy the rare situation when resources are too limited. |
5089 | return std::max(a: IC / 2, b: SmallIC); |
5090 | } else { |
5091 | LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n" ); |
5092 | return SmallIC; |
5093 | } |
5094 | } |
5095 | |
5096 | // Interleave if this is a large loop (small loops are already dealt with by |
5097 | // this point) that could benefit from interleaving. |
5098 | if (AggressivelyInterleaveReductions) { |
5099 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
5100 | return IC; |
5101 | } |
5102 | |
5103 | LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n" ); |
5104 | return 1; |
5105 | } |
5106 | |
5107 | SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> |
5108 | LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { |
5109 | // This function calculates the register usage by measuring the highest number |
5110 | // of values that are alive at a single location. Obviously, this is a very |
5111 | // rough estimation. We scan the loop in topological order and |
5112 | // assign a number to each instruction. We use RPO to ensure that defs are |
5113 | // met before their users. We assume that each instruction that has in-loop |
5114 | // users starts an interval. We record every time that an in-loop value is |
5115 | // used, so we have a list of the first and last occurrences of each |
5116 | // instruction. Next, we transpose this data structure into a multi map that |
5117 | // holds the list of intervals that *end* at a specific location. This multi |
5118 | // map allows us to perform a linear search. We scan the instructions linearly |
5119 | // and record each time that a new interval starts, by placing it in a set. |
5120 | // If we find this value in the multi-map then we remove it from the set. |
5121 | // The max register usage is the maximum size of the set. |
5122 | // We also search for instructions that are defined outside the loop, but are |
5123 | // used inside the loop. We need this number separately from the max-interval |
5124 | // usage number because when we unroll, loop-invariant values do not take |
5125 | // more registers. |
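// Minimal sketch of the interval idea (illustrative values, not this
// function's actual data): with instructions numbered 0..3, where the value
// defined at 0 is last used at 2 and the value defined at 1 is last used at 3,
// the set of open intervals peaks at two entries between points 1 and 2, i.e.
// an estimated pressure of two registers for that class.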
5126 | LoopBlocksDFS DFS(TheLoop); |
5127 | DFS.perform(LI); |
5128 | |
5129 | RegisterUsage RU; |
5130 | |
5131 | // Each 'key' in the map opens a new interval. The values |
5132 | // of the map are the index of the 'last seen' usage of the |
5133 | // instruction that is the key. |
5134 | using IntervalMap = DenseMap<Instruction *, unsigned>; |
5135 | |
5136 | // Maps instruction to its index. |
5137 | SmallVector<Instruction *, 64> IdxToInstr; |
5138 | // Marks the end of each interval. |
5139 | IntervalMap EndPoint; |
5140 | // Saves the list of instruction indices that are used in the loop. |
5141 | SmallPtrSet<Instruction *, 8> Ends; |
5142 | // Saves the list of values that are used in the loop but are defined outside |
5143 | // the loop (not including non-instruction values such as arguments and |
5144 | // constants). |
5145 | SmallSetVector<Instruction *, 8> LoopInvariants; |
5146 | |
5147 | for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) { |
5148 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5149 | IdxToInstr.push_back(Elt: &I); |
5150 | |
5151 | // Save the end location of each USE. |
5152 | for (Value *U : I.operands()) { |
5153 | auto *Instr = dyn_cast<Instruction>(Val: U); |
5154 | |
5155 | // Ignore non-instruction values such as arguments, constants, etc. |
5156 | // FIXME: Might need some motivation why these values are ignored. If |
5157 | // for example an argument is used inside the loop it will increase the |
5158 | // register pressure (so shouldn't we add it to LoopInvariants?). |
5159 | if (!Instr) |
5160 | continue; |
5161 | |
5162 | // If this instruction is outside the loop then record it and continue. |
5163 | if (!TheLoop->contains(Inst: Instr)) { |
5164 | LoopInvariants.insert(X: Instr); |
5165 | continue; |
5166 | } |
5167 | |
5168 | // Overwrite previous end points. |
5169 | EndPoint[Instr] = IdxToInstr.size(); |
5170 | Ends.insert(Ptr: Instr); |
5171 | } |
5172 | } |
5173 | } |
5174 | |
5175 | // Saves the list of intervals that end with the index in 'key'. |
5176 | using InstrList = SmallVector<Instruction *, 2>; |
5177 | DenseMap<unsigned, InstrList> TransposeEnds; |
5178 | |
5179 | // Transpose the EndPoints to a list of values that end at each index. |
5180 | for (auto &Interval : EndPoint) |
5181 | TransposeEnds[Interval.second].push_back(Elt: Interval.first); |
5182 | |
5183 | SmallPtrSet<Instruction *, 8> OpenIntervals; |
5184 | SmallVector<RegisterUsage, 8> RUs(VFs.size()); |
5185 | SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); |
5186 | |
5187 | LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n" ); |
5188 | |
5189 | const auto &TTICapture = TTI; |
5190 | auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { |
5191 | if (Ty->isTokenTy() || !VectorType::isValidElementType(ElemTy: Ty)) |
5192 | return 0; |
5193 | return TTICapture.getRegUsageForType(Ty: VectorType::get(ElementType: Ty, EC: VF)); |
5194 | }; |
5195 | |
5196 | for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { |
5197 | Instruction *I = IdxToInstr[i]; |
5198 | |
5199 | // Remove all of the instructions that end at this location. |
5200 | InstrList &List = TransposeEnds[i]; |
5201 | for (Instruction *ToRemove : List) |
5202 | OpenIntervals.erase(Ptr: ToRemove); |
5203 | |
5204 | // Ignore instructions that are never used within the loop. |
5205 | if (!Ends.count(Ptr: I)) |
5206 | continue; |
5207 | |
5208 | // Skip ignored values. |
5209 | if (ValuesToIgnore.count(Ptr: I)) |
5210 | continue; |
5211 | |
5212 | collectInLoopReductions(); |
5213 | |
5214 | // For each VF find the maximum usage of registers. |
5215 | for (unsigned j = 0, e = VFs.size(); j < e; ++j) { |
5216 | // Count the number of registers used, per register class, given all open |
5217 | // intervals. |
5218 | // Note that elements in this SmallMapVector will be default constructed |
5219 | // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if |
5220 | // there is no previous entry for ClassID. |
5221 | SmallMapVector<unsigned, unsigned, 4> RegUsage; |
5222 | |
5223 | if (VFs[j].isScalar()) { |
5224 | for (auto *Inst : OpenIntervals) { |
5225 | unsigned ClassID = |
5226 | TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType()); |
5227 | // FIXME: The target might use more than one register for the type |
5228 | // even in the scalar case. |
5229 | RegUsage[ClassID] += 1; |
5230 | } |
5231 | } else { |
5232 | collectUniformsAndScalars(VF: VFs[j]); |
5233 | for (auto *Inst : OpenIntervals) { |
5234 | // Skip ignored values for VF > 1. |
5235 | if (VecValuesToIgnore.count(Ptr: Inst)) |
5236 | continue; |
5237 | if (isScalarAfterVectorization(I: Inst, VF: VFs[j])) { |
5238 | unsigned ClassID = |
5239 | TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType()); |
5240 | // FIXME: The target might use more than one register for the type |
5241 | // even in the scalar case. |
5242 | RegUsage[ClassID] += 1; |
5243 | } else { |
5244 | unsigned ClassID = |
5245 | TTI.getRegisterClassForType(Vector: true, Ty: Inst->getType()); |
5246 | RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); |
5247 | } |
5248 | } |
5249 | } |
5250 | |
5251 | for (auto& pair : RegUsage) { |
5252 | auto &Entry = MaxUsages[j][pair.first]; |
5253 | Entry = std::max(a: Entry, b: pair.second); |
5254 | } |
5255 | } |
5256 | |
5257 | LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " |
5258 | << OpenIntervals.size() << '\n'); |
5259 | |
5260 | // Add the current instruction to the list of open intervals. |
5261 | OpenIntervals.insert(Ptr: I); |
5262 | } |
5263 | |
5264 | for (unsigned i = 0, e = VFs.size(); i < e; ++i) { |
5265 | // Note that elements in this SmallMapVector will be default constructed |
5266 | // as 0. So we can use "Invariant[ClassID] += n" in the code below even if |
5267 | // there is no previous entry for ClassID. |
5268 | SmallMapVector<unsigned, unsigned, 4> Invariant; |
5269 | |
5270 | for (auto *Inst : LoopInvariants) { |
5271 | // FIXME: The target might use more than one register for the type |
5272 | // even in the scalar case. |
5273 | bool IsScalar = all_of(Range: Inst->users(), P: [&](User *U) { |
5274 | auto *I = cast<Instruction>(Val: U); |
5275 | return TheLoop != LI->getLoopFor(BB: I->getParent()) || |
5276 | isScalarAfterVectorization(I, VF: VFs[i]); |
5277 | }); |
5278 | |
5279 | ElementCount VF = IsScalar ? ElementCount::getFixed(MinVal: 1) : VFs[i]; |
5280 | unsigned ClassID = |
5281 | TTI.getRegisterClassForType(Vector: VF.isVector(), Ty: Inst->getType()); |
5282 | Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); |
5283 | } |
5284 | |
5285 | LLVM_DEBUG({ |
5286 | dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; |
5287 | dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() |
5288 | << " item\n" ; |
5289 | for (const auto &pair : MaxUsages[i]) { |
5290 | dbgs() << "LV(REG): RegisterClass: " |
5291 | << TTI.getRegisterClassName(pair.first) << ", " << pair.second |
5292 | << " registers\n" ; |
5293 | } |
5294 | dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() |
5295 | << " item\n" ; |
5296 | for (const auto &pair : Invariant) { |
5297 | dbgs() << "LV(REG): RegisterClass: " |
5298 | << TTI.getRegisterClassName(pair.first) << ", " << pair.second |
5299 | << " registers\n" ; |
5300 | } |
5301 | }); |
5302 | |
5303 | RU.LoopInvariantRegs = Invariant; |
5304 | RU.MaxLocalUsers = MaxUsages[i]; |
5305 | RUs[i] = RU; |
5306 | } |
5307 | |
5308 | return RUs; |
5309 | } |
5310 | |
5311 | bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, |
5312 | ElementCount VF) { |
5313 | // TODO: Cost model for emulated masked load/store is completely |
5314 | // broken. This hack guides the cost model to use an artificially |
5315 | // high enough value to practically disable vectorization with such |
5316 | // operations, except where previously deployed legality hack allowed |
5317 | // using very low cost values. This is to avoid regressions coming simply |
5318 | // from moving "masked load/store" check from legality to cost model. |
5319 | // Masked Load/Gather emulation was previously never allowed. |
5320 | // Only a limited number of Masked Store/Scatter emulations were allowed. |
5321 | assert((isPredicatedInst(I)) && |
5322 | "Expecting a scalar emulated instruction" ); |
5323 | return isa<LoadInst>(Val: I) || |
5324 | (isa<StoreInst>(Val: I) && |
5325 | NumPredStores > NumberOfStoresToPredicate); |
5326 | } |
5327 | |
5328 | void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { |
5329 | // If we aren't vectorizing the loop, or if we've already collected the |
5330 | // instructions to scalarize, there's nothing to do. Collection may already |
5331 | // have occurred if we have a user-selected VF and are now computing the |
5332 | // expected cost for interleaving. |
5333 | if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(Val: VF)) |
5334 | return; |
5335 | |
5336 | // Initialize a mapping for VF in InstsToScalarize. If we find that it's |
5337 | // not profitable to scalarize any instructions, the presence of VF in the |
5338 | // map will indicate that we've analyzed it already. |
5339 | ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; |
5340 | |
5341 | PredicatedBBsAfterVectorization[VF].clear(); |
5342 | |
5343 | // Find all the instructions that are scalar with predication in the loop and |
5344 | // determine if it would be better to not if-convert the blocks they are in. |
5345 | // If so, we also record the instructions to scalarize. |
5346 | for (BasicBlock *BB : TheLoop->blocks()) { |
5347 | if (!blockNeedsPredicationForAnyReason(BB)) |
5348 | continue; |
5349 | for (Instruction &I : *BB) |
5350 | if (isScalarWithPredication(I: &I, VF)) { |
5351 | ScalarCostsTy ScalarCosts; |
5352 | // Do not apply discount logic for: |
5353 | // 1. Scalars after vectorization, as there will only be a single copy |
5354 | // of the instruction. |
5355 | // 2. Scalable VF, as that would lead to invalid scalarization costs. |
5356 | // 3. Emulated masked memrefs, if a hacked cost is needed. |
5357 | if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() && |
5358 | !useEmulatedMaskMemRefHack(I: &I, VF) && |
5359 | computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) |
5360 | ScalarCostsVF.insert(I: ScalarCosts.begin(), E: ScalarCosts.end()); |
5361 | // Remember that BB will remain after vectorization. |
5362 | PredicatedBBsAfterVectorization[VF].insert(Ptr: BB); |
5363 | for (auto *Pred : predecessors(BB)) { |
5364 | if (Pred->getSingleSuccessor() == BB) |
5365 | PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred); |
5366 | } |
5367 | } |
5368 | } |
5369 | } |
5370 | |
5371 | InstructionCost LoopVectorizationCostModel::computePredInstDiscount( |
5372 | Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { |
5373 | assert(!isUniformAfterVectorization(PredInst, VF) && |
5374 | "Instruction marked uniform-after-vectorization will be predicated" ); |
5375 | |
5376 | // Initialize the discount to zero, meaning that the scalar version and the |
5377 | // vector version cost the same. |
5378 | InstructionCost Discount = 0; |
5379 | |
5380 | // Holds instructions to analyze. The instructions we visit are mapped in |
5381 | // ScalarCosts. Those instructions are the ones that would be scalarized if |
5382 | // we find that the scalar version costs less. |
5383 | SmallVector<Instruction *, 8> Worklist; |
5384 | |
5385 | // Returns true if the given instruction can be scalarized. |
5386 | auto canBeScalarized = [&](Instruction *I) -> bool { |
5387 | // We only attempt to scalarize instructions forming a single-use chain |
5388 | // from the original predicated block that would otherwise be vectorized. |
5389 | // Although not strictly necessary, we give up on instructions we know will |
5390 | // already be scalar to avoid traversing chains that are unlikely to be |
5391 | // beneficial. |
5392 | if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || |
5393 | isScalarAfterVectorization(I, VF)) |
5394 | return false; |
5395 | |
5396 | // If the instruction is scalar with predication, it will be analyzed |
5397 | // separately. We ignore it within the context of PredInst. |
5398 | if (isScalarWithPredication(I, VF)) |
5399 | return false; |
5400 | |
5401 | // If any of the instruction's operands are uniform after vectorization, |
5402 | // the instruction cannot be scalarized. This prevents, for example, a |
5403 | // masked load from being scalarized. |
5404 | // |
5405 | // We assume we will only emit a value for lane zero of an instruction |
5406 | // marked uniform after vectorization, rather than VF identical values. |
5407 | // Thus, if we scalarize an instruction that uses a uniform, we would |
5408 | // create uses of values corresponding to the lanes we aren't emitting code |
5409 | // for. This behavior can be changed by allowing getScalarValue to clone |
5410 | // the lane zero values for uniforms rather than asserting. |
5411 | for (Use &U : I->operands()) |
5412 | if (auto *J = dyn_cast<Instruction>(Val: U.get())) |
5413 | if (isUniformAfterVectorization(I: J, VF)) |
5414 | return false; |
5415 | |
5416 | // Otherwise, we can scalarize the instruction. |
5417 | return true; |
5418 | }; |
5419 | |
5420 | // Compute the expected cost discount from scalarizing the entire expression |
5421 | // feeding the predicated instruction. We currently only consider expressions |
5422 | // that are single-use instruction chains. |
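// Worked example (hypothetical costs): if the chain feeding PredInst costs 10
// when vectorized and 12 when fully scalarized before scaling, and the
// predicated block is assumed to execute half the time, the scaled scalar cost
// is 12 / 2 = 6 and the accumulated discount is 10 - 6 = 4 >= 0, so
// scalarization is considered profitable.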
5423 | Worklist.push_back(Elt: PredInst); |
5424 | while (!Worklist.empty()) { |
5425 | Instruction *I = Worklist.pop_back_val(); |
5426 | |
5427 | // If we've already analyzed the instruction, there's nothing to do. |
5428 | if (ScalarCosts.contains(Val: I)) |
5429 | continue; |
5430 | |
5431 | // Compute the cost of the vector instruction. Note that this cost already |
5432 | // includes the scalarization overhead of the predicated instruction. |
5433 | InstructionCost VectorCost = getInstructionCost(I, VF); |
5434 | |
5435 | // Compute the cost of the scalarized instruction. This cost is the cost of |
5436 | // the instruction as if it wasn't if-converted and instead remained in the |
5437 | // predicated block. We will scale this cost by block probability after |
5438 | // computing the scalarization overhead. |
5439 | InstructionCost ScalarCost = |
5440 | VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)); |
5441 | |
5442 | // Compute the scalarization overhead of needed insertelement instructions |
5443 | // and phi nodes. |
5444 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
5445 | if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { |
5446 | ScalarCost += TTI.getScalarizationOverhead( |
5447 | Ty: cast<VectorType>(Val: ToVectorTy(Scalar: I->getType(), EC: VF)), |
5448 | DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ true, |
5449 | /*Extract*/ false, CostKind); |
5450 | ScalarCost += |
5451 | VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind); |
5452 | } |
5453 | |
5454 | // Compute the scalarization overhead of needed extractelement |
5455 | // instructions. For each of the instruction's operands, if the operand can |
5456 | // be scalarized, add it to the worklist; otherwise, account for the |
5457 | // overhead. |
5458 | for (Use &U : I->operands()) |
5459 | if (auto *J = dyn_cast<Instruction>(Val: U.get())) { |
5460 | assert(VectorType::isValidElementType(J->getType()) && |
5461 | "Instruction has non-scalar type" ); |
5462 | if (canBeScalarized(J)) |
5463 | Worklist.push_back(Elt: J); |
5464 | else if (needsExtract(V: J, VF)) { |
5465 | ScalarCost += TTI.getScalarizationOverhead( |
5466 | Ty: cast<VectorType>(Val: ToVectorTy(Scalar: J->getType(), EC: VF)), |
5467 | DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false, |
5468 | /*Extract*/ true, CostKind); |
5469 | } |
5470 | } |
5471 | |
5472 | // Scale the total scalar cost by block probability. |
5473 | ScalarCost /= getReciprocalPredBlockProb(); |
5474 | |
5475 | // Compute the discount. A non-negative discount means the vector version |
5476 | // of the instruction costs more, and scalarizing would be beneficial. |
5477 | Discount += VectorCost - ScalarCost; |
5478 | ScalarCosts[I] = ScalarCost; |
5479 | } |
5480 | |
5481 | return Discount; |
5482 | } |
5483 | |
5484 | InstructionCost LoopVectorizationCostModel::expectedCost( |
5485 | ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { |
5486 | InstructionCost Cost; |
5487 | |
5488 | // For each block. |
5489 | for (BasicBlock *BB : TheLoop->blocks()) { |
5490 | InstructionCost BlockCost; |
5491 | |
5492 | // For each instruction in the old loop. |
5493 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5494 | // Skip ignored values. |
5495 | if (ValuesToIgnore.count(Ptr: &I) || |
5496 | (VF.isVector() && VecValuesToIgnore.count(Ptr: &I))) |
5497 | continue; |
5498 | |
5499 | InstructionCost C = getInstructionCost(I: &I, VF); |
5500 | |
5501 | // Check if we should override the cost. |
5502 | if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) |
5503 | C = InstructionCost(ForceTargetInstructionCost); |
5504 | |
5505 | // Keep a list of instructions with invalid costs. |
5506 | if (Invalid && !C.isValid()) |
5507 | Invalid->emplace_back(Args: &I, Args&: VF); |
5508 | |
5509 | BlockCost += C; |
5510 | LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " |
5511 | << VF << " For instruction: " << I << '\n'); |
5512 | } |
5513 | |
5514 | // If we are vectorizing a predicated block, it will have been |
5515 | // if-converted. This means that the block's instructions (aside from |
5516 | // stores and instructions that may divide by zero) will now be |
5517 | // unconditionally executed. For the scalar case, we may not always execute |
5518 | // the predicated block, if it is an if-else block. Thus, scale the block's |
5519 | // cost by the probability of executing it. blockNeedsPredication from |
5520 | // Legal is used so as to not include all blocks in tail folded loops. |
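// Illustrative example (hypothetical numbers): a predicated block with cost 8
// and an assumed 50% execution probability contributes 8 / 2 = 4 to the scalar
// loop cost.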
5521 | if (VF.isScalar() && Legal->blockNeedsPredication(BB)) |
5522 | BlockCost /= getReciprocalPredBlockProb(); |
5523 | |
5524 | Cost += BlockCost; |
5525 | } |
5526 | |
5527 | return Cost; |
5528 | } |
5529 | |
5530 | /// Gets Address Access SCEV after verifying that the access pattern |
5531 | /// is loop invariant except the induction variable dependence. |
5532 | /// |
5533 | /// This SCEV can be sent to the Target in order to estimate the address |
5534 | /// calculation cost. |
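// Example of an access pattern this accepts (illustrative IR, not taken from a
// test): a GEP whose indices are all loop invariant except one induction
// variable, e.g.
//   %gep = getelementptr [256 x i32], ptr %A, i64 %inv, i64 %iv
// where %inv is loop invariant and %iv is the induction variable.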
5535 | static const SCEV *getAddressAccessSCEV( |
5536 | Value *Ptr, |
5537 | LoopVectorizationLegality *Legal, |
5538 | PredicatedScalarEvolution &PSE, |
5539 | const Loop *TheLoop) { |
5540 | |
5541 | auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr); |
5542 | if (!Gep) |
5543 | return nullptr; |
5544 | |
5545 | // We are looking for a gep with all loop invariant indices except for one |
5546 | // which should be an induction variable. |
5547 | auto SE = PSE.getSE(); |
5548 | unsigned NumOperands = Gep->getNumOperands(); |
5549 | for (unsigned i = 1; i < NumOperands; ++i) { |
5550 | Value *Opd = Gep->getOperand(i_nocapture: i); |
5551 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) && |
5552 | !Legal->isInductionVariable(V: Opd)) |
5553 | return nullptr; |
5554 | } |
5555 | |
5556 | // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV. |
5557 | return PSE.getSCEV(V: Ptr); |
5558 | } |
5559 | |
5560 | InstructionCost |
5561 | LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, |
5562 | ElementCount VF) { |
5563 | assert(VF.isVector() && |
5564 | "Scalarization cost of instruction implies vectorization." ); |
5565 | if (VF.isScalable()) |
5566 | return InstructionCost::getInvalid(); |
5567 | |
5568 | Type *ValTy = getLoadStoreType(I); |
5569 | auto SE = PSE.getSE(); |
5570 | |
5571 | unsigned AS = getLoadStoreAddressSpace(I); |
5572 | Value *Ptr = getLoadStorePointerOperand(V: I); |
5573 | Type *PtrTy = ToVectorTy(Scalar: Ptr->getType(), EC: VF); |
5574 | // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` |
5575 | // that it is being called from this specific place. |
5576 | |
5577 | // Figure out whether the access is strided and get the stride value |
5578 | // if it's known at compile time. |
5579 | const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); |
5580 | |
5581 | // Get the cost of the scalar memory instruction and address computation. |
5582 | InstructionCost Cost = |
5583 | VF.getKnownMinValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV); |
5584 | |
5585 | // Don't pass *I here, since it is scalar but will actually be part of a |
5586 | // vectorized loop where the user of it is a vectorized instruction. |
5587 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
5588 | const Align Alignment = getLoadStoreAlignment(I); |
5589 | Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(), |
5590 | Src: ValTy->getScalarType(), |
5591 | Alignment, AddressSpace: AS, CostKind); |
5592 | |
5593 | // Get the overhead of the extractelement and insertelement instructions |
5594 | // we might create due to scalarization. |
5595 | Cost += getScalarizationOverhead(I, VF, CostKind); |
5596 | |
5597 | // If we have a predicated load/store, it will need extra i1 extracts and |
5598 | // conditional branches, but may not be executed for each vector lane. Scale |
5599 | // the cost by the probability of executing the predicated block. |
5600 | if (isPredicatedInst(I)) { |
5601 | Cost /= getReciprocalPredBlockProb(); |
5602 | |
5603 | // Add the cost of an i1 extract and a branch |
5604 | auto *Vec_i1Ty = |
5605 | VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF); |
5606 | Cost += TTI.getScalarizationOverhead( |
5607 | Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()), |
5608 | /*Insert=*/false, /*Extract=*/true, CostKind); |
5609 | Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind); |
5610 | |
5611 | if (useEmulatedMaskMemRefHack(I, VF)) |
5612 | // Artificially setting to a high enough value to practically disable |
5613 | // vectorization with such operations. |
5614 | Cost = 3000000; |
5615 | } |
5616 | |
5617 | return Cost; |
5618 | } |
5619 | |
5620 | InstructionCost |
5621 | LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, |
5622 | ElementCount VF) { |
5623 | Type *ValTy = getLoadStoreType(I); |
5624 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
5625 | Value *Ptr = getLoadStorePointerOperand(V: I); |
5626 | unsigned AS = getLoadStoreAddressSpace(I); |
5627 | int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr); |
5628 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
5629 | |
5630 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
5631 | "Stride should be 1 or -1 for consecutive memory access" ); |
5632 | const Align Alignment = getLoadStoreAlignment(I); |
5633 | InstructionCost Cost = 0; |
5634 | if (Legal->isMaskRequired(I)) { |
5635 | Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
5636 | CostKind); |
5637 | } else { |
5638 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
5639 | Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
5640 | CostKind, OpdInfo: OpInfo, I); |
5641 | } |
5642 | |
5643 | bool Reverse = ConsecutiveStride < 0; |
5644 | if (Reverse) |
5645 | Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy, |
5646 | Mask: std::nullopt, CostKind, Index: 0); |
5647 | return Cost; |
5648 | } |
5649 | |
5650 | InstructionCost |
5651 | LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, |
5652 | ElementCount VF) { |
5653 | assert(Legal->isUniformMemOp(*I, VF)); |
5654 | |
5655 | Type *ValTy = getLoadStoreType(I); |
5656 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
5657 | const Align Alignment = getLoadStoreAlignment(I); |
5658 | unsigned AS = getLoadStoreAddressSpace(I); |
5659 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
5660 | if (isa<LoadInst>(Val: I)) { |
5661 | return TTI.getAddressComputationCost(Ty: ValTy) + |
5662 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS, |
5663 | CostKind) + |
5664 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VectorTy); |
5665 | } |
5666 | StoreInst *SI = cast<StoreInst>(Val: I); |
5667 | |
5668 | bool isLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand()); |
5669 | return TTI.getAddressComputationCost(Ty: ValTy) + |
5670 | TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, |
5671 | CostKind) + |
5672 | (isLoopInvariantStoreValue |
5673 | ? 0 |
5674 | : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy, |
5675 | CostKind, Index: VF.getKnownMinValue() - 1)); |
5676 | } |
5677 | |
5678 | InstructionCost |
5679 | LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, |
5680 | ElementCount VF) { |
5681 | Type *ValTy = getLoadStoreType(I); |
5682 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
5683 | const Align Alignment = getLoadStoreAlignment(I); |
5684 | const Value *Ptr = getLoadStorePointerOperand(V: I); |
5685 | |
5686 | return TTI.getAddressComputationCost(Ty: VectorTy) + |
5687 | TTI.getGatherScatterOpCost( |
5688 | Opcode: I->getOpcode(), DataTy: VectorTy, Ptr, VariableMask: Legal->isMaskRequired(I), Alignment, |
5689 | CostKind: TargetTransformInfo::TCK_RecipThroughput, I); |
5690 | } |
5691 | |
5692 | InstructionCost |
5693 | LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, |
5694 | ElementCount VF) { |
5695 | Type *ValTy = getLoadStoreType(I); |
5696 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
5697 | unsigned AS = getLoadStoreAddressSpace(I); |
5698 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
5699 | |
5700 | auto Group = getInterleavedAccessGroup(Instr: I); |
5701 | assert(Group && "Fail to get an interleaved access group." ); |
5702 | |
5703 | unsigned InterleaveFactor = Group->getFactor(); |
5704 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
5705 | |
5706 | // Holds the indices of existing members in the interleaved group. |
5707 | SmallVector<unsigned, 4> Indices; |
5708 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
5709 | if (Group->getMember(Index: IF)) |
5710 | Indices.push_back(Elt: IF); |
5711 | |
5712 | // Calculate the cost of the whole interleaved group. |
5713 | bool UseMaskForGaps = |
5714 | (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || |
5715 | (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor())); |
5716 | InstructionCost Cost = TTI.getInterleavedMemoryOpCost( |
5717 | Opcode: I->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices, Alignment: Group->getAlign(), |
5718 | AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I), UseMaskForGaps); |
5719 | |
5720 | if (Group->isReverse()) { |
5721 | // TODO: Add support for reversed masked interleaved access. |
5722 | assert(!Legal->isMaskRequired(I) && |
5723 | "Reverse masked interleaved access not supported." ); |
5724 | Cost += Group->getNumMembers() * |
5725 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy, |
5726 | Mask: std::nullopt, CostKind, Index: 0); |
5727 | } |
5728 | return Cost; |
5729 | } |
5730 | |
5731 | std::optional<InstructionCost> |
5732 | LoopVectorizationCostModel::getReductionPatternCost( |
5733 | Instruction *I, ElementCount VF, Type *Ty, |
5734 | TTI::TargetCostKind CostKind) const { |
5735 | using namespace llvm::PatternMatch; |
5736 | // Early exit for no inloop reductions |
5737 | if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty)) |
5738 | return std::nullopt; |
5739 | auto *VectorTy = cast<VectorType>(Val: Ty); |
5740 | |
5741 | // We are looking for one of the following patterns, and the minimal acceptable cost: |
5742 | // reduce(mul(ext(A), ext(B))) or |
5743 | // reduce(mul(A, B)) or |
5744 | // reduce(ext(A)) or |
5745 | // reduce(A). |
5746 | // The basic idea is that we walk down the tree to do that, finding the root |
5747 | // reduction instruction in InLoopReductionImmediateChains. From there we find |
5748 | // the pattern of mul/ext and test the cost of the entire pattern vs the cost |
5749 | // of the components. If the reduction cost is lower then we return it for the |
5750 | // reduction instruction and 0 for the other instructions in the pattern. If |
5751 | // it is not, we return an invalid cost specifying the original cost method |
5752 | // should be used. |
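// For example (illustrative IR, assuming a <4 x i8> -> i32 dot-product style
// chain), reduce.add(mul(ext(A), ext(B))) corresponds to:
//   %ea  = sext <4 x i8> %A to <4 x i32>
//   %eb  = sext <4 x i8> %B to <4 x i32>
//   %mul = mul <4 x i32> %ea, %eb
//   %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
//   %acc = add i32 %phi, %red
// and may be costed as a single multiply-accumulate reduction rather than as
// its individual components.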
5753 | Instruction *RetI = I; |
5754 | if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) { |
5755 | if (!RetI->hasOneUser()) |
5756 | return std::nullopt; |
5757 | RetI = RetI->user_back(); |
5758 | } |
5759 | |
5760 | if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) && |
5761 | RetI->user_back()->getOpcode() == Instruction::Add) { |
5762 | RetI = RetI->user_back(); |
5763 | } |
5764 | |
5765 | // Test if the found instruction is a reduction, and if not return an invalid |
5766 | // cost specifying the parent to use the original cost modelling. |
5767 | if (!InLoopReductionImmediateChains.count(Val: RetI)) |
5768 | return std::nullopt; |
5769 | |
5770 | // Find the reduction this chain is a part of and calculate the basic cost of |
5771 | // the reduction on its own. |
5772 | Instruction *LastChain = InLoopReductionImmediateChains.at(Val: RetI); |
5773 | Instruction *ReductionPhi = LastChain; |
5774 | while (!isa<PHINode>(Val: ReductionPhi)) |
5775 | ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi); |
5776 | |
5777 | const RecurrenceDescriptor &RdxDesc = |
5778 | Legal->getReductionVars().find(Key: cast<PHINode>(Val: ReductionPhi))->second; |
5779 | |
5780 | InstructionCost BaseCost; |
5781 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
5782 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) { |
5783 | Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); |
5784 | BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy, |
5785 | FMF: RdxDesc.getFastMathFlags(), CostKind); |
5786 | } else { |
5787 | BaseCost = TTI.getArithmeticReductionCost( |
5788 | Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind); |
5789 | } |
5790 | |
5791 | // For a call to the llvm.fmuladd intrinsic we need to add the cost of a |
5792 | // normal fmul instruction to the cost of the fadd reduction. |
5793 | if (RK == RecurKind::FMulAdd) |
5794 | BaseCost += |
5795 | TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind); |
5796 | |
5797 | // If we're using ordered reductions then we can just return the base cost |
5798 | // here, since getArithmeticReductionCost calculates the full ordered |
5799 | // reduction cost when FP reassociation is not allowed. |
5800 | if (useOrderedReductions(RdxDesc)) |
5801 | return BaseCost; |
5802 | |
5803 | // Get the operand that was not the reduction chain and match it to one of the |
5804 | // patterns, returning the better cost if it is found. |
5805 | Instruction *RedOp = RetI->getOperand(i: 1) == LastChain |
5806 | ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0)) |
5807 | : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1)); |
5808 | |
5809 | VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy); |
5810 | |
5811 | Instruction *Op0, *Op1; |
5812 | if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
5813 | match(V: RedOp, |
5814 | P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) && |
5815 | match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
5816 | Op0->getOpcode() == Op1->getOpcode() && |
5817 | Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() && |
5818 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) && |
5819 | (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { |
5820 | |
5821 | // Matched reduce.add(ext(mul(ext(A), ext(B))) |
5822 | // Note that the extend opcodes need to all match, or if A==B they will have |
5823 | // been converted to zext(mul(sext(A), sext(A))) as it is known positive, |
5824 | // which is equally fine. |
5825 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
5826 | auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy); |
5827 | auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy); |
5828 | |
5829 | InstructionCost ExtCost = |
5830 | TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType, |
5831 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
5832 | InstructionCost MulCost = |
5833 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind); |
5834 | InstructionCost Ext2Cost = |
5835 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType, |
5836 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
5837 | |
5838 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
5839 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
5840 | |
5841 | if (RedCost.isValid() && |
5842 | RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) |
5843 | return I == RetI ? RedCost : 0; |
5844 | } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) && |
5845 | !TheLoop->isLoopInvariant(V: RedOp)) { |
5846 | // Matched reduce(ext(A)) |
5847 | bool IsUnsigned = isa<ZExtInst>(Val: RedOp); |
5848 | auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy); |
5849 | InstructionCost RedCost = TTI.getExtendedReductionCost( |
5850 | Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, |
5851 | FMF: RdxDesc.getFastMathFlags(), CostKind); |
5852 | |
5853 | InstructionCost ExtCost = |
5854 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType, |
5855 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
5856 | if (RedCost.isValid() && RedCost < BaseCost + ExtCost) |
5857 | return I == RetI ? RedCost : 0; |
5858 | } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
5859 | match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) { |
5860 | if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
5861 | Op0->getOpcode() == Op1->getOpcode() && |
5862 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) { |
5863 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
5864 | Type *Op0Ty = Op0->getOperand(i: 0)->getType(); |
5865 | Type *Op1Ty = Op1->getOperand(i: 0)->getType(); |
5866 | Type *LargestOpTy = |
5867 | Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty |
5868 | : Op0Ty; |
5869 | auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy); |
5870 | |
5871 | // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of |
5872 | // different sizes. We take the largest type as the ext to reduce, and add |
5873 | // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). |
5874 | InstructionCost ExtCost0 = TTI.getCastInstrCost( |
5875 | Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy), |
5876 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
5877 | InstructionCost ExtCost1 = TTI.getCastInstrCost( |
5878 | Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy), |
5879 | CCH: TTI::CastContextHint::None, CostKind, I: Op1); |
5880 | InstructionCost MulCost = |
5881 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
5882 | |
5883 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
5884 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
5885 | InstructionCost ExtraExtCost = 0; |
5886 | if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { |
5887 | Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; |
5888 | ExtraExtCost = TTI.getCastInstrCost( |
5889 | Opcode: ExtraExtOp->getOpcode(), Dst: ExtType, |
5890 | Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy), |
5891 | CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp); |
5892 | } |
5893 | |
5894 | if (RedCost.isValid() && |
5895 | (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) |
5896 | return I == RetI ? RedCost : 0; |
5897 | } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) { |
5898 | // Matched reduce.add(mul()) |
5899 | InstructionCost MulCost = |
5900 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
5901 | |
5902 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
5903 | IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind); |
5904 | |
5905 | if (RedCost.isValid() && RedCost < MulCost + BaseCost) |
5906 | return I == RetI ? RedCost : 0; |
5907 | } |
5908 | } |
5909 | |
5910 | return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; |
5911 | } |
5912 | |
5913 | InstructionCost |
5914 | LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, |
5915 | ElementCount VF) { |
5916 | // Calculate scalar cost only. Vectorization cost should be ready at this |
5917 | // moment. |
5918 | if (VF.isScalar()) { |
5919 | Type *ValTy = getLoadStoreType(I); |
5920 | const Align Alignment = getLoadStoreAlignment(I); |
5921 | unsigned AS = getLoadStoreAddressSpace(I); |
5922 | |
5923 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
5924 | return TTI.getAddressComputationCost(Ty: ValTy) + |
5925 | TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, |
5926 | CostKind: TTI::TCK_RecipThroughput, OpdInfo: OpInfo, I); |
5927 | } |
5928 | return getWideningCost(I, VF); |
5929 | } |
5930 | |
5931 | InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( |
5932 | Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { |
5933 | |
5934 | // There is no mechanism yet to create a scalable scalarization loop, |
5935 | // so this is currently Invalid. |
5936 | if (VF.isScalable()) |
5937 | return InstructionCost::getInvalid(); |
5938 | |
5939 | if (VF.isScalar()) |
5940 | return 0; |
5941 | |
5942 | InstructionCost Cost = 0; |
5943 | Type *RetTy = ToVectorTy(Scalar: I->getType(), EC: VF); |
5944 | if (!RetTy->isVoidTy() && |
5945 | (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) |
5946 | Cost += TTI.getScalarizationOverhead( |
5947 | Ty: cast<VectorType>(Val: RetTy), DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()), |
5948 | /*Insert*/ true, |
5949 | /*Extract*/ false, CostKind); |
5950 | |
5951 | // Some targets keep addresses scalar. |
5952 | if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing()) |
5953 | return Cost; |
5954 | |
5955 | // Some targets support efficient element stores. |
5956 | if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore()) |
5957 | return Cost; |
5958 | |
5959 | // Collect operands to consider. |
5960 | CallInst *CI = dyn_cast<CallInst>(Val: I); |
5961 | Instruction::op_range Ops = CI ? CI->args() : I->operands(); |
5962 | |
5963 | // Skip operands that do not require extraction/scalarization and do not incur |
5964 | // any overhead. |
5965 | SmallVector<Type *> Tys; |
5966 | for (auto *V : filterExtractingOperands(Ops, VF)) |
5967 | Tys.push_back(Elt: MaybeVectorizeType(Elt: V->getType(), VF)); |
5968 | return Cost + TTI.getOperandsScalarizationOverhead( |
5969 | Args: filterExtractingOperands(Ops, VF), Tys, CostKind); |
5970 | } |
5971 | |
5972 | void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { |
5973 | if (VF.isScalar()) |
5974 | return; |
5975 | NumPredStores = 0; |
5976 | for (BasicBlock *BB : TheLoop->blocks()) { |
5977 | // For each instruction in the old loop. |
5978 | for (Instruction &I : *BB) { |
5979 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
5980 | if (!Ptr) |
5981 | continue; |
5982 | |
5983 | // TODO: We should generate better code and update the cost model for |
5984 | // predicated uniform stores. Today they are treated as any other |
5985 | // predicated store (see added test cases in |
5986 | // invariant-store-vectorization.ll). |
5987 | if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF)) |
5988 | NumPredStores++; |
5989 | |
5990 | if (Legal->isUniformMemOp(I, VF)) { |
5991 | auto isLegalToScalarize = [&]() { |
5992 | if (!VF.isScalable()) |
5993 | // Scalarization of fixed length vectors "just works". |
5994 | return true; |
5995 | |
5996 | // We have dedicated lowering for unpredicated uniform loads and |
5997 | // stores. Note that even with tail folding we know that at least |
5998 | // one lane is active (i.e. generalized predication is not possible |
5999 | // here), and the logic below depends on this fact. |
6000 | if (!foldTailByMasking()) |
6001 | return true; |
6002 | |
6003 | // For scalable vectors, a uniform memop load is always |
6004 | // uniform-by-parts and we know how to scalarize that. |
6005 | if (isa<LoadInst>(Val: I)) |
6006 | return true; |
6007 | |
6008 | // A uniform store isn't necessarily uniform-by-parts
6009 | // and we can't assume scalarization. |
6010 | auto &SI = cast<StoreInst>(Val&: I); |
6011 | return TheLoop->isLoopInvariant(V: SI.getValueOperand()); |
6012 | }; |
6013 | |
6014 | const InstructionCost GatherScatterCost = |
6015 | isLegalGatherOrScatter(V: &I, VF) ? |
6016 | getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid(); |
6017 | |
6018 | // Load: Scalar load + broadcast |
6019 | // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract |
6020 | // FIXME: This cost is a significant under-estimate for tail folded |
6021 | // memory ops. |
6022 | const InstructionCost ScalarizationCost = isLegalToScalarize() ? |
6023 | getUniformMemOpCost(I: &I, VF) : InstructionCost::getInvalid(); |
6024 | |
6025 | // Choose the better solution for the current VF. Note that invalid costs
6026 | // compare as maximally large, so if both are invalid we fall back to
6027 | // CM_Scalarize with an invalid cost, which signals a vectorization abort.
6028 | if (GatherScatterCost < ScalarizationCost) |
6029 | setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost); |
6030 | else |
6031 | setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost); |
6032 | continue; |
6033 | } |
6034 | |
6035 | // We assume that widening is the best solution when possible. |
6036 | if (memoryInstructionCanBeWidened(I: &I, VF)) { |
6037 | InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF); |
6038 | int ConsecutiveStride = Legal->isConsecutivePtr( |
6039 | AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I)); |
6040 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
6041 | "Expected consecutive stride." ); |
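// A stride of -1 means the access runs backwards through memory, so the
// widened operation must reverse the lanes (CM_Widen_Reverse).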
6042 | InstWidening Decision = |
6043 | ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; |
6044 | setWideningDecision(I: &I, VF, W: Decision, Cost); |
6045 | continue; |
6046 | } |
6047 | |
6048 | // Choose between Interleaving, Gather/Scatter or Scalarization. |
6049 | InstructionCost InterleaveCost = InstructionCost::getInvalid(); |
6050 | unsigned NumAccesses = 1; |
6051 | if (isAccessInterleaved(Instr: &I)) { |
6052 | auto Group = getInterleavedAccessGroup(Instr: &I); |
6053 | assert(Group && "Failed to get an interleaved access group.");
6054 | |
6055 | // Make one decision for the whole group. |
6056 | if (getWideningDecision(I: &I, VF) != CM_Unknown) |
6057 | continue; |
6058 | |
6059 | NumAccesses = Group->getNumMembers(); |
6060 | if (interleavedAccessCanBeWidened(I: &I, VF)) |
6061 | InterleaveCost = getInterleaveGroupCost(I: &I, VF); |
6062 | } |
6063 | |
6064 | InstructionCost GatherScatterCost = |
6065 | isLegalGatherOrScatter(V: &I, VF) |
6066 | ? getGatherScatterCost(I: &I, VF) * NumAccesses |
6067 | : InstructionCost::getInvalid(); |
6068 | |
6069 | InstructionCost ScalarizationCost = |
6070 | getMemInstScalarizationCost(I: &I, VF) * NumAccesses; |
6071 | |
6072 | // Choose better solution for the current VF, |
6073 | // write down this decision and use it during vectorization. |
6074 | InstructionCost Cost; |
6075 | InstWidening Decision; |
6076 | if (InterleaveCost <= GatherScatterCost && |
6077 | InterleaveCost < ScalarizationCost) { |
6078 | Decision = CM_Interleave; |
6079 | Cost = InterleaveCost; |
6080 | } else if (GatherScatterCost < ScalarizationCost) { |
6081 | Decision = CM_GatherScatter; |
6082 | Cost = GatherScatterCost; |
6083 | } else { |
6084 | Decision = CM_Scalarize; |
6085 | Cost = ScalarizationCost; |
6086 | } |
6087 | // If the instruction belongs to an interleave group, the whole group
6088 | // receives the same decision. The whole group receives the cost, but |
6089 | // the cost will actually be assigned to one instruction. |
6090 | if (auto Group = getInterleavedAccessGroup(Instr: &I)) |
6091 | setWideningDecision(Grp: Group, VF, W: Decision, Cost); |
6092 | else |
6093 | setWideningDecision(I: &I, VF, W: Decision, Cost); |
6094 | } |
6095 | } |
6096 | |
6097 | // Make sure that any load of address and any other address computation |
6098 | // remains scalar unless there is gather/scatter support. This avoids |
6099 | // inevitable extracts into address registers, and also has the benefit of |
6100 | // activating LSR more, since that pass can't optimize vectorized |
6101 | // addresses. |
6102 | if (TTI.prefersVectorizedAddressing()) |
6103 | return; |
6104 | |
6105 | // Start with all scalar pointer uses. |
6106 | SmallPtrSet<Instruction *, 8> AddrDefs; |
6107 | for (BasicBlock *BB : TheLoop->blocks()) |
6108 | for (Instruction &I : *BB) { |
6109 | Instruction *PtrDef = |
6110 | dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I)); |
6111 | if (PtrDef && TheLoop->contains(Inst: PtrDef) && |
6112 | getWideningDecision(I: &I, VF) != CM_GatherScatter) |
6113 | AddrDefs.insert(Ptr: PtrDef); |
6114 | } |
6115 | |
6116 | // Add all instructions used to generate the addresses. |
6117 | SmallVector<Instruction *, 4> Worklist; |
6118 | append_range(C&: Worklist, R&: AddrDefs); |
6119 | while (!Worklist.empty()) { |
6120 | Instruction *I = Worklist.pop_back_val(); |
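// Only walk operands defined in the same block; PHIs and values from other
// blocks are not treated as part of the address computation.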
6121 | for (auto &Op : I->operands()) |
6122 | if (auto *InstOp = dyn_cast<Instruction>(Val&: Op)) |
6123 | if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) && |
6124 | AddrDefs.insert(Ptr: InstOp).second) |
6125 | Worklist.push_back(Elt: InstOp); |
6126 | } |
6127 | |
6128 | for (auto *I : AddrDefs) { |
6129 | if (isa<LoadInst>(Val: I)) { |
6130 | // Setting the desired widening decision should ideally be handled by
6131 | // the cost functions, but since this involves finding out whether the
6132 | // loaded register is involved in an address computation, it is instead
6133 | // changed here once we know this is the case.
6134 | InstWidening Decision = getWideningDecision(I, VF); |
6135 | if (Decision == CM_Widen || Decision == CM_Widen_Reverse) |
6136 | // Scalarize a widened load of address. |
6137 | setWideningDecision( |
6138 | I, VF, W: CM_Scalarize, |
6139 | Cost: (VF.getKnownMinValue() * |
6140 | getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)))); |
6141 | else if (auto Group = getInterleavedAccessGroup(Instr: I)) { |
6142 | // Scalarize an interleave group of address loads. |
6143 | for (unsigned I = 0; I < Group->getFactor(); ++I) { |
6144 | if (Instruction *Member = Group->getMember(Index: I)) |
6145 | setWideningDecision( |
6146 | I: Member, VF, W: CM_Scalarize, |
6147 | Cost: (VF.getKnownMinValue() * |
6148 | getMemoryInstructionCost(I: Member, VF: ElementCount::getFixed(MinVal: 1)))); |
6149 | } |
6150 | } |
6151 | } else |
6152 | // Make sure I gets scalarized and receives a cost estimate without
6153 | // scalarization overhead.
6154 | ForcedScalars[VF].insert(Ptr: I); |
6155 | } |
6156 | } |
6157 | |
6158 | void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { |
6159 | assert(!VF.isScalar() && |
6160 | "Trying to set a vectorization decision for a scalar VF" ); |
6161 | |
6162 | for (BasicBlock *BB : TheLoop->blocks()) { |
6163 | // For each instruction in the old loop. |
6164 | for (Instruction &I : *BB) { |
6165 | CallInst *CI = dyn_cast<CallInst>(Val: &I); |
6166 | |
6167 | if (!CI) |
6168 | continue; |
6169 | |
6170 | InstructionCost ScalarCost = InstructionCost::getInvalid(); |
6171 | InstructionCost VectorCost = InstructionCost::getInvalid(); |
6172 | InstructionCost IntrinsicCost = InstructionCost::getInvalid(); |
6173 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6174 | |
6175 | Function *ScalarFunc = CI->getCalledFunction(); |
6176 | Type *ScalarRetTy = CI->getType(); |
6177 | SmallVector<Type *, 4> Tys, ScalarTys; |
6178 | bool MaskRequired = Legal->isMaskRequired(I: CI); |
6179 | for (auto &ArgOp : CI->args()) |
6180 | ScalarTys.push_back(Elt: ArgOp->getType()); |
6181 | |
6182 | // Compute corresponding vector type for return value and arguments. |
6183 | Type *RetTy = ToVectorTy(Scalar: ScalarRetTy, EC: VF); |
6184 | for (Type *ScalarTy : ScalarTys) |
6185 | Tys.push_back(Elt: ToVectorTy(Scalar: ScalarTy, EC: VF)); |
6186 | |
6187 | // An in-loop reduction using an fmuladd intrinsic is a special case; |
6188 | // we don't want the normal cost for that intrinsic. |
6189 | if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI)) |
6190 | if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy, CostKind)) { |
6191 | setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr, |
6192 | IID: getVectorIntrinsicIDForCall(CI, TLI), |
6193 | MaskPos: std::nullopt, Cost: *RedCost); |
6194 | continue; |
6195 | } |
6196 | |
6197 | // Estimate cost of scalarized vector call. The source operands are |
6198 | // assumed to be vectors, so we need to extract individual elements from |
6199 | // there, execute VF scalar calls, and then gather the result into the |
6200 | // vector return value. |
6201 | InstructionCost ScalarCallCost = |
6202 | TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind); |
6203 | |
6204 | // Compute costs of unpacking argument values for the scalar calls and |
6205 | // packing the return values to a vector. |
6206 | InstructionCost ScalarizationCost = |
6207 | getScalarizationOverhead(I: CI, VF, CostKind); |
6208 | |
6209 | ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; |
6210 | |
6211 | // Find the cost of vectorizing the call, if we can find a suitable |
6212 | // vector variant of the function. |
6213 | bool UsesMask = false; |
6214 | VFInfo FuncInfo; |
6215 | Function *VecFunc = nullptr; |
6216 | // Search through any available variants for one we can use at this VF. |
6217 | for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) { |
6218 | // Must match requested VF. |
6219 | if (Info.Shape.VF != VF) |
6220 | continue; |
6221 | |
6222 | // Must take a mask argument if one is required |
6223 | if (MaskRequired && !Info.isMasked()) |
6224 | continue; |
6225 | |
6226 | // Check that all parameter kinds are supported |
6227 | bool ParamsOk = true; |
6228 | for (VFParameter Param : Info.Shape.Parameters) { |
6229 | switch (Param.ParamKind) { |
6230 | case VFParamKind::Vector: |
6231 | break; |
6232 | case VFParamKind::OMP_Uniform: { |
6233 | Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos); |
6234 | // Make sure the scalar parameter in the loop is invariant. |
6235 | if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam), |
6236 | L: TheLoop)) |
6237 | ParamsOk = false; |
6238 | break; |
6239 | } |
6240 | case VFParamKind::OMP_Linear: { |
6241 | Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos); |
6242 | // Find the stride for the scalar parameter in this loop and see if |
6243 | // it matches the stride for the variant. |
6244 | // TODO: do we need to figure out the cost of an extract to get the |
6245 | // first lane? Or do we hope that it will be folded away? |
6246 | ScalarEvolution *SE = PSE.getSE(); |
6247 | const auto *SAR = |
6248 | dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam)); |
6249 | |
6250 | if (!SAR || SAR->getLoop() != TheLoop) { |
6251 | ParamsOk = false; |
6252 | break; |
6253 | } |
6254 | |
6255 | const SCEVConstant *Step = |
6256 | dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE)); |
6257 | |
6258 | if (!Step || |
6259 | Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) |
6260 | ParamsOk = false; |
6261 | |
6262 | break; |
6263 | } |
6264 | case VFParamKind::GlobalPredicate: |
6265 | UsesMask = true; |
6266 | break; |
6267 | default: |
6268 | ParamsOk = false; |
6269 | break; |
6270 | } |
6271 | } |
6272 | |
6273 | if (!ParamsOk) |
6274 | continue; |
6275 | |
6276 | // Found a suitable candidate, stop here. |
6277 | VecFunc = CI->getModule()->getFunction(Name: Info.VectorName); |
6278 | FuncInfo = Info; |
6279 | break; |
6280 | } |
6281 | |
6282 | // Add in the cost of synthesizing a mask if one wasn't required. |
6283 | InstructionCost MaskCost = 0; |
6284 | if (VecFunc && UsesMask && !MaskRequired) |
6285 | MaskCost = TTI.getShuffleCost( |
6286 | Kind: TargetTransformInfo::SK_Broadcast, |
6287 | Tp: VectorType::get(ElementType: IntegerType::getInt1Ty( |
6288 | C&: VecFunc->getFunctionType()->getContext()), |
6289 | EC: VF)); |
6290 | |
6291 | if (TLI && VecFunc && !CI->isNoBuiltin()) |
6292 | VectorCost = |
6293 | TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind) + MaskCost; |
6294 | |
6295 | // Find the cost of an intrinsic; some targets may have instructions that |
6296 | // perform the operation without needing an actual call. |
6297 | Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); |
6298 | if (IID != Intrinsic::not_intrinsic) |
6299 | IntrinsicCost = getVectorIntrinsicCost(CI, VF); |
6300 | |
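// Pick the cheapest option. Ties favor the vector call over scalarization,
// and the intrinsic call over both, due to the <= comparisons below.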
6301 | InstructionCost Cost = ScalarCost; |
6302 | InstWidening Decision = CM_Scalarize; |
6303 | |
6304 | if (VectorCost <= Cost) { |
6305 | Cost = VectorCost; |
6306 | Decision = CM_VectorCall; |
6307 | } |
6308 | |
6309 | if (IntrinsicCost <= Cost) { |
6310 | Cost = IntrinsicCost; |
6311 | Decision = CM_IntrinsicCall; |
6312 | } |
6313 | |
6314 | setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID, |
6315 | MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost); |
6316 | } |
6317 | } |
6318 | } |
6319 | |
6320 | InstructionCost |
6321 | LoopVectorizationCostModel::getInstructionCost(Instruction *I, |
6322 | ElementCount VF) { |
6323 | // If we know that this instruction will remain uniform, check the cost of |
6324 | // the scalar version. |
6325 | if (isUniformAfterVectorization(I, VF)) |
6326 | VF = ElementCount::getFixed(MinVal: 1); |
6327 | |
6328 | if (VF.isVector() && isProfitableToScalarize(I, VF)) |
6329 | return InstsToScalarize[VF][I]; |
6330 | |
6331 | // Forced scalars do not have any scalarization overhead. |
6332 | auto ForcedScalar = ForcedScalars.find(Val: VF); |
6333 | if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { |
6334 | auto InstSet = ForcedScalar->second; |
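// A forced-scalar instruction is replicated once per lane, so its cost is
// the scalar cost scaled by VF.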
6335 | if (InstSet.count(Ptr: I)) |
6336 | return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) * |
6337 | VF.getKnownMinValue(); |
6338 | } |
6339 | |
6340 | Type *RetTy = I->getType(); |
6341 | if (canTruncateToMinimalBitwidth(I, VF)) |
6342 | RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]); |
6343 | auto SE = PSE.getSE(); |
6344 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6345 | |
6346 | auto hasSingleCopyAfterVectorization = [this](Instruction *I, |
6347 | ElementCount VF) -> bool { |
6348 | if (VF.isScalar()) |
6349 | return true; |
6350 | |
6351 | auto Scalarized = InstsToScalarize.find(Val: VF); |
6352 | assert(Scalarized != InstsToScalarize.end() && |
6353 | "VF not yet analyzed for scalarization profitability" ); |
6354 | return !Scalarized->second.count(Val: I) && |
6355 | llvm::all_of(Range: I->users(), P: [&](User *U) { |
6356 | auto *UI = cast<Instruction>(Val: U); |
6357 | return !Scalarized->second.count(Val: UI); |
6358 | }); |
6359 | }; |
6360 | (void) hasSingleCopyAfterVectorization; |
6361 | |
6362 | Type *VectorTy; |
6363 | if (isScalarAfterVectorization(I, VF)) { |
6364 | // With the exception of GEPs and PHIs, after scalarization there should |
6365 | // only be one copy of the instruction generated in the loop. This is |
6366 | // because the VF is either 1, or any instructions that need scalarizing |
6367 | // have already been dealt with by the time we get here. As a result, |
6368 | // we don't have to multiply the instruction cost by VF.
6369 | assert(I->getOpcode() == Instruction::GetElementPtr || |
6370 | I->getOpcode() == Instruction::PHI || |
6371 | (I->getOpcode() == Instruction::BitCast && |
6372 | I->getType()->isPointerTy()) || |
6373 | hasSingleCopyAfterVectorization(I, VF)); |
6374 | VectorTy = RetTy; |
6375 | } else |
6376 | VectorTy = ToVectorTy(Scalar: RetTy, EC: VF); |
6377 | |
6378 | if (VF.isVector() && VectorTy->isVectorTy() && |
6379 | !TTI.getNumberOfParts(Tp: VectorTy)) |
6380 | return InstructionCost::getInvalid(); |
6381 | |
6382 | // TODO: We need to estimate the cost of intrinsic calls. |
6383 | switch (I->getOpcode()) { |
6384 | case Instruction::GetElementPtr: |
6385 | // We mark this instruction as zero-cost because the cost of GEPs in |
6386 | // vectorized code depends on whether the corresponding memory instruction |
6387 | // is scalarized or not. Therefore, we handle GEPs with the memory |
6388 | // instruction cost. |
6389 | return 0; |
6390 | case Instruction::Br: { |
6391 | // In cases of scalarized and predicated instructions, there will be VF |
6392 | // predicated blocks in the vectorized loop. Each branch around these |
6393 | // blocks requires also an extract of its vector compare i1 element. |
6394 | // Note that the conditional branch from the loop latch will be replaced by |
6395 | // a single branch controlling the loop, so there is no extra overhead from |
6396 | // scalarization. |
6397 | bool ScalarPredicatedBB = false; |
6398 | BranchInst *BI = cast<BranchInst>(Val: I); |
6399 | if (VF.isVector() && BI->isConditional() && |
6400 | (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) || |
6401 | PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) && |
6402 | BI->getParent() != TheLoop->getLoopLatch()) |
6403 | ScalarPredicatedBB = true; |
6404 | |
6405 | if (ScalarPredicatedBB) { |
6406 | // Not possible to scalarize a scalable vector with predicated instructions.
6407 | if (VF.isScalable()) |
6408 | return InstructionCost::getInvalid(); |
6409 | // Return cost for branches around scalarized and predicated blocks. |
6410 | auto *Vec_i1Ty = |
6411 | VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF); |
6412 | return ( |
6413 | TTI.getScalarizationOverhead( |
6414 | Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), |
6415 | /*Insert*/ false, /*Extract*/ true, CostKind) + |
6416 | (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue())); |
6417 | } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) |
6418 | // The back-edge branch will remain, as will all scalar branches. |
6419 | return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind); |
6420 | else |
6421 | // This branch will be eliminated by if-conversion. |
6422 | return 0; |
6423 | // Note: We currently assume zero cost for an unconditional branch inside |
6424 | // a predicated block since it will become a fall-through, although we |
6425 | // may decide in the future to call TTI for all branches. |
6426 | } |
6427 | case Instruction::PHI: { |
6428 | auto *Phi = cast<PHINode>(Val: I); |
6429 | |
6430 | // First-order recurrences are replaced by vector shuffles inside the loop. |
6431 | if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { |
6432 | // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the |
6433 | // penultimate value of the recurrence. |
6434 | // TODO: Consider vscale_range info. |
6435 | if (VF.isScalable() && VF.getKnownMinValue() == 1) |
6436 | return InstructionCost::getInvalid(); |
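// The splice mask is {VF-1, VF, ..., 2*VF-2}: the last lane of the previous
// iteration's vector followed by the first VF-1 lanes of the current one.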
6437 | SmallVector<int> Mask(VF.getKnownMinValue()); |
6438 | std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1); |
6439 | return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice, |
6440 | Tp: cast<VectorType>(Val: VectorTy), Mask, CostKind, |
6441 | Index: VF.getKnownMinValue() - 1); |
6442 | } |
6443 | |
6444 | // Phi nodes in non-header blocks (not inductions, reductions, etc.) are |
6445 | // converted into select instructions. We require N - 1 selects per phi |
6446 | // node, where N is the number of incoming values. |
6447 | if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) |
6448 | return (Phi->getNumIncomingValues() - 1) * |
6449 | TTI.getCmpSelInstrCost( |
6450 | Opcode: Instruction::Select, ValTy: ToVectorTy(Scalar: Phi->getType(), EC: VF), |
6451 | CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF), |
6452 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
6453 | |
6454 | return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind); |
6455 | } |
6456 | case Instruction::UDiv: |
6457 | case Instruction::SDiv: |
6458 | case Instruction::URem: |
6459 | case Instruction::SRem: |
6460 | if (VF.isVector() && isPredicatedInst(I)) { |
6461 | const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); |
6462 | return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? |
6463 | ScalarCost : SafeDivisorCost; |
6464 | } |
6465 | // We've proven all lanes safe to speculate, fall through. |
6466 | [[fallthrough]]; |
6467 | case Instruction::Add: |
6468 | case Instruction::FAdd: |
6469 | case Instruction::Sub: |
6470 | case Instruction::FSub: |
6471 | case Instruction::Mul: |
6472 | case Instruction::FMul: |
6473 | case Instruction::FDiv: |
6474 | case Instruction::FRem: |
6475 | case Instruction::Shl: |
6476 | case Instruction::LShr: |
6477 | case Instruction::AShr: |
6478 | case Instruction::And: |
6479 | case Instruction::Or: |
6480 | case Instruction::Xor: { |
6481 | // If we're speculating on the stride being 1, the multiplication may |
6482 | // fold away. We can generalize this for all operations using the notion |
6483 | // of neutral elements. (TODO) |
6484 | if (I->getOpcode() == Instruction::Mul && |
6485 | (PSE.getSCEV(V: I->getOperand(i: 0))->isOne() || |
6486 | PSE.getSCEV(V: I->getOperand(i: 1))->isOne())) |
6487 | return 0; |
6488 | |
6489 | // Detect reduction patterns |
6490 | if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind)) |
6491 | return *RedCost; |
6492 | |
6493 | // Certain instructions can be cheaper to vectorize if they have a constant |
6494 | // second vector operand. One example of this are shifts on x86. |
6495 | Value *Op2 = I->getOperand(i: 1); |
6496 | auto Op2Info = TTI.getOperandInfo(V: Op2); |
6497 | if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && |
6498 | Legal->isInvariant(V: Op2)) |
6499 | Op2Info.Kind = TargetTransformInfo::OK_UniformValue; |
6500 | |
6501 | SmallVector<const Value *, 4> Operands(I->operand_values()); |
6502 | return TTI.getArithmeticInstrCost( |
6503 | Opcode: I->getOpcode(), Ty: VectorTy, CostKind, |
6504 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
6505 | Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI); |
6506 | } |
6507 | case Instruction::FNeg: { |
6508 | return TTI.getArithmeticInstrCost( |
6509 | Opcode: I->getOpcode(), Ty: VectorTy, CostKind, |
6510 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
6511 | Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
6512 | Args: I->getOperand(i: 0), CxtI: I); |
6513 | } |
6514 | case Instruction::Select: { |
6515 | SelectInst *SI = cast<SelectInst>(Val: I); |
6516 | const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition()); |
6517 | bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop)); |
6518 | |
6519 | const Value *Op0, *Op1; |
6520 | using namespace llvm::PatternMatch; |
6521 | if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) || |
6522 | match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) { |
6523 | // select x, y, false --> x & y |
6524 | // select x, true, y --> x | y |
6525 | const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0); |
6526 | const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1); |
6527 | assert(Op0->getType()->getScalarSizeInBits() == 1 && |
6528 | Op1->getType()->getScalarSizeInBits() == 1); |
6529 | |
6530 | SmallVector<const Value *, 2> Operands{Op0, Op1}; |
6531 | return TTI.getArithmeticInstrCost( |
6532 | Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy, |
6533 | CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I); |
6534 | } |
6535 | |
6536 | Type *CondTy = SI->getCondition()->getType(); |
6537 | if (!ScalarCond) |
6538 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
6539 | |
6540 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
6541 | if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition())) |
6542 | Pred = Cmp->getPredicate(); |
6543 | return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred, |
6544 | CostKind, I); |
6545 | } |
6546 | case Instruction::ICmp: |
6547 | case Instruction::FCmp: { |
6548 | Type *ValTy = I->getOperand(i: 0)->getType(); |
6549 | Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0)); |
6550 | if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF)) |
6551 | ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[Op0AsInstruction]); |
6552 | VectorTy = ToVectorTy(Scalar: ValTy, EC: VF); |
6553 | return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: nullptr, |
6554 | VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind, |
6555 | I); |
6556 | } |
6557 | case Instruction::Store: |
6558 | case Instruction::Load: { |
6559 | ElementCount Width = VF; |
6560 | if (Width.isVector()) { |
6561 | InstWidening Decision = getWideningDecision(I, VF: Width); |
6562 | assert(Decision != CM_Unknown && |
6563 | "CM decision should be taken at this point" ); |
6564 | if (getWideningCost(I, VF) == InstructionCost::getInvalid()) |
6565 | return InstructionCost::getInvalid(); |
6566 | if (Decision == CM_Scalarize) |
6567 | Width = ElementCount::getFixed(MinVal: 1); |
6568 | } |
6569 | VectorTy = ToVectorTy(Scalar: getLoadStoreType(I), EC: Width); |
6570 | return getMemoryInstructionCost(I, VF); |
6571 | } |
6572 | case Instruction::BitCast: |
6573 | if (I->getType()->isPointerTy()) |
6574 | return 0; |
6575 | [[fallthrough]]; |
6576 | case Instruction::ZExt: |
6577 | case Instruction::SExt: |
6578 | case Instruction::FPToUI: |
6579 | case Instruction::FPToSI: |
6580 | case Instruction::FPExt: |
6581 | case Instruction::PtrToInt: |
6582 | case Instruction::IntToPtr: |
6583 | case Instruction::SIToFP: |
6584 | case Instruction::UIToFP: |
6585 | case Instruction::Trunc: |
6586 | case Instruction::FPTrunc: { |
6587 | // Computes the CastContextHint from a Load/Store instruction. |
6588 | auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { |
6589 | assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
6590 | "Expected a load or a store!" ); |
6591 | |
6592 | if (VF.isScalar() || !TheLoop->contains(Inst: I)) |
6593 | return TTI::CastContextHint::Normal; |
6594 | |
6595 | switch (getWideningDecision(I, VF)) { |
6596 | case LoopVectorizationCostModel::CM_GatherScatter: |
6597 | return TTI::CastContextHint::GatherScatter; |
6598 | case LoopVectorizationCostModel::CM_Interleave: |
6599 | return TTI::CastContextHint::Interleave; |
6600 | case LoopVectorizationCostModel::CM_Scalarize: |
6601 | case LoopVectorizationCostModel::CM_Widen: |
6602 | return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked |
6603 | : TTI::CastContextHint::Normal; |
6604 | case LoopVectorizationCostModel::CM_Widen_Reverse: |
6605 | return TTI::CastContextHint::Reversed; |
6606 | case LoopVectorizationCostModel::CM_Unknown: |
6607 | llvm_unreachable("Instr did not go through cost modelling?" ); |
6608 | case LoopVectorizationCostModel::CM_VectorCall: |
6609 | case LoopVectorizationCostModel::CM_IntrinsicCall: |
6610 | llvm_unreachable_internal(msg: "Instr has invalid widening decision" ); |
6611 | } |
6612 | |
6613 | llvm_unreachable("Unhandled case!" ); |
6614 | }; |
6615 | |
6616 | unsigned Opcode = I->getOpcode(); |
6617 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
6618 | // For Trunc, the context is the only user, which must be a StoreInst. |
6619 | if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { |
6620 | if (I->hasOneUse()) |
6621 | if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin())) |
6622 | CCH = ComputeCCH(Store); |
6623 | } |
6624 | // For Z/Sext, the context is the operand, which must be a LoadInst. |
6625 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
6626 | Opcode == Instruction::FPExt) { |
6627 | if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0))) |
6628 | CCH = ComputeCCH(Load); |
6629 | } |
6630 | |
6631 | // We optimize the truncation of induction variables having constant |
6632 | // integer steps. The cost of these truncations is the same as the scalar |
6633 | // operation. |
6634 | if (isOptimizableIVTruncate(I, VF)) { |
6635 | auto *Trunc = cast<TruncInst>(Val: I); |
6636 | return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(), |
6637 | Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc); |
6638 | } |
6639 | |
6640 | // Detect reduction patterns |
6641 | if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind)) |
6642 | return *RedCost; |
6643 | |
6644 | Type *SrcScalarTy = I->getOperand(i: 0)->getType(); |
6645 | Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0)); |
6646 | if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF)) |
6647 | SrcScalarTy = |
6648 | IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]); |
6649 | Type *SrcVecTy = |
6650 | VectorTy->isVectorTy() ? ToVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy; |
6651 | |
6652 | if (canTruncateToMinimalBitwidth(I, VF)) { |
6653 | // If the result type is <= the source type, there will be no extend |
6654 | // after truncating the users to the minimal required bitwidth. |
6655 | if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && |
6656 | (I->getOpcode() == Instruction::ZExt || |
6657 | I->getOpcode() == Instruction::SExt)) |
6658 | return 0; |
6659 | } |
6660 | |
6661 | return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I); |
6662 | } |
6663 | case Instruction::Call: |
6664 | return getVectorCallCost(CI: cast<CallInst>(Val: I), VF); |
6665 | case Instruction::ExtractValue: |
6666 | return TTI.getInstructionCost(U: I, CostKind: TTI::TCK_RecipThroughput); |
6667 | case Instruction::Alloca: |
6668 | // We cannot easily widen alloca to a scalable alloca, as |
6669 | // the result would need to be a vector of pointers. |
6670 | if (VF.isScalable()) |
6671 | return InstructionCost::getInvalid(); |
6672 | [[fallthrough]]; |
6673 | default: |
6674 | // This opcode is unknown. Assume that it is the same as 'mul'. |
6675 | return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
6676 | } // end of switch. |
6677 | } |
6678 | |
6679 | void LoopVectorizationCostModel::collectValuesToIgnore() { |
6680 | // Ignore ephemeral values. |
6681 | CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore); |
6682 | |
6683 | SmallVector<Value *, 4> DeadInterleavePointerOps; |
6684 | for (BasicBlock *BB : TheLoop->blocks()) |
6685 | for (Instruction &I : *BB) { |
6686 | // Find all stores to invariant variables. Since they are going to sink |
6687 | // outside the loop, we do not need to calculate the cost for them.
6688 | StoreInst *SI; |
6689 | if ((SI = dyn_cast<StoreInst>(Val: &I)) && |
6690 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) |
6691 | ValuesToIgnore.insert(Ptr: &I); |
6692 | |
6693 | // For interleave groups, we only create a pointer for the start of the |
6694 | // interleave group. Queue up addresses of group members except the insert |
6695 | // position for further processing. |
6696 | if (isAccessInterleaved(Instr: &I)) { |
6697 | auto *Group = getInterleavedAccessGroup(Instr: &I); |
6698 | if (Group->getInsertPos() == &I) |
6699 | continue; |
6700 | Value *PointerOp = getLoadStorePointerOperand(V: &I); |
6701 | DeadInterleavePointerOps.push_back(Elt: PointerOp); |
6702 | } |
6703 | } |
6704 | |
6705 | // Mark ops feeding interleave group members as free, if they are only used |
6706 | // by other dead computations. |
6707 | for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { |
6708 | auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]); |
6709 | if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) { |
6710 | Instruction *UI = cast<Instruction>(Val: U); |
6711 | return !VecValuesToIgnore.contains(Ptr: U) && |
6712 | (!isAccessInterleaved(Instr: UI) || |
6713 | getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI); |
6714 | })) |
6715 | continue; |
6716 | VecValuesToIgnore.insert(Ptr: Op); |
6717 | DeadInterleavePointerOps.append(in_start: Op->op_begin(), in_end: Op->op_end()); |
6718 | } |
6719 | |
6720 | // Ignore type-promoting instructions we identified during reduction |
6721 | // detection. |
6722 | for (const auto &Reduction : Legal->getReductionVars()) { |
6723 | const RecurrenceDescriptor &RedDes = Reduction.second; |
6724 | const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); |
6725 | VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end()); |
6726 | } |
6727 | // Ignore type-casting instructions we identified during induction |
6728 | // detection. |
6729 | for (const auto &Induction : Legal->getInductionVars()) { |
6730 | const InductionDescriptor &IndDes = Induction.second; |
6731 | const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); |
6732 | VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end()); |
6733 | } |
6734 | } |
6735 | |
6736 | void LoopVectorizationCostModel::collectInLoopReductions() { |
6737 | for (const auto &Reduction : Legal->getReductionVars()) { |
6738 | PHINode *Phi = Reduction.first; |
6739 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
6740 | |
6741 | // We don't collect reductions that are type promoted (yet). |
6742 | if (RdxDesc.getRecurrenceType() != Phi->getType()) |
6743 | continue; |
6744 | |
6745 | // If the target would prefer this reduction to happen "in-loop", then we |
6746 | // want to record it as such. |
6747 | unsigned Opcode = RdxDesc.getOpcode(); |
6748 | if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && |
6749 | !TTI.preferInLoopReduction(Opcode, Ty: Phi->getType(), |
6750 | Flags: TargetTransformInfo::ReductionFlags())) |
6751 | continue; |
6752 | |
6753 | // Check that we can correctly put the reductions into the loop, by |
6754 | // finding the chain of operations that leads from the phi to the loop |
6755 | // exit value. |
6756 | SmallVector<Instruction *, 4> ReductionOperations = |
6757 | RdxDesc.getReductionOpChain(Phi, L: TheLoop); |
6758 | bool InLoop = !ReductionOperations.empty(); |
6759 | |
6760 | if (InLoop) { |
6761 | InLoopReductions.insert(Ptr: Phi); |
6762 | // Add the elements to InLoopReductionImmediateChains for cost modelling. |
6763 | Instruction *LastChain = Phi; |
6764 | for (auto *I : ReductionOperations) { |
6765 | InLoopReductionImmediateChains[I] = LastChain; |
6766 | LastChain = I; |
6767 | } |
6768 | } |
6769 | LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop" ) |
6770 | << " reduction for phi: " << *Phi << "\n" ); |
6771 | } |
6772 | } |
6773 | |
6774 | VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, |
6775 | DebugLoc DL, const Twine &Name) { |
6776 | assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && |
6777 | Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate" ); |
6778 | return tryInsertInstruction( |
6779 | VPI: new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); |
6780 | } |
6781 | |
6782 | // This function will select a scalable VF if the target supports scalable |
6783 | // vectors and a fixed one otherwise. |
6784 | // TODO: we could return a pair of values that specify the max VF and |
6785 | // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
6786 | // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment |
6787 | // doesn't have a cost model that can choose which plan to execute if |
6788 | // more than one is generated. |
6789 | static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, |
6790 | LoopVectorizationCostModel &CM) { |
6791 | unsigned WidestType; |
6792 | std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes(); |
6793 | |
6794 | TargetTransformInfo::RegisterKind RegKind = |
6795 | TTI.enableScalableVectorization() |
6796 | ? TargetTransformInfo::RGK_ScalableVector |
6797 | : TargetTransformInfo::RGK_FixedWidthVector; |
6798 | |
6799 | TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind); |
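// Number of widest-type lanes per register, e.g. a 128-bit register with a
// widest type of 32 bits gives VF = 4 (vscale x 4 if the register is scalable).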
6800 | unsigned N = RegSize.getKnownMinValue() / WidestType; |
6801 | return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable()); |
6802 | } |
6803 | |
6804 | VectorizationFactor |
6805 | LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { |
6806 | ElementCount VF = UserVF; |
6807 | // Outer loop handling: outer loops may require CFG and instruction-level
6808 | // transformations before even evaluating whether vectorization is profitable. |
6809 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
6810 | // the vectorization pipeline. |
6811 | if (!OrigLoop->isInnermost()) { |
6812 | // If the user doesn't provide a vectorization factor, determine a |
6813 | // reasonable one. |
6814 | if (UserVF.isZero()) { |
6815 | VF = determineVPlanVF(TTI, CM); |
6816 | LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n" ); |
6817 | |
6818 | // Make sure we have a VF > 1 for stress testing. |
6819 | if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { |
6820 | LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " |
6821 | << "overriding computed VF.\n" ); |
6822 | VF = ElementCount::getFixed(MinVal: 4); |
6823 | } |
6824 | } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && |
6825 | !ForceTargetSupportsScalableVectors) { |
6826 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " |
6827 | << "not supported by the target.\n" ); |
6828 | reportVectorizationFailure( |
6829 | DebugMsg: "Scalable vectorization requested but not supported by the target" , |
6830 | OREMsg: "the scalable user-specified vectorization width for outer-loop " |
6831 | "vectorization cannot be used because the target does not support " |
6832 | "scalable vectors." , |
6833 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop: OrigLoop); |
6834 | return VectorizationFactor::Disabled(); |
6835 | } |
6836 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled." ); |
6837 | assert(isPowerOf2_32(VF.getKnownMinValue()) && |
6838 | "VF needs to be a power of two" ); |
6839 | LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "" ) |
6840 | << "VF " << VF << " to build VPlans.\n" ); |
6841 | buildVPlans(MinVF: VF, MaxVF: VF); |
6842 | |
6843 | // For VPlan build stress testing, we bail out after VPlan construction. |
6844 | if (VPlanBuildStressTest) |
6845 | return VectorizationFactor::Disabled(); |
6846 | |
6847 | return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; |
6848 | } |
6849 | |
6850 | LLVM_DEBUG( |
6851 | dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " |
6852 | "VPlan-native path.\n" ); |
6853 | return VectorizationFactor::Disabled(); |
6854 | } |
6855 | |
6856 | std::optional<VectorizationFactor> |
6857 | LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { |
6858 | assert(OrigLoop->isInnermost() && "Inner loop expected." ); |
6859 | CM.collectValuesToIgnore(); |
6860 | CM.collectElementTypesForWidening(); |
6861 | |
6862 | FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); |
6863 | if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6864 | return std::nullopt; |
6865 | |
6866 | // Invalidate interleave groups if all blocks of loop will be predicated. |
6867 | if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) && |
6868 | !useMaskedInterleavedAccesses(TTI)) { |
6869 | LLVM_DEBUG( |
6870 | dbgs() |
6871 | << "LV: Invalidate all interleaved groups due to fold-tail by masking " |
6872 | "which requires masked-interleaved support.\n" ); |
6873 | if (CM.InterleaveInfo.invalidateGroups()) |
6874 | // Invalidating interleave groups also requires invalidating all decisions |
6875 | // based on them, which includes widening decisions and uniform and scalar |
6876 | // values. |
6877 | CM.invalidateCostModelingDecisions(); |
6878 | } |
6879 | |
6880 | if (CM.foldTailByMasking()) |
6881 | Legal->prepareToFoldTailByMasking(); |
6882 | |
6883 | ElementCount MaxUserVF = |
6884 | UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; |
6885 | bool UserVFIsLegal = ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF); |
6886 | if (!UserVF.isZero() && UserVFIsLegal) { |
6887 | assert(isPowerOf2_32(UserVF.getKnownMinValue()) && |
6888 | "VF needs to be a power of two" ); |
6889 | // Collect the instructions (and their associated costs) that will be more |
6890 | // profitable to scalarize. |
6891 | CM.collectInLoopReductions(); |
6892 | if (CM.selectUserVectorizationFactor(UserVF)) { |
6893 | LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n" ); |
6894 | buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF); |
6895 | if (!hasPlanWithVF(VF: UserVF)) { |
6896 | LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF |
6897 | << ".\n" ); |
6898 | return std::nullopt; |
6899 | } |
6900 | |
6901 | LLVM_DEBUG(printPlans(dbgs())); |
6902 | return {{UserVF, 0, 0}}; |
6903 | } else |
6904 | reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs." , |
6905 | ORETag: "InvalidCost" , ORE, TheLoop: OrigLoop); |
6906 | } |
6907 | |
6908 | // Collect the Vectorization Factor Candidates. |
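// Candidates are powers of two from 1 up to the maximum feasible fixed-width
// and scalable VFs, respectively.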
6909 | SmallVector<ElementCount> VFCandidates; |
6910 | for (auto VF = ElementCount::getFixed(MinVal: 1); |
6911 | ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2) |
6912 | VFCandidates.push_back(Elt: VF); |
6913 | for (auto VF = ElementCount::getScalable(MinVal: 1); |
6914 | ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2) |
6915 | VFCandidates.push_back(Elt: VF); |
6916 | |
6917 | CM.collectInLoopReductions(); |
6918 | for (const auto &VF : VFCandidates) { |
6919 | // Collect Uniform and Scalar instructions after vectorization with VF. |
6920 | CM.collectUniformsAndScalars(VF); |
6921 | |
6922 | // Collect the instructions (and their associated costs) that will be more |
6923 | // profitable to scalarize. |
6924 | if (VF.isVector()) |
6925 | CM.collectInstsToScalarize(VF); |
6926 | } |
6927 | |
6928 | buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF); |
6929 | buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF); |
6930 | |
6931 | LLVM_DEBUG(printPlans(dbgs())); |
6932 | if (VPlans.empty()) |
6933 | return std::nullopt; |
6934 | if (all_of(Range&: VPlans, |
6935 | P: [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); })) |
6936 | return VectorizationFactor::Disabled(); |
6937 | |
6938 | // Select the optimal vectorization factor according to the legacy cost-model. |
6939 | // This is now only used to verify the decisions by the new VPlan-based |
6940 | // cost-model and will be retired once the VPlan-based cost-model is |
6941 | // stabilized. |
6942 | VectorizationFactor VF = selectVectorizationFactor(); |
6943 | assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
       "when vectorizing, the scalar cost must be non-zero.");
6944 | if (!hasPlanWithVF(VF: VF.Width)) { |
6945 | LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width |
6946 | << ".\n" ); |
6947 | return std::nullopt; |
6948 | } |
6949 | return VF; |
6950 | } |
6951 | |
6952 | InstructionCost VPCostContext::getLegacyCost(Instruction *UI, |
6953 | ElementCount VF) const { |
6954 | return CM.getInstructionCost(I: UI, VF); |
6955 | } |
6956 | |
6957 | bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { |
6958 | return CM.ValuesToIgnore.contains(Ptr: UI) || |
6959 | (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) || |
6960 | SkipCostComputation.contains(Ptr: UI); |
6961 | } |
6962 | |
6963 | InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, |
6964 | ElementCount VF) const { |
6965 | InstructionCost Cost = 0; |
6966 | LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); |
6967 | VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM); |
6968 | |
6969 | // Cost modeling for inductions is inaccurate in the legacy cost model |
6970 | // compared to the recipes that are generated. To match here initially during |
6971 | // VPlan cost model bring up directly use the induction costs from the legacy |
6972 | // cost model. Note that we do this as pre-processing; the VPlan may not have |
6973 | // any recipes associated with the original induction increment instruction |
6974 | // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute |
6975 | // the cost of induction phis and increments (both that are represented by |
6976 | // recipes and those that are not), to avoid distinguishing between them here, |
6977 | // and skip all recipes that represent induction phis and increments (the |
6978 | // former case) later on, if they exist, to avoid counting them twice. |
6979 | // Similarly we pre-compute the cost of any optimized truncates. |
6980 | // TODO: Switch to more accurate costing based on VPlan. |
6981 | for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { |
6982 | Instruction *IVInc = cast<Instruction>( |
6983 | Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch())); |
6984 | SmallVector<Instruction *> IVInsts = {IV, IVInc}; |
6985 | for (User *U : IV->users()) { |
6986 | auto *CI = cast<Instruction>(Val: U); |
6987 | if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF)) |
6988 | continue; |
6989 | IVInsts.push_back(Elt: CI); |
6990 | } |
6991 | for (Instruction *IVInst : IVInsts) { |
6992 | if (!CostCtx.SkipCostComputation.insert(Ptr: IVInst).second) |
6993 | continue; |
6994 | InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF); |
6995 | LLVM_DEBUG({ |
6996 | dbgs() << "Cost of " << InductionCost << " for VF " << VF |
6997 | << ": induction instruction " << *IVInst << "\n" ; |
6998 | }); |
6999 | Cost += InductionCost; |
7000 | } |
7001 | } |
7002 | |
7003 | // Compute the cost of all exiting conditions of the loop using the legacy
7004 | // cost model. This is to match the legacy behavior, which adds the cost of
7005 | // all exit conditions. Note that this over-estimates the cost, as there will
7006 | // be a single condition to control the vector loop.
7007 | SmallVector<BasicBlock *> Exiting; |
7008 | CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting); |
7009 | SetVector<Instruction *> ExitInstrs; |
7010 | // Collect all exit conditions. |
7011 | for (BasicBlock *EB : Exiting) { |
7012 | auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator()); |
7013 | if (!Term) |
7014 | continue; |
7015 | if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) { |
7016 | ExitInstrs.insert(X: CondI); |
7017 | } |
7018 | } |
7019 | // Compute the cost of all instructions only feeding the exit conditions. |
7020 | for (unsigned I = 0; I != ExitInstrs.size(); ++I) { |
7021 | Instruction *CondI = ExitInstrs[I]; |
7022 | if (!OrigLoop->contains(Inst: CondI) || |
7023 | !CostCtx.SkipCostComputation.insert(Ptr: CondI).second) |
7024 | continue; |
7025 | Cost += CostCtx.getLegacyCost(UI: CondI, VF); |
7026 | for (Value *Op : CondI->operands()) { |
7027 | auto *OpI = dyn_cast<Instruction>(Val: Op); |
7028 | if (!OpI || any_of(Range: OpI->users(), P: [&ExitInstrs, this](User *U) { |
7029 | return OrigLoop->contains(BB: cast<Instruction>(Val: U)->getParent()) && |
7030 | !ExitInstrs.contains(key: cast<Instruction>(Val: U)); |
7031 | })) |
7032 | continue; |
7033 | ExitInstrs.insert(X: OpI); |
7034 | } |
7035 | } |
7036 | |
7037 | // The legacy cost model has special logic to compute the cost of in-loop |
7038 | // reductions, which may be smaller than the sum of all instructions involved |
7039 | // in the reduction. For AnyOf reductions, VPlan codegen may remove the select |
7040 | // which the legacy cost model uses to assign cost. Pre-compute their costs |
7041 | // for now. |
7042 | // TODO: Switch to costing based on VPlan once the logic has been ported. |
7043 | for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { |
7044 | if (!CM.isInLoopReduction(Phi: RedPhi) && |
7045 | !RecurrenceDescriptor::isAnyOfRecurrenceKind( |
7046 | Kind: RdxDesc.getRecurrenceKind())) |
7047 | continue; |
7048 | |
7049 | // AnyOf reduction codegen may remove the select. To match the legacy cost |
7050 | // model, pre-compute the cost for AnyOf reductions here. |
7051 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind( |
7052 | Kind: RdxDesc.getRecurrenceKind())) { |
7053 | auto *Select = cast<SelectInst>(Val: *find_if( |
7054 | Range: RedPhi->users(), P: [](User *U) { return isa<SelectInst>(Val: U); })); |
7055 | assert(!CostCtx.SkipCostComputation.contains(Select) && |
7056 | "reduction op visited multiple times" ); |
7057 | CostCtx.SkipCostComputation.insert(Ptr: Select); |
7058 | auto ReductionCost = CostCtx.getLegacyCost(UI: Select, VF); |
7059 | LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF |
7060 | << ":\n any-of reduction " << *Select << "\n" ); |
7061 | Cost += ReductionCost; |
7062 | continue; |
7063 | } |
7064 | |
7065 | const auto &ChainOps = RdxDesc.getReductionOpChain(Phi: RedPhi, L: OrigLoop); |
7066 | SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(), |
7067 | ChainOps.end()); |
7068 | // Also include the operands of instructions in the chain, as the cost-model |
7069 | // may mark extends as free. |
7070 | for (auto *ChainOp : ChainOps) { |
7071 | for (Value *Op : ChainOp->operands()) { |
7072 | if (auto *I = dyn_cast<Instruction>(Val: Op)) |
7073 | ChainOpsAndOperands.insert(X: I); |
7074 | } |
7075 | } |
7076 | |
7077 | // Pre-compute the cost for I, if it has a reduction pattern cost. |
7078 | for (Instruction *I : ChainOpsAndOperands) { |
7079 | auto ReductionCost = CM.getReductionPatternCost( |
7080 | I, VF, Ty: ToVectorTy(Scalar: I->getType(), EC: VF), CostKind: TTI::TCK_RecipThroughput); |
7081 | if (!ReductionCost) |
7082 | continue; |
7083 | |
7084 | assert(!CostCtx.SkipCostComputation.contains(I) && |
7085 | "reduction op visited multiple times" ); |
7086 | CostCtx.SkipCostComputation.insert(Ptr: I); |
7087 | LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF |
7088 | << ":\n in-loop reduction " << *I << "\n" ); |
7089 | Cost += *ReductionCost; |
7090 | } |
7091 | } |
7092 | |
7093 | // Pre-compute the costs for branches except for the backedge, as the number |
7094 | // of replicate regions in a VPlan may not directly match the number of |
7095 | // branches, which would lead to different decisions. |
7096 | // TODO: Compute cost of branches for each replicate region in the VPlan, |
7097 | // which is more accurate than the legacy cost model. |
7098 | for (BasicBlock *BB : OrigLoop->blocks()) { |
7099 | if (BB == OrigLoop->getLoopLatch()) |
7100 | continue; |
7101 | CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator()); |
7102 | auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF); |
7103 | Cost += BranchCost; |
7104 | } |
7105 | // Now compute and add the VPlan-based cost. |
7106 | Cost += Plan.cost(VF, Ctx&: CostCtx); |
7107 | LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n" ); |
7108 | return Cost; |
7109 | } |
7110 | |
7111 | VPlan &LoopVectorizationPlanner::getBestPlan() const { |
7112 | // If there is a single VPlan with a single VF, return it directly. |
7113 | VPlan &FirstPlan = *VPlans[0]; |
7114 | if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1) |
7115 | return FirstPlan; |
7116 | |
7117 | VPlan *BestPlan = &FirstPlan; |
7118 | ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1); |
7119 | assert(hasPlanWithVF(ScalarVF) && |
7120 | "More than a single plan/VF w/o any plan having scalar VF" ); |
7121 | |
7122 | // TODO: Compute scalar cost using VPlan-based cost model. |
7123 | InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF); |
7124 | VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost); |
7125 | |
7126 | bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
7127 | if (ForceVectorization) { |
7128 | // Ignore scalar width, because the user explicitly wants vectorization. |
7129 | // Initialize cost to max so that VF = 2 is, at least, chosen during cost |
7130 | // evaluation. |
7131 | BestFactor.Cost = InstructionCost::getMax(); |
7132 | } |
7133 | |
7134 | for (auto &P : VPlans) { |
7135 | for (ElementCount VF : P->vectorFactors()) { |
7136 | if (VF.isScalar()) |
7137 | continue; |
7138 | if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) { |
7139 | LLVM_DEBUG( |
7140 | dbgs() |
7141 | << "LV: Not considering vector loop of width " << VF |
7142 | << " because it will not generate any vector instructions.\n" ); |
7143 | continue; |
7144 | } |
7145 | |
7146 | InstructionCost Cost = cost(Plan&: *P, VF); |
7147 | VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); |
7148 | if (isMoreProfitable(A: CurrentFactor, B: BestFactor)) { |
7149 | BestFactor = CurrentFactor; |
7150 | BestPlan = &*P; |
7151 | } |
7152 | } |
7153 | } |
7154 | BestPlan->setVF(BestFactor.Width); |
7155 | return *BestPlan; |
7156 | } |
7157 | |
7158 | VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { |
7159 | assert(count_if(VPlans, |
7160 | [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == |
7161 | 1 && |
7162 | "Best VF has not a single VPlan." ); |
7163 | |
7164 | for (const VPlanPtr &Plan : VPlans) { |
7165 | if (Plan->hasVF(VF)) |
7166 | return *Plan.get(); |
7167 | } |
7168 | llvm_unreachable("No plan found!" ); |
7169 | } |
7170 | |
7171 | static void AddRuntimeUnrollDisableMetaData(Loop *L) { |
7172 | SmallVector<Metadata *, 4> MDs; |
7173 | // Reserve first location for self reference to the LoopID metadata node. |
7174 | MDs.push_back(Elt: nullptr); |
7175 | bool IsUnrollMetadata = false; |
7176 | MDNode *LoopID = L->getLoopID(); |
7177 | if (LoopID) { |
7178 | // First find existing loop unrolling disable metadata. |
7179 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
7180 | auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I: i)); |
7181 | if (MD) { |
7182 | const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0)); |
7183 | IsUnrollMetadata = |
7184 | S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable" ); |
7185 | } |
7186 | MDs.push_back(Elt: LoopID->getOperand(I: i)); |
7187 | } |
7188 | } |
7189 | |
7190 | if (!IsUnrollMetadata) { |
7191 | // Add runtime unroll disable metadata. |
7192 | LLVMContext &Context = L->getHeader()->getContext(); |
7193 | SmallVector<Metadata *, 1> DisableOperands; |
7194 | DisableOperands.push_back( |
7195 | Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable" )); |
7196 | MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands); |
7197 | MDs.push_back(Elt: DisableNode); |
7198 | MDNode *NewLoopID = MDNode::get(Context, MDs); |
7199 | // Set operand 0 to refer to the loop id itself. |
7200 | NewLoopID->replaceOperandWith(I: 0, New: NewLoopID); |
7201 | L->setLoopID(NewLoopID); |
7202 | } |
7203 | } |
7204 | |
7205 | // Check if \p RedResult is a ComputeReductionResult instruction, and if it is |
7206 | // create a merge phi node for it and add it to \p ReductionResumeValues. |
7207 | static void createAndCollectMergePhiForReduction( |
7208 | VPInstruction *RedResult, |
7209 | DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, |
7210 | VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, |
7211 | bool VectorizingEpilogue) { |
7212 | if (!RedResult || |
7213 | RedResult->getOpcode() != VPInstruction::ComputeReductionResult) |
7214 | return; |
7215 | |
7216 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: RedResult->getOperand(N: 0)); |
7217 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
7218 | |
7219 | Value *FinalValue = |
7220 | State.get(Def: RedResult, Instance: VPIteration(State.UF - 1, VPLane::getFirstLane())); |
7221 | auto *ResumePhi = |
7222 | dyn_cast<PHINode>(Val: PhiR->getStartValue()->getUnderlyingValue()); |
7223 | if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( |
7224 | Kind: RdxDesc.getRecurrenceKind())) { |
7225 | auto *Cmp = cast<ICmpInst>(Val: PhiR->getStartValue()->getUnderlyingValue()); |
7226 | assert(Cmp->getPredicate() == CmpInst::ICMP_NE); |
7227 | assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue()); |
7228 | ResumePhi = cast<PHINode>(Val: Cmp->getOperand(i_nocapture: 0)); |
7229 | } |
7230 | assert((!VectorizingEpilogue || ResumePhi) && |
7231 | "when vectorizing the epilogue loop, we need a resume phi from main " |
7232 | "vector loop" ); |
7233 | |
7234 | // TODO: bc.merge.rdx should not be created here, instead it should be |
7235 | // modeled in VPlan. |
7236 |   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7237 | // Create a phi node that merges control-flow from the backedge-taken check |
7238 | // block and the middle block. |
7239 | auto *BCBlockPhi = |
7240 | PHINode::Create(Ty: FinalValue->getType(), NumReservedValues: 2, NameStr: "bc.merge.rdx" , |
7241 | InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator()); |
7242 | |
7243 | // If we are fixing reductions in the epilogue loop then we should already |
7244 | // have created a bc.merge.rdx Phi after the main vector body. Ensure that |
7245 | // we carry over the incoming values correctly. |
7246 | for (auto *Incoming : predecessors(BB: LoopScalarPreHeader)) { |
7247 | if (Incoming == LoopMiddleBlock) |
7248 | BCBlockPhi->addIncoming(V: FinalValue, BB: Incoming); |
7249 | else if (ResumePhi && is_contained(Range: ResumePhi->blocks(), Element: Incoming)) |
7250 | BCBlockPhi->addIncoming(V: ResumePhi->getIncomingValueForBlock(BB: Incoming), |
7251 | BB: Incoming); |
7252 | else |
7253 | BCBlockPhi->addIncoming(V: RdxDesc.getRecurrenceStartValue(), BB: Incoming); |
7254 | } |
7255 | |
7256 | auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
7257 | // TODO: This fixup should instead be modeled in VPlan. |
7258 | // Fix the scalar loop reduction variable with the incoming reduction sum |
7259 | // from the vector body and from the backedge value. |
7260 | int IncomingEdgeBlockIdx = |
7261 | OrigPhi->getBasicBlockIndex(BB: OrigLoop->getLoopLatch()); |
7262 | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index" ); |
7263 | // Pick the other block. |
7264 | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); |
7265 | OrigPhi->setIncomingValue(i: SelfEdgeBlockIdx, V: BCBlockPhi); |
7266 | Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); |
7267 | OrigPhi->setIncomingValue(i: IncomingEdgeBlockIdx, V: LoopExitInst); |
7268 | |
7269 | ReductionResumeValues[&RdxDesc] = BCBlockPhi; |
7270 | } |
7271 | |
7272 | std::pair<DenseMap<const SCEV *, Value *>, |
7273 | DenseMap<const RecurrenceDescriptor *, Value *>> |
7274 | LoopVectorizationPlanner::executePlan( |
7275 | ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, |
7276 | InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, |
7277 | const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { |
7278 | assert(BestVPlan.hasVF(BestVF) && |
7279 | "Trying to execute plan with unsupported VF" ); |
7280 | assert(BestVPlan.hasUF(BestUF) && |
7281 | "Trying to execute plan with unsupported UF" ); |
7282 | assert( |
7283 | (IsEpilogueVectorization || !ExpandedSCEVs) && |
7284 | "expanded SCEVs to reuse can only be used during epilogue vectorization" ); |
7285 | (void)IsEpilogueVectorization; |
7286 | |
7287 | VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE); |
7288 | |
7289 | LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF |
7290 | << ", UF=" << BestUF << '\n'); |
7291 | BestVPlan.setName("Final VPlan" ); |
7292 | LLVM_DEBUG(BestVPlan.dump()); |
7293 | |
7294 | // Perform the actual loop transformation. |
7295 | VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, |
7296 | OrigLoop->getHeader()->getContext()); |
7297 | |
7298 | // 0. Generate SCEV-dependent code into the preheader, including TripCount, |
7299 | // before making any changes to the CFG. |
7300 | if (!BestVPlan.getPreheader()->empty()) { |
7301 | State.CFG.PrevBB = OrigLoop->getLoopPreheader(); |
7302 | State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); |
7303 | BestVPlan.getPreheader()->execute(State: &State); |
7304 | } |
7305 | if (!ILV.getTripCount()) |
7306 | ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Instance: {0, 0})); |
7307 | else |
7308 | assert(IsEpilogueVectorization && "should only re-use the existing trip " |
7309 | "count during epilogue vectorization" ); |
7310 | |
7311 | // 1. Set up the skeleton for vectorization, including vector pre-header and |
7312 | // middle block. The vector loop is created during VPlan execution. |
7313 | Value *CanonicalIVStartValue; |
7314 | std::tie(args&: State.CFG.PrevBB, args&: CanonicalIVStartValue) = |
7315 | ILV.createVectorizedLoopSkeleton(ExpandedSCEVs: ExpandedSCEVs ? *ExpandedSCEVs |
7316 | : State.ExpandedSCEVs); |
7317 | #ifdef EXPENSIVE_CHECKS |
7318 | assert(DT->verify(DominatorTree::VerificationLevel::Fast)); |
7319 | #endif |
7320 | |
7321 | // Only use noalias metadata when using memory checks guaranteeing no overlap |
7322 | // across all iterations. |
7323 | const LoopAccessInfo *LAI = ILV.Legal->getLAI(); |
7324 | std::unique_ptr<LoopVersioning> LVer = nullptr; |
7325 | if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && |
7326 | !LAI->getRuntimePointerChecking()->getDiffChecks()) { |
7327 | |
7328 | // We currently don't use LoopVersioning for the actual loop cloning but we |
7329 | // still use it to add the noalias metadata. |
7330 | // TODO: Find a better way to re-use LoopVersioning functionality to add |
7331 | // metadata. |
7332 | LVer = std::make_unique<LoopVersioning>( |
7333 | args: *LAI, args: LAI->getRuntimePointerChecking()->getChecks(), args&: OrigLoop, args&: LI, args&: DT, |
7334 | args: PSE.getSE()); |
7335 | State.LVer = &*LVer; |
7336 | State.LVer->prepareNoAliasMetadata(); |
7337 | } |
7338 | |
7339 | ILV.printDebugTracesAtStart(); |
7340 | |
7341 | //===------------------------------------------------===// |
7342 | // |
7343 |   // Notice: any optimization or new instruction that goes
7344 |   // into the code below should also be implemented in
7345 |   // the cost-model.
7346 | // |
7347 | //===------------------------------------------------===// |
7348 | |
7349 | // 2. Copy and widen instructions from the old loop into the new loop. |
7350 | BestVPlan.prepareToExecute(TripCount: ILV.getTripCount(), |
7351 | VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: nullptr), |
7352 | CanonicalIVStartValue, State); |
7353 | |
7354 | BestVPlan.execute(State: &State); |
7355 | |
7356 | // 2.5 Collect reduction resume values. |
7357 | DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; |
7358 | auto *ExitVPBB = |
7359 | cast<VPBasicBlock>(Val: BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); |
7360 | for (VPRecipeBase &R : *ExitVPBB) { |
7361 | createAndCollectMergePhiForReduction( |
7362 | RedResult: dyn_cast<VPInstruction>(Val: &R), ReductionResumeValues, State, OrigLoop, |
7363 | LoopMiddleBlock: State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue: ExpandedSCEVs); |
7364 | } |
7365 | |
7366 | // 2.6. Maintain Loop Hints |
7367 | // Keep all loop hints from the original loop on the vector loop (we'll |
7368 | // replace the vectorizer-specific hints below). |
7369 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
7370 | |
7371 | std::optional<MDNode *> VectorizedLoopID = |
7372 | makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll, |
7373 | LLVMLoopVectorizeFollowupVectorized}); |
7374 | |
7375 |   VPBasicBlock *HeaderVPBB =
7376 | BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); |
7377 | Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]); |
7378 | if (VectorizedLoopID) |
7379 | L->setLoopID(*VectorizedLoopID); |
7380 | else { |
7381 | // Keep all loop hints from the original loop on the vector loop (we'll |
7382 | // replace the vectorizer-specific hints below). |
7383 | if (MDNode *LID = OrigLoop->getLoopID()) |
7384 | L->setLoopID(LID); |
7385 | |
7386 | LoopVectorizeHints Hints(L, true, *ORE); |
7387 | Hints.setAlreadyVectorized(); |
7388 | } |
7389 | TargetTransformInfo::UnrollingPreferences UP; |
7390 | TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); |
7391 | if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) |
7392 | AddRuntimeUnrollDisableMetaData(L); |
7393 | |
7394 | // 3. Fix the vectorized code: take care of header phi's, live-outs, |
7395 | // predication, updating analyses. |
7396 | ILV.fixVectorizedLoop(State, Plan&: BestVPlan); |
7397 | |
7398 | ILV.printDebugTracesAtEnd(); |
7399 | |
7400 | // 4. Adjust branch weight of the branch in the middle block. |
7401 | auto *MiddleTerm = |
7402 | cast<BranchInst>(Val: State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); |
7403 | if (MiddleTerm->isConditional() && |
7404 | hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) { |
7405 | // Assume that `Count % VectorTripCount` is equally distributed. |
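    // E.g., for VF=4 and UF=2 the vector step is 8, so one of the 8 equally
    // likely remainder values takes one successor and the remaining 7 take
    // the other, giving weights {1, 7}.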
7406 | unsigned TripCount = State.UF * State.VF.getKnownMinValue(); |
7407 | assert(TripCount > 0 && "trip count should not be zero" ); |
7408 | const uint32_t Weights[] = {1, TripCount - 1}; |
7409 | setBranchWeights(I&: *MiddleTerm, Weights, /*IsExpected=*/false); |
7410 | } |
7411 | |
7412 | return {State.ExpandedSCEVs, ReductionResumeValues}; |
7413 | } |
7414 | |
7415 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
7416 | void LoopVectorizationPlanner::printPlans(raw_ostream &O) { |
7417 | for (const auto &Plan : VPlans) |
7418 | if (PrintVPlansInDotFormat) |
7419 | Plan->printDOT(O); |
7420 | else |
7421 | Plan->print(O); |
7422 | } |
7423 | #endif |
7424 | |
7425 | //===--------------------------------------------------------------------===// |
7426 | // EpilogueVectorizerMainLoop |
7427 | //===--------------------------------------------------------------------===// |
7428 | |
7429 | /// This function is partially responsible for generating the control flow |
7430 | /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. |
7431 | std::pair<BasicBlock *, Value *> |
7432 | EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( |
7433 | const SCEV2ValueTy &ExpandedSCEVs) { |
7434 | createVectorLoopSkeleton(Prefix: "" ); |
7435 | |
7436 | // Generate the code to check the minimum iteration count of the vector |
7437 | // epilogue (see below). |
7438 | EPI.EpilogueIterationCountCheck = |
7439 | emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true); |
7440 | EPI.EpilogueIterationCountCheck->setName("iter.check" ); |
7441 | |
7442 | // Generate the code to check any assumptions that we've made for SCEV |
7443 | // expressions. |
7444 | EPI.SCEVSafetyCheck = emitSCEVChecks(Bypass: LoopScalarPreHeader); |
7445 | |
7446 | // Generate the code that checks at runtime if arrays overlap. We put the |
7447 | // checks into a separate block to make the more common case of few elements |
7448 | // faster. |
7449 | EPI.MemSafetyCheck = emitMemRuntimeChecks(Bypass: LoopScalarPreHeader); |
7450 | |
7451 | // Generate the iteration count check for the main loop, *after* the check |
7452 | // for the epilogue loop, so that the path-length is shorter for the case |
7453 | // that goes directly through the vector epilogue. The longer-path length for |
7454 | // the main loop is compensated for, by the gain from vectorizing the larger |
7455 | // trip count. Note: the branch will get updated later on when we vectorize |
7456 | // the epilogue. |
7457 | EPI.MainLoopIterationCountCheck = |
7458 | emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false); |
7459 | |
7460 | // Generate the induction variable. |
7461 | EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader); |
7462 | |
7463 | // Skip induction resume value creation here because they will be created in |
7464 | // the second pass for the scalar loop. The induction resume values for the |
7465 | // inductions in the epilogue loop are created before executing the plan for |
7466 | // the epilogue loop. |
7467 | |
7468 | return {LoopVectorPreHeader, nullptr}; |
7469 | } |
7470 | |
7471 | void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { |
7472 | LLVM_DEBUG({ |
7473 | dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" |
7474 | << "Main Loop VF:" << EPI.MainLoopVF |
7475 | << ", Main Loop UF:" << EPI.MainLoopUF |
7476 | << ", Epilogue Loop VF:" << EPI.EpilogueVF |
7477 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7478 | }); |
7479 | } |
7480 | |
7481 | void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { |
7482 | DEBUG_WITH_TYPE(VerboseDebug, { |
7483 | dbgs() << "intermediate fn:\n" |
7484 | << *OrigLoop->getHeader()->getParent() << "\n" ; |
7485 | }); |
7486 | } |
7487 | |
7488 | BasicBlock * |
7489 | EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, |
7490 | bool ForEpilogue) { |
7491 | assert(Bypass && "Expected valid bypass basic block." ); |
7492 | ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; |
7493 | unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; |
7494 | Value *Count = getTripCount(); |
7495 | // Reuse existing vector loop preheader for TC checks. |
7496 | // Note that new preheader block is generated for vector loop. |
7497 | BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
7498 | IRBuilder<> Builder(TCCheckBlock->getTerminator()); |
7499 | |
7500 | // Generate code to check if the loop's trip count is less than VF * UF of the |
7501 | // main vector loop. |
7502 | auto P = Cost->requiresScalarEpilogue(IsVectorizing: ForEpilogue ? EPI.EpilogueVF.isVector() |
7503 | : VF.isVector()) |
7504 | ? ICmpInst::ICMP_ULE |
7505 | : ICmpInst::ICMP_ULT; |
7506 | |
7507 | Value *CheckMinIters = Builder.CreateICmp( |
7508 | P, LHS: Count, RHS: createStepForVF(B&: Builder, Ty: Count->getType(), VF: VFactor, Step: UFactor), |
7509 | Name: "min.iters.check" ); |
7510 | |
7511 | if (!ForEpilogue) |
7512 | TCCheckBlock->setName("vector.main.loop.iter.check" ); |
7513 | |
7514 | // Create new preheader for vector loop. |
7515 | LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), |
7516 | DT, LI, MSSAU: nullptr, BBName: "vector.ph" ); |
7517 | |
7518 | if (ForEpilogue) { |
7519 | assert(DT->properlyDominates(DT->getNode(TCCheckBlock), |
7520 | DT->getNode(Bypass)->getIDom()) && |
7521 | "TC check is expected to dominate Bypass" ); |
7522 | |
7523 | // Update dominator for Bypass. |
7524 | DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock); |
7525 | LoopBypassBlocks.push_back(Elt: TCCheckBlock); |
7526 | |
7527 | // Save the trip count so we don't have to regenerate it in the |
7528 | // vec.epilog.iter.check. This is safe to do because the trip count |
7529 | // generated here dominates the vector epilog iter check. |
7530 | EPI.TripCount = Count; |
7531 | } |
7532 | |
7533 | BranchInst &BI = |
7534 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
7535 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) |
7536 | setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false); |
7537 | ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI); |
7538 | |
7539 | return TCCheckBlock; |
7540 | } |
7541 | |
7542 | //===--------------------------------------------------------------------===// |
7543 | // EpilogueVectorizerEpilogueLoop |
7544 | //===--------------------------------------------------------------------===// |
7545 | |
7546 | /// This function is partially responsible for generating the control flow |
7547 | /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. |
7548 | std::pair<BasicBlock *, Value *> |
7549 | EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( |
7550 | const SCEV2ValueTy &ExpandedSCEVs) { |
7551 | createVectorLoopSkeleton(Prefix: "vec.epilog." ); |
7552 | |
7553 |   // Now, compare the remaining count; if there aren't enough iterations to
7554 |   // execute the vectorized epilogue, skip to the scalar part.
7555 | LoopVectorPreHeader->setName("vec.epilog.ph" ); |
7556 | BasicBlock *VecEpilogueIterationCountCheck = |
7557 | SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->begin(), DT, LI, |
7558 | MSSAU: nullptr, BBName: "vec.epilog.iter.check" , Before: true); |
7559 | emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader, |
7560 | Insert: VecEpilogueIterationCountCheck); |
7561 | |
7562 | // Adjust the control flow taking the state info from the main loop |
7563 | // vectorization into account. |
7564 | assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && |
7565 | "expected this to be saved from the previous pass." ); |
7566 | EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( |
7567 | From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader); |
7568 | |
7569 | DT->changeImmediateDominator(BB: LoopVectorPreHeader, |
7570 | NewBB: EPI.MainLoopIterationCountCheck); |
7571 | |
7572 | EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( |
7573 | From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader); |
7574 | |
7575 | if (EPI.SCEVSafetyCheck) |
7576 | EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( |
7577 | From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader); |
7578 | if (EPI.MemSafetyCheck) |
7579 | EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( |
7580 | From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader); |
7581 | |
7582 | DT->changeImmediateDominator( |
7583 | BB: VecEpilogueIterationCountCheck, |
7584 | NewBB: VecEpilogueIterationCountCheck->getSinglePredecessor()); |
7585 | |
7586 | DT->changeImmediateDominator(BB: LoopScalarPreHeader, |
7587 | NewBB: EPI.EpilogueIterationCountCheck); |
7588 | if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector())) |
7589 | // If there is an epilogue which must run, there's no edge from the |
7590 | // middle block to exit blocks and thus no need to update the immediate |
7591 | // dominator of the exit blocks. |
7592 | DT->changeImmediateDominator(BB: LoopExitBlock, |
7593 | NewBB: EPI.EpilogueIterationCountCheck); |
7594 | |
7595 | // Keep track of bypass blocks, as they feed start values to the induction and |
7596 | // reduction phis in the scalar loop preheader. |
7597 | if (EPI.SCEVSafetyCheck) |
7598 | LoopBypassBlocks.push_back(Elt: EPI.SCEVSafetyCheck); |
7599 | if (EPI.MemSafetyCheck) |
7600 | LoopBypassBlocks.push_back(Elt: EPI.MemSafetyCheck); |
7601 | LoopBypassBlocks.push_back(Elt: EPI.EpilogueIterationCountCheck); |
7602 | |
7603 | // The vec.epilog.iter.check block may contain Phi nodes from inductions or |
7604 | // reductions which merge control-flow from the latch block and the middle |
7605 | // block. Update the incoming values here and move the Phi into the preheader. |
7606 | SmallVector<PHINode *, 4> PhisInBlock; |
7607 | for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) |
7608 | PhisInBlock.push_back(Elt: &Phi); |
7609 | |
7610 | for (PHINode *Phi : PhisInBlock) { |
7611 | Phi->moveBefore(MovePos: LoopVectorPreHeader->getFirstNonPHI()); |
7612 | Phi->replaceIncomingBlockWith( |
7613 | Old: VecEpilogueIterationCountCheck->getSinglePredecessor(), |
7614 | New: VecEpilogueIterationCountCheck); |
7615 | |
7616 | // If the phi doesn't have an incoming value from the |
7617 | // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming |
7618 | // value and also those from other check blocks. This is needed for |
7619 | // reduction phis only. |
7620 | if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) { |
7621 | return EPI.EpilogueIterationCountCheck == IncB; |
7622 | })) |
7623 | continue; |
7624 | Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck); |
7625 | if (EPI.SCEVSafetyCheck) |
7626 | Phi->removeIncomingValue(BB: EPI.SCEVSafetyCheck); |
7627 | if (EPI.MemSafetyCheck) |
7628 | Phi->removeIncomingValue(BB: EPI.MemSafetyCheck); |
7629 | } |
7630 | |
7631 | // Generate a resume induction for the vector epilogue and put it in the |
7632 | // vector epilogue preheader |
7633 | Type *IdxTy = Legal->getWidestInductionType(); |
7634 | PHINode *EPResumeVal = PHINode::Create(Ty: IdxTy, NumReservedValues: 2, NameStr: "vec.epilog.resume.val" ); |
7635 | EPResumeVal->insertBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt()); |
7636 | EPResumeVal->addIncoming(V: EPI.VectorTripCount, BB: VecEpilogueIterationCountCheck); |
7637 | EPResumeVal->addIncoming(V: ConstantInt::get(Ty: IdxTy, V: 0), |
7638 | BB: EPI.MainLoopIterationCountCheck); |
7639 | |
7640 | // Generate induction resume values. These variables save the new starting |
7641 | // indexes for the scalar loop. They are used to test if there are any tail |
7642 | // iterations left once the vector loop has completed. |
7643 | // Note that when the vectorized epilogue is skipped due to iteration count |
7644 | // check, then the resume value for the induction variable comes from |
7645 | // the trip count of the main vector loop, hence passing the AdditionalBypass |
7646 | // argument. |
7647 | createInductionResumeValues(ExpandedSCEVs, |
7648 | AdditionalBypass: {VecEpilogueIterationCountCheck, |
7649 | EPI.VectorTripCount} /* AdditionalBypass */); |
7650 | |
7651 | return {LoopVectorPreHeader, EPResumeVal}; |
7652 | } |
7653 | |
7654 | BasicBlock * |
7655 | EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( |
7656 | BasicBlock *Bypass, BasicBlock *Insert) { |
7657 | |
7658 | assert(EPI.TripCount && |
7659 |          "Expected trip count to have been saved in the first pass.");
7660 | assert( |
7661 | (!isa<Instruction>(EPI.TripCount) || |
7662 | DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && |
7663 | "saved trip count does not dominate insertion point." ); |
7664 | Value *TC = EPI.TripCount; |
7665 | IRBuilder<> Builder(Insert->getTerminator()); |
7666 | Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining" ); |
7667 | |
7668 | // Generate code to check if the loop's trip count is less than VF * UF of the |
7669 | // vector epilogue loop. |
7670 | auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()) |
7671 | ? ICmpInst::ICMP_ULE |
7672 | : ICmpInst::ICMP_ULT; |
7673 | |
7674 | Value *CheckMinIters = |
7675 | Builder.CreateICmp(P, LHS: Count, |
7676 | RHS: createStepForVF(B&: Builder, Ty: Count->getType(), |
7677 | VF: EPI.EpilogueVF, Step: EPI.EpilogueUF), |
7678 | Name: "min.epilog.iters.check" ); |
7679 | |
7680 | BranchInst &BI = |
7681 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
7682 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) { |
7683 | unsigned MainLoopStep = UF * VF.getKnownMinValue(); |
7684 | unsigned EpilogueLoopStep = |
7685 | EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); |
7686 | // We assume the remaining `Count` is equally distributed in |
7687 | // [0, MainLoopStep) |
7688 | // So the probability for `Count < EpilogueLoopStep` should be |
7689 | // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep |
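    // E.g., MainLoopStep = 8 and EpilogueLoopStep = 4 give an estimated skip
    // probability of 4/8, i.e. weights {4, 4}.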
7690 | unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep); |
7691 | const uint32_t Weights[] = {EstimatedSkipCount, |
7692 | MainLoopStep - EstimatedSkipCount}; |
7693 | setBranchWeights(I&: BI, Weights, /*IsExpected=*/false); |
7694 | } |
7695 | ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI); |
7696 | LoopBypassBlocks.push_back(Elt: Insert); |
7697 | return Insert; |
7698 | } |
7699 | |
7700 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { |
7701 | LLVM_DEBUG({ |
7702 | dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" |
7703 | << "Epilogue Loop VF:" << EPI.EpilogueVF |
7704 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7705 | }); |
7706 | } |
7707 | |
7708 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { |
7709 | DEBUG_WITH_TYPE(VerboseDebug, { |
7710 | dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n" ; |
7711 | }); |
7712 | } |
7713 | |
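/// Test \p Predicate on Range.Start and clamp Range.End down to the first
/// larger VF for which the predicate yields a different result, so that every
/// VF remaining in the range shares the decision made at Range.Start. Returns
/// the predicate's value at Range.Start.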
7714 | bool LoopVectorizationPlanner::getDecisionAndClampRange( |
7715 | const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { |
7716 | assert(!Range.isEmpty() && "Trying to test an empty VF range." ); |
7717 | bool PredicateAtRangeStart = Predicate(Range.Start); |
7718 | |
7719 | for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) |
7720 | if (Predicate(TmpVF) != PredicateAtRangeStart) { |
7721 | Range.End = TmpVF; |
7722 | break; |
7723 | } |
7724 | |
7725 | return PredicateAtRangeStart; |
7726 | } |
7727 | |
7728 | /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, |
7729 | /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range |
7730 | /// of VF's starting at a given VF and extending it as much as possible. Each |
7731 | /// vectorization decision can potentially shorten this sub-range during |
7732 | /// buildVPlan(). |
7733 | void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, |
7734 | ElementCount MaxVF) { |
7735 | auto MaxVFTimes2 = MaxVF * 2; |
7736 | for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) { |
7737 | VFRange SubRange = {VF, MaxVFTimes2}; |
7738 | VPlans.push_back(Elt: buildVPlan(Range&: SubRange)); |
7739 | VF = SubRange.End; |
7740 | } |
7741 | } |
7742 | |
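/// Map IR operands to VPValues: an operand whose defining instruction already
/// has a recipe is mapped to that recipe's single VPValue; all other operands
/// become (or reuse) live-ins of the plan.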
7743 | iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> |
7744 | VPRecipeBuilder::mapToVPValues(User::op_range Operands) { |
7745 | std::function<VPValue *(Value *)> Fn = [this](Value *Op) { |
7746 | if (auto *I = dyn_cast<Instruction>(Val: Op)) { |
7747 | if (auto *R = Ingredient2Recipe.lookup(Val: I)) |
7748 | return R->getVPSingleValue(); |
7749 | } |
7750 | return Plan.getOrAddLiveIn(V: Op); |
7751 | }; |
7752 | return map_range(C&: Operands, F: Fn); |
7753 | } |
7754 | |
7755 | VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { |
7756 | assert(is_contained(predecessors(Dst), Src) && "Invalid edge" ); |
7757 | |
7758 | // Look for cached value. |
7759 | std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); |
7760 | EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Val: Edge); |
7761 | if (ECEntryIt != EdgeMaskCache.end()) |
7762 | return ECEntryIt->second; |
7763 | |
7764 | VPValue *SrcMask = getBlockInMask(BB: Src); |
7765 | |
7766 | // The terminator has to be a branch inst! |
7767 | BranchInst *BI = dyn_cast<BranchInst>(Val: Src->getTerminator()); |
7768 | assert(BI && "Unexpected terminator found" ); |
7769 | |
7770 | if (!BI->isConditional() || BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1)) |
7771 | return EdgeMaskCache[Edge] = SrcMask; |
7772 | |
7773 | // If source is an exiting block, we know the exit edge is dynamically dead |
7774 | // in the vector loop, and thus we don't need to restrict the mask. Avoid |
7775 | // adding uses of an otherwise potentially dead instruction. |
7776 | if (OrigLoop->isLoopExiting(BB: Src)) |
7777 | return EdgeMaskCache[Edge] = SrcMask; |
7778 | |
7779 | VPValue *EdgeMask = getVPValueOrAddLiveIn(V: BI->getCondition(), Plan); |
7780 | assert(EdgeMask && "No Edge Mask found for condition" ); |
7781 | |
7782 | if (BI->getSuccessor(i: 0) != Dst) |
7783 | EdgeMask = Builder.createNot(Operand: EdgeMask, DL: BI->getDebugLoc()); |
7784 | |
7785 | if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. |
7786 | // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask |
7787 | // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' |
7788 | // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. |
7789 | EdgeMask = Builder.createLogicalAnd(LHS: SrcMask, RHS: EdgeMask, DL: BI->getDebugLoc()); |
7790 | } |
7791 | |
7792 | return EdgeMaskCache[Edge] = EdgeMask; |
7793 | } |
7794 | |
7795 | VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { |
7796 | assert(is_contained(predecessors(Dst), Src) && "Invalid edge" ); |
7797 | |
7798 | // Look for cached value. |
7799 | std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); |
7800 | EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Val: Edge); |
7801 | assert(ECEntryIt != EdgeMaskCache.end() && |
7802 | "looking up mask for edge which has not been created" ); |
7803 | return ECEntryIt->second; |
7804 | } |
7805 | |
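/// Create and cache the mask for the loop header block. Without tail folding
/// this is the nullptr "all-true" mask; with tail folding it is the compare
/// "widened canonical IV <= backedge-taken count", inserted at the start of
/// the header block.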
7806 | void VPRecipeBuilder::createHeaderMask() {
7807 |   BasicBlock *Header = OrigLoop->getHeader();
7808 | |
7809 | // When not folding the tail, use nullptr to model all-true mask. |
7810 | if (!CM.foldTailByMasking()) { |
7811 | BlockMaskCache[Header] = nullptr; |
7812 | return; |
7813 | } |
7814 | |
7815 | // Introduce the early-exit compare IV <= BTC to form header block mask. |
7816 | // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by |
7817 | // constructing the desired canonical IV in the header block as its first |
7818 | // non-phi instructions. |
7819 | |
7820 |   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7821 | auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); |
7822 | auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); |
7823 | HeaderVPBB->insert(Recipe: IV, InsertPt: NewInsertionPoint); |
7824 | |
7825 | VPBuilder::InsertPointGuard Guard(Builder); |
7826 | Builder.setInsertPoint(TheBB: HeaderVPBB, IP: NewInsertionPoint); |
7827 | VPValue *BlockMask = nullptr; |
7828 | VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); |
7829 | BlockMask = Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: IV, B: BTC); |
7830 | BlockMaskCache[Header] = BlockMask; |
7831 | } |
7832 | |
7833 | VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { |
7834 | // Return the cached value. |
7835 | BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(Val: BB); |
7836 | assert(BCEntryIt != BlockMaskCache.end() && |
7837 | "Trying to access mask for block without one." ); |
7838 | return BCEntryIt->second; |
7839 | } |
7840 | |
7841 | void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { |
7842 | assert(OrigLoop->contains(BB) && "Block is not a part of a loop" ); |
7843 | assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed" ); |
7844 | assert(OrigLoop->getHeader() != BB && |
7845 | "Loop header must have cached block mask" ); |
7846 | |
7847 | // All-one mask is modelled as no-mask following the convention for masked |
7848 | // load/store/gather/scatter. Initialize BlockMask to no-mask. |
7849 | VPValue *BlockMask = nullptr; |
7850 | // This is the block mask. We OR all incoming edges. |
7851 | for (auto *Predecessor : predecessors(BB)) { |
7852 | VPValue *EdgeMask = createEdgeMask(Src: Predecessor, Dst: BB); |
7853 | if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. |
7854 | BlockMaskCache[BB] = EdgeMask; |
7855 | return; |
7856 | } |
7857 | |
7858 | if (!BlockMask) { // BlockMask has its initialized nullptr value. |
7859 | BlockMask = EdgeMask; |
7860 | continue; |
7861 | } |
7862 | |
7863 | BlockMask = Builder.createOr(LHS: BlockMask, RHS: EdgeMask, DL: {}); |
7864 | } |
7865 | |
7866 | BlockMaskCache[BB] = BlockMask; |
7867 | } |
7868 | |
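/// Check whether the memory access \p I should be widened for the VFs in
/// \p Range (clamping the range where the decision changes) and, if so, build
/// a widened load/store recipe, adding the block-in mask when required and a
/// vector-pointer recipe for consecutive (possibly reversed) accesses.
/// Returns nullptr if the access will not be widened.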
7869 | VPWidenMemoryRecipe * |
7870 | VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, |
7871 | VFRange &Range) { |
7872 | assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
7873 | "Must be called with either a load or store" ); |
7874 | |
7875 | auto willWiden = [&](ElementCount VF) -> bool { |
7876 | LoopVectorizationCostModel::InstWidening Decision = |
7877 | CM.getWideningDecision(I, VF); |
7878 | assert(Decision != LoopVectorizationCostModel::CM_Unknown && |
7879 | "CM decision should be taken at this point." ); |
7880 | if (Decision == LoopVectorizationCostModel::CM_Interleave) |
7881 | return true; |
7882 | if (CM.isScalarAfterVectorization(I, VF) || |
7883 | CM.isProfitableToScalarize(I, VF)) |
7884 | return false; |
7885 | return Decision != LoopVectorizationCostModel::CM_Scalarize; |
7886 | }; |
7887 | |
7888 | if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: willWiden, Range)) |
7889 | return nullptr; |
7890 | |
7891 | VPValue *Mask = nullptr; |
7892 | if (Legal->isMaskRequired(I)) |
7893 | Mask = getBlockInMask(BB: I->getParent()); |
7894 | |
7895 | // Determine if the pointer operand of the access is either consecutive or |
7896 | // reverse consecutive. |
7897 | LoopVectorizationCostModel::InstWidening Decision = |
7898 | CM.getWideningDecision(I, VF: Range.Start); |
7899 | bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; |
7900 | bool Consecutive = |
7901 | Reverse || Decision == LoopVectorizationCostModel::CM_Widen; |
7902 | |
7903 | VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1]; |
7904 | if (Consecutive) { |
7905 | auto *GEP = dyn_cast<GetElementPtrInst>( |
7906 | Val: Ptr->getUnderlyingValue()->stripPointerCasts()); |
7907 | auto *VectorPtr = new VPVectorPointerRecipe( |
7908 | Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false, |
7909 | I->getDebugLoc()); |
7910 | Builder.getInsertBlock()->appendRecipe(Recipe: VectorPtr); |
7911 | Ptr = VectorPtr; |
7912 | } |
7913 | if (LoadInst *Load = dyn_cast<LoadInst>(Val: I)) |
7914 | return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, |
7915 | I->getDebugLoc()); |
7916 | |
7917 | StoreInst *Store = cast<StoreInst>(Val: I); |
7918 | return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, |
7919 | Reverse, I->getDebugLoc()); |
7920 | } |
7921 | |
7922 | /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7923 | /// insert a recipe to expand the step for the induction recipe. |
7924 | static VPWidenIntOrFpInductionRecipe * |
7925 | createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, |
7926 | VPValue *Start, const InductionDescriptor &IndDesc, |
7927 | VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { |
7928 | assert(IndDesc.getStartValue() == |
7929 | Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); |
7930 | assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && |
7931 | "step must be loop invariant" ); |
7932 | |
7933 | VPValue *Step = |
7934 | vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE); |
7935 | if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) { |
7936 | return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); |
7937 | } |
7938 | assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here" ); |
7939 | return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); |
7940 | } |
7941 | |
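/// Try to build a widened induction recipe for the header phi \p Phi: int/fp
/// inductions become VPWidenIntOrFpInductionRecipes and pointer inductions
/// become VPWidenPointerInductionRecipes (the latter flagged scalar-only when
/// the cost model keeps the phi scalar across the clamped range). Returns
/// nullptr if \p Phi is not an induction.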
7942 | VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( |
7943 | PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { |
7944 | |
7945 | // Check if this is an integer or fp induction. If so, build the recipe that |
7946 | // produces its scalar and vector values. |
7947 | if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) |
7948 | return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan, |
7949 | SE&: *PSE.getSE(), OrigLoop&: *OrigLoop); |
7950 | |
7951 | // Check if this is pointer induction. If so, build the recipe for it. |
7952 | if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { |
7953 | VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(), |
7954 | SE&: *PSE.getSE()); |
7955 | return new VPWidenPointerInductionRecipe( |
7956 | Phi, Operands[0], Step, *II, |
7957 | LoopVectorizationPlanner::getDecisionAndClampRange( |
7958 | Predicate: [&](ElementCount VF) { |
7959 | return CM.isScalarAfterVectorization(I: Phi, VF); |
7960 | }, |
7961 | Range)); |
7962 | } |
7963 | return nullptr; |
7964 | } |
7965 | |
7966 | VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( |
7967 | TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { |
7968 | // Optimize the special case where the source is a constant integer |
7969 | // induction variable. Notice that we can only optimize the 'trunc' case |
7970 | // because (a) FP conversions lose precision, (b) sext/zext may wrap, and |
7971 | // (c) other casts depend on pointer size. |
7972 | |
7973 | // Determine whether \p K is a truncation based on an induction variable that |
7974 | // can be optimized. |
7975 | auto isOptimizableIVTruncate = |
7976 | [&](Instruction *K) -> std::function<bool(ElementCount)> { |
7977 | return [=](ElementCount VF) -> bool { |
7978 | return CM.isOptimizableIVTruncate(I: K, VF); |
7979 | }; |
7980 | }; |
7981 | |
7982 | if (LoopVectorizationPlanner::getDecisionAndClampRange( |
7983 | Predicate: isOptimizableIVTruncate(I), Range)) { |
7984 | |
7985 | auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0)); |
7986 | const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); |
7987 | VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue()); |
7988 | return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(), |
7989 | OrigLoop&: *OrigLoop); |
7990 | } |
7991 | return nullptr; |
7992 | } |
7993 | |
7994 | VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, |
7995 | ArrayRef<VPValue *> Operands) { |
7996 | unsigned NumIncoming = Phi->getNumIncomingValues(); |
7997 | |
7998 | // We know that all PHIs in non-header blocks are converted into selects, so |
7999 | // we don't have to worry about the insertion order and we can just use the |
8000 | // builder. At this point we generate the predication tree. There may be |
8001 | // duplications since this is a simple recursive scan, but future |
8002 | // optimizations will clean it up. |
8003 | // TODO: At the moment the first mask is always skipped, but it would be |
8004 | // better to skip the most expensive mask. |
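  // The operands end up laid out as (value0, value1, mask1, value2, mask2,
  // ...); the mask of the first, skipped incoming value is omitted.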
8005 | SmallVector<VPValue *, 2> OperandsWithMask; |
8006 | |
8007 | for (unsigned In = 0; In < NumIncoming; In++) { |
8008 | OperandsWithMask.push_back(Elt: Operands[In]); |
8009 | VPValue *EdgeMask = |
8010 | getEdgeMask(Src: Phi->getIncomingBlock(i: In), Dst: Phi->getParent()); |
8011 | if (!EdgeMask) { |
8012 | assert(In == 0 && "Both null and non-null edge masks found" ); |
8013 | assert(all_equal(Operands) && |
8014 | "Distinct incoming values with one having a full mask" ); |
8015 | break; |
8016 | } |
8017 | if (In == 0) |
8018 | continue; |
8019 | OperandsWithMask.push_back(Elt: EdgeMask); |
8020 | } |
8021 | return new VPBlendRecipe(Phi, OperandsWithMask); |
8022 | } |
8023 | |
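/// Try to widen the call \p CI into a VPWidenCallRecipe, using either a vector
/// intrinsic or a vectorized library variant (inserting a mask operand if the
/// chosen variant requires one). Returns nullptr if the call must be
/// scalarized (e.g. because it requires predication) or is a no-op intrinsic
/// such as assume or the lifetime markers.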
8024 | VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, |
8025 | ArrayRef<VPValue *> Operands, |
8026 | VFRange &Range) { |
8027 | bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( |
8028 | Predicate: [this, CI](ElementCount VF) { |
8029 | return CM.isScalarWithPredication(I: CI, VF); |
8030 | }, |
8031 | Range); |
8032 | |
8033 | if (IsPredicated) |
8034 | return nullptr; |
8035 | |
8036 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
8037 | if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || |
8038 | ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || |
8039 | ID == Intrinsic::pseudoprobe || |
8040 | ID == Intrinsic::experimental_noalias_scope_decl)) |
8041 | return nullptr; |
8042 | |
8043 | SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size())); |
8044 | Ops.push_back(Elt: Operands.back()); |
8045 | |
8046 | // Is it beneficial to perform intrinsic call compared to lib call? |
8047 | bool ShouldUseVectorIntrinsic = |
8048 | ID && LoopVectorizationPlanner::getDecisionAndClampRange( |
8049 | Predicate: [&](ElementCount VF) -> bool { |
8050 | return CM.getCallWideningDecision(CI, VF).Kind == |
8051 | LoopVectorizationCostModel::CM_IntrinsicCall; |
8052 | }, |
8053 | Range); |
8054 | if (ShouldUseVectorIntrinsic) |
8055 | return new VPWidenCallRecipe(CI, make_range(x: Ops.begin(), y: Ops.end()), ID, |
8056 | CI->getDebugLoc()); |
8057 | |
8058 | Function *Variant = nullptr; |
8059 | std::optional<unsigned> MaskPos; |
8060 |   // Is it better to call a vectorized version of the function than to
8061 |   // scalarize the call?
8062 | auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( |
8063 | Predicate: [&](ElementCount VF) -> bool { |
8064 | // The following case may be scalarized depending on the VF. |
8065 | // The flag shows whether we can use a usual Call for vectorized |
8066 | // version of the instruction. |
8067 | |
8068 | // If we've found a variant at a previous VF, then stop looking. A |
8069 | // vectorized variant of a function expects input in a certain shape |
8070 | // -- basically the number of input registers, the number of lanes |
8071 | // per register, and whether there's a mask required. |
8072 | // We store a pointer to the variant in the VPWidenCallRecipe, so |
8073 | // once we have an appropriate variant it's only valid for that VF. |
8074 | // This will force a different vplan to be generated for each VF that |
8075 | // finds a valid variant. |
8076 | if (Variant) |
8077 | return false; |
8078 | LoopVectorizationCostModel::CallWideningDecision Decision = |
8079 | CM.getCallWideningDecision(CI, VF); |
8080 | if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { |
8081 | Variant = Decision.Variant; |
8082 | MaskPos = Decision.MaskPos; |
8083 | return true; |
8084 | } |
8085 | |
8086 | return false; |
8087 | }, |
8088 | Range); |
8089 | if (ShouldUseVectorCall) { |
8090 | if (MaskPos.has_value()) { |
8091 | // We have 2 cases that would require a mask: |
8092 | // 1) The block needs to be predicated, either due to a conditional |
8093 | // in the scalar loop or use of an active lane mask with |
8094 | // tail-folding, and we use the appropriate mask for the block. |
8095 | // 2) No mask is required for the block, but the only available |
8096 | // vector variant at this VF requires a mask, so we synthesize an |
8097 | // all-true mask. |
8098 | VPValue *Mask = nullptr; |
8099 | if (Legal->isMaskRequired(I: CI)) |
8100 | Mask = getBlockInMask(BB: CI->getParent()); |
8101 | else |
8102 | Mask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue( |
8103 | Ty: IntegerType::getInt1Ty(C&: Variant->getFunctionType()->getContext()))); |
8104 | |
8105 | Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask); |
8106 | } |
8107 | |
8108 | return new VPWidenCallRecipe(CI, make_range(x: Ops.begin(), y: Ops.end()), |
8109 | Intrinsic::not_intrinsic, CI->getDebugLoc(), |
8110 | Variant); |
8111 | } |
8112 | |
8113 | return nullptr; |
8114 | } |
8115 | |
8116 | bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { |
8117 | assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && |
8118 | !isa<StoreInst>(I) && "Instruction should have been handled earlier" ); |
8119 | // Instruction should be widened, unless it is scalar after vectorization, |
8120 | // scalarization is profitable or it is predicated. |
8121 | auto WillScalarize = [this, I](ElementCount VF) -> bool { |
8122 | return CM.isScalarAfterVectorization(I, VF) || |
8123 | CM.isProfitableToScalarize(I, VF) || |
8124 | CM.isScalarWithPredication(I, VF); |
8125 | }; |
8126 | return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize, |
8127 | Range); |
8128 | } |
8129 | |
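/// Widen \p I with a VPWidenRecipe if its opcode is supported. For divisions
/// and remainders that require predication, the divisor is first blended with
/// 1 via a select on the block mask so that masked-off lanes do not divide by
/// a poison or zero value.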
8130 | VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, |
8131 | ArrayRef<VPValue *> Operands, |
8132 | VPBasicBlock *VPBB) { |
8133 | switch (I->getOpcode()) { |
8134 | default: |
8135 | return nullptr; |
8136 | case Instruction::SDiv: |
8137 | case Instruction::UDiv: |
8138 | case Instruction::SRem: |
8139 | case Instruction::URem: { |
8140 | // If not provably safe, use a select to form a safe divisor before widening the |
8141 | // div/rem operation itself. Otherwise fall through to general handling below. |
8142 | if (CM.isPredicatedInst(I)) { |
8143 | SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); |
8144 | VPValue *Mask = getBlockInMask(BB: I->getParent()); |
8145 | VPValue *One = |
8146 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false)); |
8147 | auto *SafeRHS = Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: I->getDebugLoc()); |
8148 | Ops[1] = SafeRHS; |
8149 | return new VPWidenRecipe(*I, make_range(x: Ops.begin(), y: Ops.end())); |
8150 | } |
8151 | [[fallthrough]]; |
8152 | } |
8153 | case Instruction::Add: |
8154 | case Instruction::And: |
8155 | case Instruction::AShr: |
8156 | case Instruction::FAdd: |
8157 | case Instruction::FCmp: |
8158 | case Instruction::FDiv: |
8159 | case Instruction::FMul: |
8160 | case Instruction::FNeg: |
8161 | case Instruction::FRem: |
8162 | case Instruction::FSub: |
8163 | case Instruction::ICmp: |
8164 | case Instruction::LShr: |
8165 | case Instruction::Mul: |
8166 | case Instruction::Or: |
8167 | case Instruction::Select: |
8168 | case Instruction::Shl: |
8169 | case Instruction::Sub: |
8170 | case Instruction::Xor: |
8171 | case Instruction::Freeze: |
8172 | return new VPWidenRecipe(*I, make_range(x: Operands.begin(), y: Operands.end())); |
8173 | }; |
8174 | } |
8175 | |
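/// Add the backedge values to the header phi recipes collected in PhisToFix:
/// each phi gets the recipe result corresponding to its incoming value from
/// the original loop latch added as its backedge operand.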
8176 | void VPRecipeBuilder::fixHeaderPhis() {
8177 | BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); |
8178 | for (VPHeaderPHIRecipe *R : PhisToFix) { |
8179 | auto *PN = cast<PHINode>(Val: R->getUnderlyingValue()); |
8180 | VPRecipeBase *IncR = |
8181 | getRecipe(I: cast<Instruction>(Val: PN->getIncomingValueForBlock(BB: OrigLatch))); |
8182 | R->addOperand(Operand: IncR->getVPSingleValue()); |
8183 | } |
8184 | } |
8185 | |
8186 | VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, |
8187 | VFRange &Range) { |
8188 | bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( |
8189 | Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, |
8190 | Range); |
8191 | |
8192 | bool IsPredicated = CM.isPredicatedInst(I); |
8193 | |
8194 | // Even if the instruction is not marked as uniform, there are certain |
8195 | // intrinsic calls that can be effectively treated as such, so we check for |
8196 | // them here. Conservatively, we only do this for scalable vectors, since |
8197 | // for fixed-width VFs we can always fall back on full scalarization. |
8198 | if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) { |
8199 | switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) { |
8200 | case Intrinsic::assume: |
8201 | case Intrinsic::lifetime_start: |
8202 | case Intrinsic::lifetime_end: |
8203 | // For scalable vectors if one of the operands is variant then we still |
8204 | // want to mark as uniform, which will generate one instruction for just |
8205 | // the first lane of the vector. We can't scalarize the call in the same |
8206 | // way as for fixed-width vectors because we don't know how many lanes |
8207 | // there are. |
8208 | // |
8209 | // The reasons for doing it this way for scalable vectors are: |
8210 | // 1. For the assume intrinsic generating the instruction for the first |
8211 |       //    lane is still better than not generating any at all. For
8212 | // example, the input may be a splat across all lanes. |
8213 | // 2. For the lifetime start/end intrinsics the pointer operand only |
8214 | // does anything useful when the input comes from a stack object, |
8215 | // which suggests it should always be uniform. For non-stack objects |
8216 | // the effect is to poison the object, which still allows us to |
8217 | // remove the call. |
8218 | IsUniform = true; |
8219 | break; |
8220 | default: |
8221 | break; |
8222 | } |
8223 | } |
8224 | VPValue *BlockInMask = nullptr; |
8225 | if (!IsPredicated) { |
8226 | // Finalize the recipe for Instr, first if it is not predicated. |
8227 | LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n" ); |
8228 | } else { |
8229 | LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n" ); |
8230 | // Instructions marked for predication are replicated and a mask operand is |
8231 | // added initially. Masked replicate recipes will later be placed under an |
8232 | // if-then construct to prevent side-effects. Generate recipes to compute |
8233 | // the block mask for this region. |
8234 | BlockInMask = getBlockInMask(BB: I->getParent()); |
8235 | } |
8236 | |
8237 | // Note that there is some custom logic to mark some intrinsics as uniform |
8238 | // manually above for scalable vectors, which this assert needs to account for |
8239 | // as well. |
8240 | assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || |
8241 | (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && |
8242 | "Should not predicate a uniform recipe" ); |
8243 | auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(Operands: I->operands()), |
8244 | IsUniform, BlockInMask); |
8245 | return Recipe; |
8246 | } |
8247 | |
8248 | VPRecipeBase * |
8249 | VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, |
8250 | ArrayRef<VPValue *> Operands, |
8251 | VFRange &Range, VPBasicBlock *VPBB) { |
8252 | // First, check for specific widening recipes that deal with inductions, Phi |
8253 | // nodes, calls and memory operations. |
8254 | VPRecipeBase *Recipe; |
8255 | if (auto Phi = dyn_cast<PHINode>(Val: Instr)) { |
8256 | if (Phi->getParent() != OrigLoop->getHeader()) |
8257 | return tryToBlend(Phi, Operands); |
8258 | |
8259 | if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) |
8260 | return Recipe; |
8261 | |
8262 | VPHeaderPHIRecipe *PhiRecipe = nullptr; |
8263 | assert((Legal->isReductionVariable(Phi) || |
8264 | Legal->isFixedOrderRecurrence(Phi)) && |
8265 | "can only widen reductions and fixed-order recurrences here" ); |
8266 | VPValue *StartV = Operands[0]; |
8267 | if (Legal->isReductionVariable(PN: Phi)) { |
8268 | const RecurrenceDescriptor &RdxDesc = |
8269 | Legal->getReductionVars().find(Key: Phi)->second; |
8270 | assert(RdxDesc.getRecurrenceStartValue() == |
8271 | Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); |
8272 | PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, |
8273 | CM.isInLoopReduction(Phi), |
8274 | CM.useOrderedReductions(RdxDesc)); |
8275 | } else { |
8276 | // TODO: Currently fixed-order recurrences are modeled as chains of |
8277 | // first-order recurrences. If there are no users of the intermediate |
8278 | // recurrences in the chain, the fixed order recurrence should be modeled |
8279 | // directly, enabling more efficient codegen. |
8280 | PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); |
8281 | } |
8282 | |
8283 | PhisToFix.push_back(Elt: PhiRecipe); |
8284 | return PhiRecipe; |
8285 | } |
8286 | |
8287 | if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate( |
8288 | I: cast<TruncInst>(Val: Instr), Operands, Range))) |
8289 | return Recipe; |
8290 | |
8291 | // All widen recipes below deal only with VF > 1. |
8292 | if (LoopVectorizationPlanner::getDecisionAndClampRange( |
8293 | Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range)) |
8294 | return nullptr; |
8295 | |
8296 | if (auto *CI = dyn_cast<CallInst>(Val: Instr)) |
8297 | return tryToWidenCall(CI, Operands, Range); |
8298 | |
8299 | if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr)) |
8300 | return tryToWidenMemory(I: Instr, Operands, Range); |
8301 | |
8302 | if (!shouldWiden(I: Instr, Range)) |
8303 | return nullptr; |
8304 | |
8305 | if (auto GEP = dyn_cast<GetElementPtrInst>(Val: Instr)) |
8306 | return new VPWidenGEPRecipe(GEP, |
8307 | make_range(x: Operands.begin(), y: Operands.end())); |
8308 | |
8309 | if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) { |
8310 | return new VPWidenSelectRecipe( |
8311 | *SI, make_range(x: Operands.begin(), y: Operands.end())); |
8312 | } |
8313 | |
8314 | if (auto *CI = dyn_cast<CastInst>(Val: Instr)) { |
8315 | return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), |
8316 | *CI); |
8317 | } |
8318 | |
8319 | return tryToWiden(I: Instr, Operands, VPBB); |
8320 | } |
8321 | |
8322 | void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, |
8323 | ElementCount MaxVF) { |
8324 | assert(OrigLoop->isInnermost() && "Inner loop expected." ); |
8325 | |
8326 | auto MaxVFTimes2 = MaxVF * 2; |
8327 | for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) { |
8328 | VFRange SubRange = {VF, MaxVFTimes2}; |
8329 | if (auto Plan = tryToBuildVPlanWithVPRecipes(Range&: SubRange)) { |
8330 | // Now optimize the initial VPlan. |
8331 | if (!Plan->hasVF(VF: ElementCount::getFixed(MinVal: 1))) |
8332 | VPlanTransforms::truncateToMinimalBitwidths( |
8333 | Plan&: *Plan, MinBWs: CM.getMinimalBitwidths(), Ctx&: PSE.getSE()->getContext()); |
8334 | VPlanTransforms::optimize(Plan&: *Plan, SE&: *PSE.getSE()); |
8335 | // TODO: try to put it close to addActiveLaneMask(). |
8336 | // Discard the plan if it is not EVL-compatible |
8337 | if (CM.foldTailWithEVL() && |
8338 | !VPlanTransforms::tryAddExplicitVectorLength(Plan&: *Plan)) |
8339 | break; |
8340 | assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" ); |
8341 | VPlans.push_back(Elt: std::move(Plan)); |
8342 | } |
8343 | VF = SubRange.End; |
8344 | } |
8345 | } |
8346 | |
8347 | // Add the necessary canonical IV and branch recipes required to control the |
8348 | // loop. |
8349 | static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, |
8350 | DebugLoc DL) { |
8351 | Value *StartIdx = ConstantInt::get(Ty: IdxTy, V: 0); |
8352 | auto *StartV = Plan.getOrAddLiveIn(V: StartIdx); |
8353 | |
8354 | // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. |
8355 | auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); |
8356 | VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); |
8357 |   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8358 | Header->insert(Recipe: CanonicalIVPHI, InsertPt: Header->begin()); |
8359 | |
8360 | VPBuilder Builder(TopRegion->getExitingBasicBlock()); |
8361 | // Add a VPInstruction to increment the scalar canonical IV by VF * UF. |
8362 | auto *CanonicalIVIncrement = Builder.createOverflowingOp( |
8363 | Opcode: Instruction::Add, Operands: {CanonicalIVPHI, &Plan.getVFxUF()}, WrapFlags: {HasNUW, false}, DL, |
8364 | Name: "index.next" ); |
8365 | CanonicalIVPHI->addOperand(Operand: CanonicalIVIncrement); |
8366 | |
8367 | // Add the BranchOnCount VPInstruction to the latch. |
8368 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCount, |
8369 | Operands: {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); |
8370 | } |
8371 | |
8372 | // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the |
8373 | // original exit block. |
8374 | static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8375 | VPRecipeBuilder &Builder, VPlan &Plan) { |
8376 | BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); |
8377 | BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); |
8378 | // Only handle single-exit loops with unique exit blocks for now. |
8379 | if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) |
8380 | return; |
8381 | |
8382 | // Introduce VPUsers modeling the exit values. |
8383 | for (PHINode &ExitPhi : ExitBB->phis()) { |
8384 | Value *IncomingValue = |
8385 | ExitPhi.getIncomingValueForBlock(BB: ExitingBB); |
8386 | VPValue *V = Builder.getVPValueOrAddLiveIn(V: IncomingValue, Plan); |
8387 | // Exit values for inductions are computed and updated outside of VPlan and |
8388 | // independent of induction recipes. |
8389 | // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update |
8390 | // live-outs. |
8391 | if ((isa<VPWidenIntOrFpInductionRecipe>(Val: V) && |
8392 | !cast<VPWidenIntOrFpInductionRecipe>(Val: V)->getTruncInst()) || |
8393 | isa<VPWidenPointerInductionRecipe>(Val: V)) |
8394 | continue; |
8395 | Plan.addLiveOut(PN: &ExitPhi, V); |
8396 | } |
8397 | } |
8398 | |
/// Feed a resume value for every FOR from the vector loop to the scalar loop,
/// if the middle block branches to the scalar preheader, by introducing an
/// ExtractFromEnd recipe in the middle block and a ResumePhi recipe in the
/// scalar preheader, plus a VPLiveOut that uses the latter and corresponds to
/// the FOR phi in the scalar header.
8403 | static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { |
8404 | VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); |
8405 | |
// Start by finding out whether the middle block branches to the scalar
// preheader, which, unlike the exit block (the other possible successor of
// the middle block), is not a VPIRBasicBlock.
8409 | // TODO: Should be replaced by |
8410 | // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the |
8411 | // scalar region is modeled as well. |
8412 | VPBasicBlock *ScalarPHVPBB = nullptr; |
8413 | auto *MiddleVPBB = cast<VPBasicBlock>(Val: VectorRegion->getSingleSuccessor()); |
8414 | for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) { |
8415 | if (isa<VPIRBasicBlock>(Val: Succ)) |
8416 | continue; |
8417 | assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?" ); |
8418 | ScalarPHVPBB = cast<VPBasicBlock>(Val: Succ); |
8419 | } |
8420 | if (!ScalarPHVPBB) |
8421 | return; |
8422 | |
8423 | VPBuilder ScalarPHBuilder(ScalarPHVPBB); |
8424 | VPBuilder MiddleBuilder(MiddleVPBB); |
// Reset the insert point so new recipes are inserted before the terminator's
// condition if that condition is defined by a recipe in this block, and
// otherwise directly before the terminator.
8427 | if (auto *Terminator = MiddleVPBB->getTerminator()) { |
8428 | auto *Condition = dyn_cast<VPInstruction>(Val: Terminator->getOperand(N: 0)); |
8429 | assert((!Condition || Condition->getParent() == MiddleVPBB) && |
8430 | "Condition expected in MiddleVPBB" ); |
8431 | MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator); |
8432 | } |
8433 | VPValue *OneVPV = Plan.getOrAddLiveIn( |
8434 | V: ConstantInt::get(Ty: Plan.getCanonicalIV()->getScalarType(), V: 1)); |
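// OneVPV is the offset operand of the ExtractFromEnd below; an offset of 1
// extracts the last element of the recurrence's backedge value.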
8435 | |
for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8437 | auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi); |
8438 | if (!FOR) |
8439 | continue; |
8440 | |
8441 | // Extract the resume value and create a new VPLiveOut for it. |
8442 | auto *Resume = MiddleBuilder.createNaryOp(Opcode: VPInstruction::ExtractFromEnd, |
8443 | Operands: {FOR->getBackedgeValue(), OneVPV}, |
8444 | Inst: {}, Name: "vector.recur.extract" ); |
8445 | auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( |
8446 | Opcode: VPInstruction::ResumePhi, Operands: {Resume, FOR->getStartValue()}, Inst: {}, |
8447 | Name: "scalar.recur.init" ); |
8448 | Plan.addLiveOut(PN: cast<PHINode>(Val: FOR->getUnderlyingInstr()), V: ResumePhiRecipe); |
8449 | } |
8450 | } |
8451 | |
8452 | VPlanPtr |
8453 | LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { |
8454 | |
8455 | SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; |
8456 | |
8457 | // --------------------------------------------------------------------------- |
8458 | // Build initial VPlan: Scan the body of the loop in a topological order to |
8459 | // visit each basic block after having visited its predecessor basic blocks. |
8460 | // --------------------------------------------------------------------------- |
8461 | |
8462 | // Create initial VPlan skeleton, having a basic block for the pre-header |
8463 | // which contains SCEV expansions that need to happen before the CFG is |
8464 | // modified; a basic block for the vector pre-header, followed by a region for |
8465 | // the vector loop, followed by the middle basic block. The skeleton vector |
8466 | // loop region contains a header and latch basic blocks. |
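//
// Sketch of that skeleton (body recipes are added later):
//   ph (SCEV expansions) -> vector.ph -> ( header ... latch ) -> middle.block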
8467 | |
8468 | bool RequiresScalarEpilogueCheck = |
8469 | LoopVectorizationPlanner::getDecisionAndClampRange( |
8470 | Predicate: [this](ElementCount VF) { |
8471 | return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()); |
8472 | }, |
8473 | Range); |
8474 | VPlanPtr Plan = VPlan::createInitialVPlan( |
8475 | TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop), |
8476 | PSE&: *PSE.getSE(), RequiresScalarEpilogueCheck, TailFolded: CM.foldTailByMasking(), |
8477 | TheLoop: OrigLoop); |
8478 | |
// Don't use getDecisionAndClampRange here, because we don't know the UF yet;
// it is better for this function to be conservative than to split the range
// up into different VPlans.
8482 | // TODO: Consider using getDecisionAndClampRange here to split up VPlans. |
8483 | bool IVUpdateMayOverflow = false; |
8484 | for (ElementCount VF : Range) |
8485 | IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF); |
8486 | |
8487 | DebugLoc DL = getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()); |
8488 | TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); |
8489 | // When not folding the tail, we know that the induction increment will not |
8490 | // overflow. |
8491 | bool HasNUW = Style == TailFoldingStyle::None; |
8492 | addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, DL); |
8493 | |
8494 | VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); |
8495 | |
8496 | // --------------------------------------------------------------------------- |
8497 | // Pre-construction: record ingredients whose recipes we'll need to further |
8498 | // process after constructing the initial VPlan. |
8499 | // --------------------------------------------------------------------------- |
8500 | |
8501 | // For each interleave group which is relevant for this (possibly trimmed) |
8502 | // Range, add it to the set of groups to be later applied to the VPlan and add |
8503 | // placeholders for its members' Recipes which we'll be replacing with a |
8504 | // single VPInterleaveRecipe. |
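// For example, a factor-2 group (say, loads of A[2*i] and A[2*i+1]) has both
// members widened by one VPInterleaveRecipe.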
8505 | for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { |
8506 | auto applyIG = [IG, this](ElementCount VF) -> bool { |
8507 | bool Result = (VF.isVector() && // Query is illegal for VF == 1 |
8508 | CM.getWideningDecision(I: IG->getInsertPos(), VF) == |
8509 | LoopVectorizationCostModel::CM_Interleave); |
8510 | // For scalable vectors, the only interleave factor currently supported |
8511 | // is 2 since we require the (de)interleave2 intrinsics instead of |
8512 | // shufflevectors. |
8513 | assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && |
8514 | "Unsupported interleave factor for scalable vectors" ); |
8515 | return Result; |
8516 | }; |
8517 | if (!getDecisionAndClampRange(Predicate: applyIG, Range)) |
8518 | continue; |
8519 | InterleaveGroups.insert(Ptr: IG); |
8520 | }; |
8521 | |
8522 | // --------------------------------------------------------------------------- |
8523 | // Construct recipes for the instructions in the loop |
8524 | // --------------------------------------------------------------------------- |
8525 | |
8526 | // Scan the body of the loop in a topological order to visit each basic block |
8527 | // after having visited its predecessor basic blocks. |
8528 | LoopBlocksDFS DFS(OrigLoop); |
8529 | DFS.perform(LI); |
8530 | |
VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8532 | VPBasicBlock *VPBB = HeaderVPBB; |
BasicBlock *HeaderBB = OrigLoop->getHeader();
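// Masks are needed when folding the tail, when any block needs predication,
// or when any non-header block contains phis (which are lowered to blends).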
8534 | bool NeedsMasks = |
8535 | CM.foldTailByMasking() || |
8536 | any_of(Range: OrigLoop->blocks(), P: [this, HeaderBB](BasicBlock *BB) { |
8537 | bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); |
8538 | return Legal->blockNeedsPredication(BB) || NeedsBlends; |
8539 | }); |
8540 | for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) { |
8541 | // Relevant instructions from basic block BB will be grouped into VPRecipe |
8542 | // ingredients and fill a new VPBasicBlock. |
8543 | if (VPBB != HeaderVPBB) |
8544 | VPBB->setName(BB->getName()); |
8545 | Builder.setInsertPoint(VPBB); |
8546 | |
8547 | if (VPBB == HeaderVPBB) |
8548 | RecipeBuilder.createHeaderMask(); |
8549 | else if (NeedsMasks) |
8550 | RecipeBuilder.createBlockInMask(BB); |
8551 | |
8552 | // Introduce each ingredient into VPlan. |
8553 | // TODO: Model and preserve debug intrinsics in VPlan. |
8554 | for (Instruction &I : drop_end(RangeOrContainer: BB->instructionsWithoutDebug(SkipPseudoOp: false))) { |
8555 | Instruction *Instr = &I; |
8556 | SmallVector<VPValue *, 4> Operands; |
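// Header phis are created with only their start value as operand here; the
// backedge operands of reduction and recurrence phis are added later by
// RecipeBuilder.fixHeaderPhis().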
8557 | auto *Phi = dyn_cast<PHINode>(Val: Instr); |
8558 | if (Phi && Phi->getParent() == HeaderBB) { |
8559 | Operands.push_back(Elt: Plan->getOrAddLiveIn( |
8560 | V: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopPreheader()))); |
8561 | } else { |
8562 | auto OpRange = RecipeBuilder.mapToVPValues(Operands: Instr->operands()); |
8563 | Operands = {OpRange.begin(), OpRange.end()}; |
8564 | } |
8565 | |
// Invariant stores inside the loop will be deleted, and a single store
// with the final reduction value will be added to the exit block.
8568 | StoreInst *SI; |
8569 | if ((SI = dyn_cast<StoreInst>(Val: &I)) && |
8570 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) |
8571 | continue; |
8572 | |
8573 | VPRecipeBase *Recipe = |
8574 | RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); |
8575 | if (!Recipe) |
8576 | Recipe = RecipeBuilder.handleReplication(I: Instr, Range); |
8577 | |
8578 | RecipeBuilder.setRecipe(I: Instr, R: Recipe); |
8579 | if (isa<VPHeaderPHIRecipe>(Val: Recipe)) { |
8580 | // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In |
8581 | // the following cases, VPHeaderPHIRecipes may be created after non-phi |
8582 | // recipes and need to be moved to the phi section of HeaderVPBB: |
8583 | // * tail-folding (non-phi recipes computing the header mask are |
8584 | // introduced earlier than regular header phi recipes, and should appear |
8585 | // after them) |
8586 | // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. |
8587 | |
8588 | assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || |
8589 | CM.foldTailByMasking() || isa<TruncInst>(Instr)) && |
8590 | "unexpected recipe needs moving" ); |
8591 | Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi()); |
8592 | } else |
8593 | VPBB->appendRecipe(Recipe); |
8594 | } |
8595 | |
8596 | VPBlockUtils::insertBlockAfter(NewBlock: new VPBasicBlock(), BlockPtr: VPBB); |
8597 | VPBB = cast<VPBasicBlock>(Val: VPBB->getSingleSuccessor()); |
8598 | } |
8599 | |
8600 | // After here, VPBB should not be used. |
8601 | VPBB = nullptr; |
8602 | |
8603 | if (CM.requiresScalarEpilogue(Range)) { |
8604 | // No edge from the middle block to the unique exit block has been inserted |
8605 | // and there is nothing to fix from vector loop; phis should have incoming |
8606 | // from scalar loop only. |
8607 | } else |
8608 | addUsersInExitBlock(HeaderVPBB, OrigLoop, Builder&: RecipeBuilder, Plan&: *Plan); |
8609 | |
8610 | assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && |
8611 | !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && |
8612 | "entry block must be set to a VPRegionBlock having a non-empty entry " |
8613 | "VPBasicBlock" ); |
8614 | RecipeBuilder.fixHeaderPhis(); |
8615 | |
8616 | addLiveOutsForFirstOrderRecurrences(Plan&: *Plan); |
8617 | |
8618 | // --------------------------------------------------------------------------- |
8619 | // Transform initial VPlan: Apply previously taken decisions, in order, to |
8620 | // bring the VPlan to its final state. |
8621 | // --------------------------------------------------------------------------- |
8622 | |
8623 | // Adjust the recipes for any inloop reductions. |
8624 | adjustRecipesForReductions(Plan, RecipeBuilder, MinVF: Range.Start); |
8625 | |
8626 | // Interleave memory: for each Interleave Group we marked earlier as relevant |
8627 | // for this VPlan, replace the Recipes widening its memory instructions with a |
8628 | // single VPInterleaveRecipe at its insertion point. |
8629 | for (const auto *IG : InterleaveGroups) { |
8630 | auto *Recipe = |
8631 | cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getInsertPos())); |
8632 | SmallVector<VPValue *, 4> StoredValues; |
8633 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
8634 | if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) { |
8635 | auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI)); |
8636 | StoredValues.push_back(Elt: StoreR->getStoredValue()); |
8637 | } |
8638 | |
8639 | bool NeedsMaskForGaps = |
8640 | IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); |
8641 | assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) && |
8642 | "masked interleaved groups are not allowed." ); |
8643 | auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, |
8644 | Recipe->getMask(), NeedsMaskForGaps); |
8645 | VPIG->insertBefore(InsertPos: Recipe); |
8646 | unsigned J = 0; |
8647 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
8648 | if (Instruction *Member = IG->getMember(Index: i)) { |
8649 | VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member); |
8650 | if (!Member->getType()->isVoidTy()) { |
8651 | VPValue *OriginalV = MemberR->getVPSingleValue(); |
8652 | OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J)); |
8653 | J++; |
8654 | } |
8655 | MemberR->eraseFromParent(); |
8656 | } |
8657 | } |
8658 | |
8659 | for (ElementCount VF : Range) |
8660 | Plan->addVF(VF); |
8661 | Plan->setName("Initial VPlan" ); |
8662 | |
8663 | // Replace VPValues for known constant strides guaranteed by predicate scalar |
8664 | // evolution. |
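// For example, if runtime checks guarantee that a symbolic stride is 1, uses
// of that stride (or of a sext/zext of it) in the plan are replaced by the
// constant, which can enable consecutive-access code generation.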
8665 | for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { |
8666 | auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue(); |
8667 | auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV)); |
8668 | // Only handle constant strides for now. |
8669 | if (!ScevStride) |
8670 | continue; |
8671 | |
8672 | auto *CI = Plan->getOrAddLiveIn( |
8673 | V: ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt())); |
8674 | if (VPValue *StrideVPV = Plan->getLiveIn(V: StrideV)) |
8675 | StrideVPV->replaceAllUsesWith(New: CI); |
8676 | |
8677 | // The versioned value may not be used in the loop directly but through a |
8678 | // sext/zext. Add new live-ins in those cases. |
8679 | for (Value *U : StrideV->users()) { |
8680 | if (!isa<SExtInst, ZExtInst>(Val: U)) |
8681 | continue; |
8682 | VPValue *StrideVPV = Plan->getLiveIn(V: U); |
8683 | if (!StrideVPV) |
8684 | continue; |
8685 | unsigned BW = U->getType()->getScalarSizeInBits(); |
8686 | APInt C = isa<SExtInst>(Val: U) ? ScevStride->getAPInt().sext(width: BW) |
8687 | : ScevStride->getAPInt().zext(width: BW); |
8688 | VPValue *CI = Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: U->getType(), V: C)); |
8689 | StrideVPV->replaceAllUsesWith(New: CI); |
8690 | } |
8691 | } |
8692 | |
8693 | VPlanTransforms::dropPoisonGeneratingRecipes(Plan&: *Plan, BlockNeedsPredication: [this](BasicBlock *BB) { |
8694 | return Legal->blockNeedsPredication(BB); |
8695 | }); |
8696 | |
8697 | // Sink users of fixed-order recurrence past the recipe defining the previous |
8698 | // value and introduce FirstOrderRecurrenceSplice VPInstructions. |
8699 | if (!VPlanTransforms::adjustFixedOrderRecurrences(Plan&: *Plan, Builder)) |
8700 | return nullptr; |
8701 | |
8702 | if (useActiveLaneMask(Style)) { |
8703 | // TODO: Move checks to VPlanTransforms::addActiveLaneMask once |
8704 | // TailFoldingStyle is visible there. |
8705 | bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); |
8706 | bool WithoutRuntimeCheck = |
8707 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
8708 | VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow, |
8709 | DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck); |
8710 | } |
8711 | return Plan; |
8712 | } |
8713 | |
8714 | VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { |
8715 | // Outer loop handling: They may require CFG and instruction level |
8716 | // transformations before even evaluating whether vectorization is profitable. |
8717 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
8718 | // the vectorization pipeline. |
8719 | assert(!OrigLoop->isInnermost()); |
8720 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled." ); |
8721 | |
8722 | // Create new empty VPlan |
8723 | auto Plan = VPlan::createInitialVPlan( |
8724 | TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop), |
8725 | PSE&: *PSE.getSE(), RequiresScalarEpilogueCheck: true, TailFolded: false, TheLoop: OrigLoop); |
8726 | |
8727 | // Build hierarchical CFG |
8728 | VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); |
8729 | HCFGBuilder.buildHierarchicalCFG(); |
8730 | |
8731 | for (ElementCount VF : Range) |
8732 | Plan->addVF(VF); |
8733 | |
8734 | VPlanTransforms::VPInstructionsToVPRecipes( |
8735 | Plan, |
8736 | GetIntOrFpInductionDescriptor: [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(Phi: P); }, |
8737 | SE&: *PSE.getSE(), TLI: *TLI); |
8738 | |
8739 | // Remove the existing terminator of the exiting block of the top-most region. |
8740 | // A BranchOnCount will be added instead when adding the canonical IV recipes. |
8741 | auto *Term = |
8742 | Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); |
8743 | Term->eraseFromParent(); |
8744 | |
8745 | // Tail folding is not supported for outer loops, so the induction increment |
8746 | // is guaranteed to not wrap. |
8747 | bool HasNUW = true; |
8748 | addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, |
8749 | DL: DebugLoc()); |
8750 | assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" ); |
8751 | return Plan; |
8752 | } |
8753 | |
8754 | // Adjust the recipes for reductions. For in-loop reductions the chain of |
// instructions leading from the loop exit instr to the phi needs to be converted
8756 | // to reductions, with one operand being vector and the other being the scalar |
8757 | // reduction chain. For other reductions, a select is introduced between the phi |
8758 | // and live-out recipes when folding the tail. |
8759 | // |
8760 | // A ComputeReductionResult recipe is added to the middle block, also for |
8761 | // in-loop reductions which compute their result in-loop, because generating |
8762 | // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. |
8763 | // |
8764 | // Adjust AnyOf reductions; replace the reduction phi for the selected value |
8765 | // with a boolean reduction phi node to check if the condition is true in any |
8766 | // iteration. The final value is selected by the final ComputeReductionResult. |
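//
// For example, for an in-loop integer add reduction the chain
//   phi -> add -> ... -> exit value
// has each widened add replaced by a VPReductionRecipe that takes the scalar
// chain (starting at the phi) and the vector operand, optionally masked by
// the block-in mask.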
8767 | void LoopVectorizationPlanner::adjustRecipesForReductions( |
8768 | VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { |
8769 | VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); |
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8771 | // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores |
8772 | // sank outside of the loop would keep the same order as they had in the |
8773 | // original loop. |
8774 | SmallVector<VPReductionPHIRecipe *> ReductionPHIList; |
8775 | for (VPRecipeBase &R : Header->phis()) { |
8776 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) |
8777 | ReductionPHIList.emplace_back(Args&: ReductionPhi); |
8778 | } |
8779 | bool HasIntermediateStore = false; |
8780 | stable_sort(Range&: ReductionPHIList, |
8781 | C: [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, |
8782 | const VPReductionPHIRecipe *R2) { |
8783 | auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; |
8784 | auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; |
8785 | HasIntermediateStore |= IS1 || IS2; |
8786 | |
8787 | // If neither of the recipes has an intermediate store, keep the |
8788 | // order the same. |
8789 | if (!IS1 && !IS2) |
8790 | return false; |
8791 | |
8792 | // If only one of the recipes has an intermediate store, then |
8793 | // move it towards the beginning of the list. |
8794 | if (IS1 && !IS2) |
8795 | return true; |
8796 | |
8797 | if (!IS1 && IS2) |
8798 | return false; |
8799 | |
8800 | // If both recipes have an intermediate store, then the recipe |
8801 | // with the later store should be processed earlier. So it |
8802 | // should go to the beginning of the list. |
8803 | return DT->dominates(Def: IS2, User: IS1); |
8804 | }); |
8805 | |
8806 | if (HasIntermediateStore && ReductionPHIList.size() > 1) |
8807 | for (VPRecipeBase *R : ReductionPHIList) |
8808 | R->moveBefore(BB&: *Header, I: Header->getFirstNonPhi()); |
8809 | |
8810 | for (VPRecipeBase &R : Header->phis()) { |
8811 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
8812 | if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) |
8813 | continue; |
8814 | |
8815 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
8816 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
8817 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
8818 | "AnyOf reductions are not allowed for in-loop reductions" ); |
8819 | |
8820 | // Collect the chain of "link" recipes for the reduction starting at PhiR. |
8821 | SetVector<VPSingleDefRecipe *> Worklist; |
8822 | Worklist.insert(X: PhiR); |
8823 | for (unsigned I = 0; I != Worklist.size(); ++I) { |
8824 | VPSingleDefRecipe *Cur = Worklist[I]; |
8825 | for (VPUser *U : Cur->users()) { |
8826 | auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(Val: U); |
8827 | if (!UserRecipe) { |
8828 | assert(isa<VPLiveOut>(U) && |
8829 | "U must either be a VPSingleDef or VPLiveOut" ); |
8830 | continue; |
8831 | } |
8832 | Worklist.insert(X: UserRecipe); |
8833 | } |
8834 | } |
8835 | |
8836 | // Visit operation "Links" along the reduction chain top-down starting from |
8837 | // the phi until LoopExitValue. We keep track of the previous item |
8838 | // (PreviousLink) to tell which of the two operands of a Link will remain |
// scalar and which will be reduced. For minmax by select(cmp), Link will be
// the select instruction. Blend recipes of in-loop reduction phis will
8841 | // get folded to their non-phi operand, as the reduction recipe handles the |
8842 | // condition directly. |
8843 | VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. |
8844 | for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { |
8845 | Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); |
8846 | |
8847 | // Index of the first operand which holds a non-mask vector operand. |
8848 | unsigned IndexOfFirstOperand; |
8849 | // Recognize a call to the llvm.fmuladd intrinsic. |
8850 | bool IsFMulAdd = (Kind == RecurKind::FMulAdd); |
8851 | VPValue *VecOp; |
8852 | VPBasicBlock *LinkVPBB = CurrentLink->getParent(); |
8853 | if (IsFMulAdd) { |
8854 | assert( |
8855 | RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && |
8856 | "Expected instruction to be a call to the llvm.fmuladd intrinsic" ); |
8857 | assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || |
8858 | isa<VPWidenCallRecipe>(CurrentLink)) && |
8859 | CurrentLink->getOperand(2) == PreviousLink && |
8860 | "expected a call where the previous link is the added operand" ); |
8861 | |
8862 | // If the instruction is a call to the llvm.fmuladd intrinsic then we |
8863 | // need to create an fmul recipe (multiplying the first two operands of |
8864 | // the fmuladd together) to use as the vector operand for the fadd |
8865 | // reduction. |
8866 | VPInstruction *FMulRecipe = new VPInstruction( |
8867 | Instruction::FMul, |
8868 | {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)}, |
8869 | CurrentLinkI->getFastMathFlags()); |
8870 | LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator()); |
8871 | VecOp = FMulRecipe; |
8872 | } else { |
8873 | auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink); |
8874 | if (PhiR->isInLoop() && Blend) { |
8875 | assert(Blend->getNumIncomingValues() == 2 && |
8876 | "Blend must have 2 incoming values" ); |
8877 | if (Blend->getIncomingValue(Idx: 0) == PhiR) |
8878 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1)); |
8879 | else { |
8880 | assert(Blend->getIncomingValue(1) == PhiR && |
8881 | "PhiR must be an operand of the blend" ); |
8882 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0)); |
8883 | } |
8884 | continue; |
8885 | } |
8886 | |
8887 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
8888 | if (isa<VPWidenRecipe>(Val: CurrentLink)) { |
8889 | assert(isa<CmpInst>(CurrentLinkI) && |
8890 | "need to have the compare of the select" ); |
8891 | continue; |
8892 | } |
8893 | assert(isa<VPWidenSelectRecipe>(CurrentLink) && |
8894 | "must be a select recipe" ); |
8895 | IndexOfFirstOperand = 1; |
8896 | } else { |
8897 | assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && |
8898 | "Expected to replace a VPWidenSC" ); |
8899 | IndexOfFirstOperand = 0; |
8900 | } |
8901 | // Note that for non-commutable operands (cmp-selects), the semantics of |
8902 | // the cmp-select are captured in the recurrence kind. |
8903 | unsigned VecOpId = |
8904 | CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink |
8905 | ? IndexOfFirstOperand + 1 |
8906 | : IndexOfFirstOperand; |
8907 | VecOp = CurrentLink->getOperand(N: VecOpId); |
8908 | assert(VecOp != PreviousLink && |
8909 | CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - |
8910 | (VecOpId - IndexOfFirstOperand)) == |
8911 | PreviousLink && |
8912 | "PreviousLink must be the operand other than VecOp" ); |
8913 | } |
8914 | |
8915 | BasicBlock *BB = CurrentLinkI->getParent(); |
8916 | VPValue *CondOp = nullptr; |
8917 | if (CM.blockNeedsPredicationForAnyReason(BB)) |
8918 | CondOp = RecipeBuilder.getBlockInMask(BB); |
8919 | |
8920 | VPReductionRecipe *RedRecipe = |
8921 | new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp, |
8922 | CondOp, CM.useOrderedReductions(RdxDesc)); |
8923 | // Append the recipe to the end of the VPBasicBlock because we need to |
// ensure that it comes after all of its inputs, including CondOp.
8925 | // Note that this transformation may leave over dead recipes (including |
8926 | // CurrentLink), which will be cleaned by a later VPlan transform. |
8927 | LinkVPBB->appendRecipe(Recipe: RedRecipe); |
8928 | CurrentLink->replaceAllUsesWith(New: RedRecipe); |
8929 | PreviousLink = RedRecipe; |
8930 | } |
8931 | } |
8932 | VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); |
8933 | Builder.setInsertPoint(&*LatchVPBB->begin()); |
8934 | VPBasicBlock *MiddleVPBB = |
8935 | cast<VPBasicBlock>(Val: VectorLoopRegion->getSingleSuccessor()); |
8936 | VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); |
8937 | for (VPRecipeBase &R : |
8938 | Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
8939 | VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
8940 | if (!PhiR) |
8941 | continue; |
8942 | |
8943 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
8944 | // Adjust AnyOf reductions; replace the reduction phi for the selected value |
8945 | // with a boolean reduction phi node to check if the condition is true in |
8946 | // any iteration. The final value is selected by the final |
8947 | // ComputeReductionResult. |
8948 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind( |
8949 | Kind: RdxDesc.getRecurrenceKind())) { |
8950 | auto *Select = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) { |
8951 | return isa<VPWidenSelectRecipe>(Val: U) || |
8952 | (isa<VPReplicateRecipe>(Val: U) && |
8953 | cast<VPReplicateRecipe>(Val: U)->getUnderlyingInstr()->getOpcode() == |
8954 | Instruction::Select); |
8955 | })); |
8956 | VPValue *Cmp = Select->getOperand(N: 0); |
8957 | // If the compare is checking the reduction PHI node, adjust it to check |
8958 | // the start value. |
8959 | if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { |
8960 | for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) |
8961 | if (CmpR->getOperand(N: I) == PhiR) |
8962 | CmpR->setOperand(I, New: PhiR->getStartValue()); |
8963 | } |
8964 | VPBuilder::InsertPointGuard Guard(Builder); |
8965 | Builder.setInsertPoint(Select); |
8966 | |
8967 | // If the true value of the select is the reduction phi, the new value is |
8968 | // selected if the negated condition is true in any iteration. |
8969 | if (Select->getOperand(N: 1) == PhiR) |
8970 | Cmp = Builder.createNot(Operand: Cmp); |
8971 | VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp); |
8972 | Select->getVPSingleValue()->replaceAllUsesWith(New: Or); |
8973 | |
8974 | // Convert the reduction phi to operate on bools. |
8975 | PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: ConstantInt::getFalse( |
8976 | Context&: OrigLoop->getHeader()->getContext()))); |
8977 | } |
8978 | |
8979 | // If tail is folded by masking, introduce selects between the phi |
8980 | // and the live-out instruction of each reduction, at the beginning of the |
8981 | // dedicated latch block. |
8982 | auto *OrigExitingVPV = PhiR->getBackedgeValue(); |
8983 | auto *NewExitingVPV = PhiR->getBackedgeValue(); |
8984 | if (!PhiR->isInLoop() && CM.foldTailByMasking()) { |
8985 | VPValue *Cond = RecipeBuilder.getBlockInMask(BB: OrigLoop->getHeader()); |
8986 | assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && |
8987 | "reduction recipe must be defined before latch" ); |
8988 | Type *PhiTy = PhiR->getOperand(N: 0)->getLiveInIRValue()->getType(); |
8989 | std::optional<FastMathFlags> FMFs = |
8990 | PhiTy->isFloatingPointTy() |
8991 | ? std::make_optional(t: RdxDesc.getFastMathFlags()) |
8992 | : std::nullopt; |
8993 | NewExitingVPV = |
8994 | Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "" , FMFs); |
8995 | OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) { |
8996 | return isa<VPInstruction>(Val: &U) && |
8997 | cast<VPInstruction>(Val: &U)->getOpcode() == |
8998 | VPInstruction::ComputeReductionResult; |
8999 | }); |
9000 | if (PreferPredicatedReductionSelect || |
9001 | TTI.preferPredicatedReductionSelect( |
9002 | Opcode: PhiR->getRecurrenceDescriptor().getOpcode(), Ty: PhiTy, |
9003 | Flags: TargetTransformInfo::ReductionFlags())) |
9004 | PhiR->setOperand(I: 1, New: NewExitingVPV); |
9005 | } |
9006 | |
9007 | // If the vector reduction can be performed in a smaller type, we truncate |
9008 | // then extend the loop exit value to enable InstCombine to evaluate the |
9009 | // entire expression in the smaller type. |
9010 | Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); |
9011 | if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && |
9012 | !RecurrenceDescriptor::isAnyOfRecurrenceKind( |
9013 | Kind: RdxDesc.getRecurrenceKind())) { |
9014 | assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!" ); |
9015 | Type *RdxTy = RdxDesc.getRecurrenceType(); |
9016 | auto *Trunc = |
9017 | new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); |
9018 | auto *Extnd = |
9019 | RdxDesc.isSigned() |
9020 | ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) |
9021 | : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); |
9022 | |
9023 | Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe()); |
9024 | Extnd->insertAfter(InsertPos: Trunc); |
9025 | if (PhiR->getOperand(N: 1) == NewExitingVPV) |
9026 | PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue()); |
9027 | NewExitingVPV = Extnd; |
9028 | } |
9029 | |
9030 | // We want code in the middle block to appear to execute on the location of |
9031 | // the scalar loop's latch terminator because: (a) it is all compiler |
9032 | // generated, (b) these instructions are always executed after evaluating |
9033 | // the latch conditional branch, and (c) other passes may add new |
9034 | // predecessors which terminate on this line. This is the easiest way to |
9035 | // ensure we don't accidentally cause an extra step back into the loop while |
9036 | // debugging. |
9037 | DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); |
9038 | |
9039 | // TODO: At the moment ComputeReductionResult also drives creation of the |
9040 | // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here |
9041 | // even for in-loop reductions, until the reduction resume value handling is |
9042 | // also modeled in VPlan. |
9043 | auto *FinalReductionResult = new VPInstruction( |
9044 | VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); |
9045 | FinalReductionResult->insertBefore(BB&: *MiddleVPBB, IP); |
9046 | OrigExitingVPV->replaceUsesWithIf( |
9047 | New: FinalReductionResult, |
9048 | ShouldReplace: [](VPUser &User, unsigned) { return isa<VPLiveOut>(Val: &User); }); |
9049 | } |
9050 | |
9051 | VPlanTransforms::clearReductionWrapFlags(Plan&: *Plan); |
9052 | } |
9053 | |
9054 | void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { |
9055 | assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && |
9056 | "Not a pointer induction according to InductionDescriptor!" ); |
9057 | assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && |
9058 | "Unexpected type." ); |
9059 | assert(!onlyScalarsGenerated(State.VF.isScalable()) && |
9060 | "Recipe should have been replaced" ); |
9061 | |
9062 | auto *IVR = getParent()->getPlan()->getCanonicalIV(); |
9063 | PHINode *CanonicalIV = cast<PHINode>(Val: State.get(Def: IVR, Part: 0, /*IsScalar*/ true)); |
9064 | Type *PhiType = IndDesc.getStep()->getType(); |
9065 | |
9066 | // Build a pointer phi |
9067 | Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); |
9068 | Type *ScStValueType = ScalarStartValue->getType(); |
9069 | PHINode *NewPointerPhi = PHINode::Create(Ty: ScStValueType, NumReservedValues: 2, NameStr: "pointer.phi" , |
9070 | InsertBefore: CanonicalIV->getIterator()); |
9071 | |
9072 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
9073 | NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH); |
9074 | |
9075 | // A pointer induction, performed by using a gep |
9076 | BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); |
9077 | |
9078 | Value *ScalarStepValue = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0)); |
9079 | Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF); |
9080 | Value *NumUnrolledElems = |
9081 | State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: State.UF)); |
9082 | Value *InductionGEP = GetElementPtrInst::Create( |
9083 | PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
9084 | IdxList: State.Builder.CreateMul(LHS: ScalarStepValue, RHS: NumUnrolledElems), NameStr: "ptr.ind" , |
9085 | InsertBefore: InductionLoc); |
9086 | // Add induction update using an incorrect block temporarily. The phi node |
9087 | // will be fixed after VPlan execution. Note that at this point the latch |
9088 | // block cannot be used, as it does not exist yet. |
9089 | // TODO: Model increment value in VPlan, by turning the recipe into a |
9090 | // multi-def and a subclass of VPHeaderPHIRecipe. |
9091 | NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH); |
9092 | |
9093 | // Create UF many actual address geps that use the pointer |
9094 | // phi as base and a vectorized version of the step value |
9095 | // (<step*0, ..., step*N>) as offset. |
9096 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9097 | Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF); |
9098 | Value *StartOffsetScalar = |
9099 | State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: Part)); |
9100 | Value *StartOffset = |
9101 | State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar); |
9102 | // Create a vector of consecutive numbers from zero to VF. |
9103 | StartOffset = State.Builder.CreateAdd( |
9104 | LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType)); |
9105 | |
9106 | assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && |
9107 | "scalar step must be the same across all parts" ); |
9108 | Value *GEP = State.Builder.CreateGEP( |
9109 | Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
9110 | IdxList: State.Builder.CreateMul( |
9111 | LHS: StartOffset, |
9112 | RHS: State.Builder.CreateVectorSplat(EC: State.VF, V: ScalarStepValue), |
9113 | Name: "vector.gep" )); |
9114 | State.set(Def: this, V: GEP, Part); |
9115 | } |
9116 | } |
9117 | |
9118 | void VPDerivedIVRecipe::execute(VPTransformState &State) { |
9119 | assert(!State.Instance && "VPDerivedIVRecipe being replicated." ); |
9120 | |
9121 | // Fast-math-flags propagate from the original induction instruction. |
9122 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
9123 | if (FPBinOp) |
9124 | State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); |
9125 | |
9126 | Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0)); |
9127 | Value *CanonicalIV = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0)); |
9128 | Value *DerivedIV = emitTransformedIndex( |
9129 | B&: State.Builder, Index: CanonicalIV, StartValue: getStartValue()->getLiveInIRValue(), Step, |
9130 | InductionKind: Kind, InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp)); |
9131 | DerivedIV->setName("offset.idx" ); |
9132 | assert(DerivedIV != CanonicalIV && "IV didn't need transforming?" ); |
9133 | |
9134 | State.set(Def: this, V: DerivedIV, Instance: VPIteration(0, 0)); |
9135 | } |
9136 | |
9137 | void VPReplicateRecipe::execute(VPTransformState &State) { |
9138 | Instruction *UI = getUnderlyingInstr(); |
9139 | if (State.Instance) { // Generate a single instance. |
9140 | assert((State.VF.isScalar() || !isUniform()) && |
9141 | "uniform recipe shouldn't be predicated" ); |
9142 | assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" ); |
9143 | State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: *State.Instance, State); |
9144 | // Insert scalar instance packing it into a vector. |
9145 | if (State.VF.isVector() && shouldPack()) { |
9146 | // If we're constructing lane 0, initialize to start from poison. |
9147 | if (State.Instance->Lane.isFirstLane()) { |
9148 | assert(!State.VF.isScalable() && "VF is assumed to be non scalable." ); |
9149 | Value *Poison = PoisonValue::get( |
9150 | T: VectorType::get(ElementType: UI->getType(), EC: State.VF)); |
9151 | State.set(Def: this, V: Poison, Part: State.Instance->Part); |
9152 | } |
9153 | State.packScalarIntoVectorValue(Def: this, Instance: *State.Instance); |
9154 | } |
9155 | return; |
9156 | } |
9157 | |
9158 | if (IsUniform) { |
9159 | // If the recipe is uniform across all parts (instead of just per VF), only |
9160 | // generate a single instance. |
9161 | if ((isa<LoadInst>(Val: UI) || isa<StoreInst>(Val: UI)) && |
9162 | all_of(Range: operands(), P: [](VPValue *Op) { |
9163 | return Op->isDefinedOutsideVectorRegions(); |
9164 | })) { |
9165 | State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(0, 0), State); |
9166 | if (user_begin() != user_end()) { |
9167 | for (unsigned Part = 1; Part < State.UF; ++Part) |
9168 | State.set(Def: this, V: State.get(Def: this, Instance: VPIteration(0, 0)), |
9169 | Instance: VPIteration(Part, 0)); |
9170 | } |
9171 | return; |
9172 | } |
9173 | |
9174 | // Uniform within VL means we need to generate lane 0 only for each |
9175 | // unrolled copy. |
9176 | for (unsigned Part = 0; Part < State.UF; ++Part) |
9177 | State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, 0), State); |
9178 | return; |
9179 | } |
9180 | |
9181 | // A store of a loop varying value to a uniform address only needs the last |
9182 | // copy of the store. |
9183 | if (isa<StoreInst>(Val: UI) && |
9184 | vputils::isUniformAfterVectorization(VPV: getOperand(N: 1))) { |
9185 | auto Lane = VPLane::getLastLaneForVF(VF: State.VF); |
9186 | State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(State.UF - 1, Lane), |
9187 | State); |
9188 | return; |
9189 | } |
9190 | |
9191 | // Generate scalar instances for all VF lanes of all UF parts. |
9192 | assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" ); |
9193 | const unsigned EndLane = State.VF.getKnownMinValue(); |
9194 | for (unsigned Part = 0; Part < State.UF; ++Part) |
9195 | for (unsigned Lane = 0; Lane < EndLane; ++Lane) |
9196 | State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, Lane), State); |
9197 | } |
9198 | |
9199 | void VPWidenLoadRecipe::execute(VPTransformState &State) { |
9200 | auto *LI = cast<LoadInst>(Val: &Ingredient); |
9201 | |
9202 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
9203 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
9204 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
9205 | bool CreateGather = !isConsecutive(); |
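// Consecutive accesses use a single wide (possibly masked) load per part and
// are reversed afterwards if needed; non-consecutive accesses use gathers.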
9206 | |
9207 | auto &Builder = State.Builder; |
9208 | State.setDebugLocFrom(getDebugLoc()); |
9209 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9210 | Value *NewLI; |
9211 | Value *Mask = nullptr; |
9212 | if (auto *VPMask = getMask()) { |
// Mask reversal is only needed when there is an actual mask; a null mask
// represents an all-one mask, and the reverse of an all-one mask is again
// all-one.
9215 | Mask = State.get(Def: VPMask, Part); |
9216 | if (isReverse()) |
9217 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
9218 | } |
9219 | |
9220 | Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateGather); |
9221 | if (CreateGather) { |
9222 | NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr, |
9223 | Name: "wide.masked.gather" ); |
9224 | } else if (Mask) { |
9225 | NewLI = Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask, |
9226 | PassThru: PoisonValue::get(T: DataTy), |
9227 | Name: "wide.masked.load" ); |
9228 | } else { |
9229 | NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" ); |
9230 | } |
9231 | // Add metadata to the load, but setVectorValue to the reverse shuffle. |
9232 | State.addMetadata(To: NewLI, From: LI); |
9233 | if (Reverse) |
9234 | NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse" ); |
9235 | State.set(Def: this, V: NewLI, Part); |
9236 | } |
9237 | } |
9238 | |
/// Use an all-true mask for the reverse rather than the actual mask, as it
/// avoids a dependence without affecting the result.
9241 | static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, |
9242 | Value *EVL, const Twine &Name) { |
9243 | VectorType *ValTy = cast<VectorType>(Val: Operand->getType()); |
9244 | Value *AllTrueMask = |
9245 | Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue()); |
9246 | return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse, |
9247 | Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name); |
9248 | } |
9249 | |
9250 | void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
9251 | assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " |
9252 | "explicit vector length." ); |
9253 | auto *LI = cast<LoadInst>(Val: &Ingredient); |
9254 | |
9255 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
9256 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
9257 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
9258 | bool CreateGather = !isConsecutive(); |
9259 | |
9260 | auto &Builder = State.Builder; |
9261 | State.setDebugLocFrom(getDebugLoc()); |
9262 | CallInst *NewLI; |
9263 | Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0)); |
9264 | Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateGather); |
9265 | Value *Mask = nullptr; |
9266 | if (VPValue *VPMask = getMask()) { |
9267 | Mask = State.get(Def: VPMask, Part: 0); |
9268 | if (isReverse()) |
9269 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
9270 | } else { |
9271 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
9272 | } |
9273 | |
9274 | if (CreateGather) { |
9275 | NewLI = |
9276 | Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL}, |
9277 | FMFSource: nullptr, Name: "wide.masked.gather" ); |
9278 | } else { |
9279 | VectorBuilder VBuilder(Builder); |
9280 | VBuilder.setEVL(EVL).setMask(Mask); |
9281 | NewLI = cast<CallInst>(Val: VBuilder.createVectorInstruction( |
9282 | Opcode: Instruction::Load, ReturnTy: DataTy, VecOpArray: Addr, Name: "vp.op.load" )); |
9283 | } |
9284 | NewLI->addParamAttr( |
9285 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment)); |
9286 | State.addMetadata(To: NewLI, From: LI); |
9287 | Instruction *Res = NewLI; |
9288 | if (isReverse()) |
9289 | Res = createReverseEVL(Builder, Operand: Res, EVL, Name: "vp.reverse" ); |
9290 | State.set(Def: this, V: Res, Part: 0); |
9291 | } |
9292 | |
9293 | void VPWidenStoreRecipe::execute(VPTransformState &State) { |
9294 | auto *SI = cast<StoreInst>(Val: &Ingredient); |
9295 | |
9296 | VPValue *StoredVPValue = getStoredValue(); |
9297 | bool CreateScatter = !isConsecutive(); |
9298 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
9299 | |
9300 | auto &Builder = State.Builder; |
9301 | State.setDebugLocFrom(getDebugLoc()); |
9302 | |
9303 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9304 | Instruction *NewSI = nullptr; |
9305 | Value *Mask = nullptr; |
9306 | if (auto *VPMask = getMask()) { |
// Mask reversal is only needed when there is an actual mask; a null mask
// represents an all-one mask, and the reverse of an all-one mask is again
// all-one.
9309 | Mask = State.get(Def: VPMask, Part); |
9310 | if (isReverse()) |
9311 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
9312 | } |
9313 | |
9314 | Value *StoredVal = State.get(Def: StoredVPValue, Part); |
9315 | if (isReverse()) { |
9316 | // If we store to reverse consecutive memory locations, then we need |
9317 | // to reverse the order of elements in the stored value. |
9318 | StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse" ); |
9319 | // We don't want to update the value in the map as it might be used in |
9320 | // another expression. So don't call resetVectorValue(StoredVal). |
9321 | } |
9322 | Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateScatter); |
9323 | if (CreateScatter) |
9324 | NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask); |
9325 | else if (Mask) |
9326 | NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask); |
9327 | else |
9328 | NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment); |
9329 | State.addMetadata(To: NewSI, From: SI); |
9330 | } |
9331 | } |
9332 | |
9333 | void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { |
9334 | assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " |
9335 | "explicit vector length." ); |
9336 | auto *SI = cast<StoreInst>(Val: &Ingredient); |
9337 | |
9338 | VPValue *StoredValue = getStoredValue(); |
9339 | bool CreateScatter = !isConsecutive(); |
9340 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
9341 | |
9342 | auto &Builder = State.Builder; |
9343 | State.setDebugLocFrom(getDebugLoc()); |
9344 | |
9345 | CallInst *NewSI = nullptr; |
9346 | Value *StoredVal = State.get(Def: StoredValue, Part: 0); |
9347 | Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0)); |
9348 | if (isReverse()) |
9349 | StoredVal = createReverseEVL(Builder, Operand: StoredVal, EVL, Name: "vp.reverse" ); |
9350 | Value *Mask = nullptr; |
9351 | if (VPValue *VPMask = getMask()) { |
9352 | Mask = State.get(Def: VPMask, Part: 0); |
9353 | if (isReverse()) |
9354 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
9355 | } else { |
9356 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
9357 | } |
9358 | Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateScatter); |
9359 | if (CreateScatter) { |
9360 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
9361 | ID: Intrinsic::vp_scatter, |
9362 | Args: {StoredVal, Addr, Mask, EVL}); |
9363 | } else { |
9364 | VectorBuilder VBuilder(Builder); |
9365 | VBuilder.setEVL(EVL).setMask(Mask); |
9366 | NewSI = cast<CallInst>(Val: VBuilder.createVectorInstruction( |
9367 | Opcode: Instruction::Store, ReturnTy: Type::getVoidTy(C&: EVL->getContext()), |
9368 | VecOpArray: {StoredVal, Addr})); |
9369 | } |
9370 | NewSI->addParamAttr( |
9371 | ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment)); |
9372 | State.addMetadata(To: NewSI, From: SI); |
9373 | } |
9374 | |
9375 | // Determine how to lower the scalar epilogue, which depends on 1) optimising |
9376 | // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing |
9377 | // predication, and 4) a TTI hook that analyses whether the loop is suitable |
9378 | // for predication. |
9379 | static ScalarEpilogueLowering getScalarEpilogueLowering( |
9380 | Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, |
9381 | BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, |
9382 | LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { |
9383 | // 1) OptSize takes precedence over all other options, i.e. if this is set, |
9384 | // don't look at hints or options, and don't request a scalar epilogue. |
9385 | // (For PGSO, as shouldOptimizeForSize isn't currently accessible from |
9386 | // LoopAccessInfo (due to code dependency and not being able to reliably get |
9387 | // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection |
9388 | // of strides in LoopAccessInfo::analyzeLoop() and vectorize without |
9389 | // versioning when the vectorization is forced, unlike hasOptSize. So revert |
9390 | // back to the old way and vectorize with versioning when forced. See D81345.) |
9391 | if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI, |
9392 | QueryType: PGSOQueryType::IRPass) && |
9393 | Hints.getForce() != LoopVectorizeHints::FK_Enabled)) |
9394 | return CM_ScalarEpilogueNotAllowedOptSize; |
9395 | |
9396 | // 2) If set, obey the directives |
9397 | if (PreferPredicateOverEpilogue.getNumOccurrences()) { |
9398 | switch (PreferPredicateOverEpilogue) { |
9399 | case PreferPredicateTy::ScalarEpilogue: |
9400 | return CM_ScalarEpilogueAllowed; |
9401 | case PreferPredicateTy::PredicateElseScalarEpilogue: |
9402 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9403 | case PreferPredicateTy::PredicateOrDontVectorize: |
9404 | return CM_ScalarEpilogueNotAllowedUsePredicate; |
9405 | }; |
9406 | } |
9407 | |
9408 | // 3) If set, obey the hints |
9409 | switch (Hints.getPredicate()) { |
9410 | case LoopVectorizeHints::FK_Enabled: |
9411 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9412 | case LoopVectorizeHints::FK_Disabled: |
9413 | return CM_ScalarEpilogueAllowed; |
9414 | }; |
9415 | |
9416 | // 4) if the TTI hook indicates this is profitable, request predication. |
9417 | TailFoldingInfo TFI(TLI, &LVL, IAI); |
9418 | if (TTI->preferPredicateOverEpilogue(TFI: &TFI)) |
9419 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9420 | |
9421 | return CM_ScalarEpilogueAllowed; |
9422 | } |
9423 | |
9424 | // Process the loop in the VPlan-native vectorization path. This path builds |
9425 | // VPlan upfront in the vectorization pipeline, which allows to apply |
9426 | // VPlan-to-VPlan transformations from the very beginning without modifying the |
9427 | // input LLVM IR. |
9428 | static bool processLoopInVPlanNativePath( |
9429 | Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, |
9430 | LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, |
9431 | TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, |
9432 | OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, |
9433 | ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, |
9434 | LoopVectorizationRequirements &Requirements) { |
9435 | |
9436 | if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) { |
9437 | LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n" ); |
9438 | return false; |
9439 | } |
9440 | assert(EnableVPlanNativePath && "VPlan-native path is disabled." ); |
9441 | Function *F = L->getHeader()->getParent(); |
9442 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); |
9443 | |
9444 | ScalarEpilogueLowering SEL = |
9445 | getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI); |
9446 | |
9447 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, |
9448 | &Hints, IAI); |
9449 | // Use the planner for outer loop vectorization. |
9450 | // TODO: CM is not used at this point inside the planner. Turn CM into an |
9451 | // optional argument if we don't need it in the future. |
9452 | LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, |
9453 | ORE); |
9454 | |
9455 | // Get user vectorization factor. |
9456 | ElementCount UserVF = Hints.getWidth(); |
9457 | |
9458 | CM.collectElementTypesForWidening(); |
9459 | |
9460 | // Plan how to best vectorize, return the best VF and its cost. |
9461 | const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); |
9462 | |
9463 | // If we are stress testing VPlan builds, do not attempt to generate vector |
9464 | // code. Masked vector code generation support will follow soon. |
9465 | // Also, do not attempt to vectorize if no vector code will be produced. |
9466 | if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) |
9467 | return false; |
9468 | |
9469 | VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width); |
9470 | |
9471 | { |
9472 | bool AddBranchWeights = |
9473 | hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator()); |
9474 | GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, |
9475 | F->getDataLayout(), AddBranchWeights); |
9476 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, |
9477 | VF.Width, 1, LVL, &CM, BFI, PSI, Checks); |
9478 | LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" |
9479 | << L->getHeader()->getParent()->getName() << "\"\n" ); |
9480 | LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false); |
9481 | } |
9482 | |
9483 | reportVectorization(ORE, TheLoop: L, VF, IC: 1); |
9484 | |
9485 | // Mark the loop as already vectorized to avoid vectorizing again. |
9486 | Hints.setAlreadyVectorized(); |
9487 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); |
9488 | return true; |
9489 | } |
9490 | |
9491 | // Emit a remark if there are stores to floats that required a floating point |
9492 | // extension. If the vectorized loop was generated with floating point there |
9493 | // will be a performance penalty from the conversion overhead and the change in |
9494 | // the vector width. |
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9496 | SmallVector<Instruction *, 4> Worklist; |
9497 | for (BasicBlock *BB : L->getBlocks()) { |
9498 | for (Instruction &Inst : *BB) { |
9499 | if (auto *S = dyn_cast<StoreInst>(&Inst)) { |
9500 | if (S->getValueOperand()->getType()->isFloatTy()) |
9501 | Worklist.push_back(S); |
9502 | } |
9503 | } |
9504 | } |
9505 | |
9506 | // Traverse the floating point stores upwards, searching for floating point |
9507 | // conversions. |
9508 | SmallPtrSet<const Instruction *, 4> Visited; |
9509 | SmallPtrSet<const Instruction *, 4> EmittedRemark; |
9510 | while (!Worklist.empty()) { |
9511 | auto *I = Worklist.pop_back_val(); |
9512 | if (!L->contains(I)) |
9513 | continue; |
9514 | if (!Visited.insert(I).second) |
9515 | continue; |
9516 | |
9517 | // Emit a remark if the floating point store required a floating |
9518 | // point conversion. |
9519 | // TODO: More work could be done to identify the root cause such as a |
9520 | // constant or a function return type and point the user to it. |
9521 | if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) |
9522 | ORE->emit([&]() { |
9523 | return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", |
9524 | I->getDebugLoc(), L->getHeader()) |
9525 | << "floating point conversion changes vector width. " |
9526 | << "Mixed floating point precision requires an up/down " |
9527 | << "cast that will negatively impact performance."; |
9528 | }); |
9529 | |
9530 | for (Use &Op : I->operands()) |
9531 | if (auto *OpI = dyn_cast<Instruction>(Op)) |
9532 | Worklist.push_back(OpI); |
9533 | } |
9534 | } |
9535 | |
9536 | static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, |
9537 | VectorizationFactor &VF, |
9538 | std::optional<unsigned> VScale, Loop *L, |
9539 | ScalarEvolution &SE, |
9540 | ScalarEpilogueLowering SEL) { |
9541 | InstructionCost CheckCost = Checks.getCost(); |
9542 | if (!CheckCost.isValid()) |
9543 | return false; |
9544 | |
9545 | // When interleaving only, the scalar and vector cost will be equal, which in |
9546 | // turn would lead to a divide by 0. Fall back to the hard threshold. |
9547 | if (VF.Width.isScalar()) { |
9548 | if (CheckCost > VectorizeMemoryCheckThreshold) { |
9549 | LLVM_DEBUG( |
9550 | dbgs() |
9551 | << "LV: Interleaving only is not profitable due to runtime checks\n" ); |
9552 | return false; |
9553 | } |
9554 | return true; |
9555 | } |
9556 | |
9557 | // The scalar cost should only be 0 when vectorizing with a user specified |
 | // VF/IC. In those cases, runtime checks should always be generated. |
9558 | uint64_t ScalarC = *VF.ScalarCost.getValue(); |
9559 | if (ScalarC == 0) |
9560 | return true; |
9561 | |
9562 | // First, compute the minimum iteration count required so that the vector |
9563 | // loop outperforms the scalar loop. |
9564 | // The total cost of the scalar loop is |
9565 | // ScalarC * TC |
9566 | // where |
9567 | // * TC is the actual trip count of the loop. |
9568 | // * ScalarC is the cost of a single scalar iteration. |
9569 | // |
9570 | // The total cost of the vector loop is |
9571 | // RtC + VecC * (TC / VF) + EpiC |
9572 | // where |
9573 | // * RtC is the cost of the generated runtime checks |
9574 | // * VecC is the cost of a single vector iteration. |
9575 | // * TC is the actual trip count of the loop |
9576 | // * VF is the vectorization factor |
9577 | // * EpiC is the cost of the generated epilogue, including the cost |
9578 | // of the remaining scalar operations. |
9579 | // |
9580 | // Vectorization is profitable once the total vector cost is less than the |
9581 | // total scalar cost: |
9582 | // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC |
9583 | // |
9584 | // Now we can compute the minimum required trip count TC as |
9585 | // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC |
9586 | // |
9587 | // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that |
9588 | // the computations are performed on doubles, not integers and the result |
9589 | // is rounded up, hence we get an upper estimate of the TC. |
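 | // As an illustrative example (made-up costs, not from any target): with |
 | // ScalarC = 4, VecC = 10, RtC = 20 and VF = 4, the denominator below is |
 | // ScalarC * VF - VecC = 6, so MinTC1 = ceil(20 * 4 / 6) = 14 iterations are |
 | // needed before the runtime-check overhead is amortized. |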
9590 | unsigned IntVF = VF.Width.getKnownMinValue(); |
9591 | if (VF.Width.isScalable()) { |
9592 | unsigned AssumedMinimumVscale = 1; |
9593 | if (VScale) |
9594 | AssumedMinimumVscale = *VScale; |
9595 | IntVF *= AssumedMinimumVscale; |
9596 | } |
9597 | uint64_t RtC = *CheckCost.getValue(); |
9598 | uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); |
9599 | uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); |
9600 | |
9601 | // Second, compute a minimum iteration count so that the cost of the |
9602 | // runtime checks is only a fraction of the total scalar loop cost. This |
9603 | // adds a loop-dependent bound on the overhead incurred if the runtime |
9604 | // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC |
9605 | // * TC. To bound the runtime check to be a fraction 1/X of the scalar |
9606 | // cost, compute |
9607 | // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC |
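 | // With the X = 10 used below and the same illustrative costs as above |
 | // (RtC = 20, ScalarC = 4), this yields MinTC2 = ceil(20 * 10 / 4) = 50. |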
9608 | uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); |
9609 | |
9610 | // Now pick the larger minimum. If it is not a multiple of VF and a scalar |
9611 | // epilogue is allowed, choose the next closest multiple of VF. This should |
9612 | // partly compensate for ignoring the epilogue cost. |
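 | // Continuing the illustrative numbers, MinTC = max(14, 50) = 50; if a scalar |
 | // epilogue is allowed, this is aligned up to 52, the next multiple of VF = 4. |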
9613 | uint64_t MinTC = std::max(MinTC1, MinTC2); |
9614 | if (SEL == CM_ScalarEpilogueAllowed) |
9615 | MinTC = alignTo(MinTC, IntVF); |
9616 | VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); |
9617 | |
9618 | LLVM_DEBUG( |
9619 | dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" |
9620 | << VF.MinProfitableTripCount << "\n"); |
9621 | |
9622 | // Skip vectorization if the expected trip count is less than the minimum |
9623 | // required trip count. |
9624 | if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { |
9625 | if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), |
9626 | VF.MinProfitableTripCount)) { |
9627 | LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " |
9628 | "trip count < minimum profitable VF (" |
9629 | << *ExpectedTC << " < " << VF.MinProfitableTripCount |
9630 | << ")\n"); |
9631 | |
9632 | return false; |
9633 | } |
9634 | } |
9635 | return true; |
9636 | } |
9637 | |
9638 | LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) |
9639 | : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || |
9640 | !EnableLoopInterleaving), |
9641 | VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || |
9642 | !EnableLoopVectorization) {} |
9643 | |
9644 | bool LoopVectorizePass::processLoop(Loop *L) { |
9645 | assert((EnableVPlanNativePath || L->isInnermost()) && |
9646 | "VPlan-native path is not enabled. Only process inner loops." ); |
9647 | |
9648 | LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" |
9649 | << L->getHeader()->getParent()->getName() << "' from " |
9650 | << L->getLocStr() << "\n"); |
9651 | |
9652 | LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); |
9653 | |
9654 | LLVM_DEBUG( |
9655 | dbgs() << "LV: Loop hints:" |
9656 | << " force=" |
9657 | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled |
9658 | ? "disabled" |
9659 | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled |
9660 | ? "enabled" |
9661 | : "?" )) |
9662 | << " width=" << Hints.getWidth() |
9663 | << " interleave=" << Hints.getInterleave() << "\n" ); |
9664 | |
9665 | // Function containing loop |
9666 | Function *F = L->getHeader()->getParent(); |
9667 | |
9668 | // Looking at the diagnostic output is the only way to determine if a loop |
9669 | // was vectorized (other than looking at the IR or machine code), so it |
9670 | // is important to generate an optimization remark for each loop. Most of |
9671 | // these messages are generated as OptimizationRemarkAnalysis. Remarks |
9672 | // generated as OptimizationRemark and OptimizationRemarkMissed are |
9673 | // less verbose reporting vectorized loops and unvectorized loops that may |
9674 | // benefit from vectorization, respectively. |
9675 | |
9676 | if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { |
9677 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); |
9678 | return false; |
9679 | } |
9680 | |
9681 | PredicatedScalarEvolution PSE(*SE, *L); |
9682 | |
9683 | // Check if it is legal to vectorize the loop. |
9684 | LoopVectorizationRequirements Requirements; |
9685 | LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, |
9686 | &Requirements, &Hints, DB, AC, BFI, PSI); |
9687 | if (!LVL.canVectorize(EnableVPlanNativePath)) { |
9688 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); |
9689 | Hints.emitRemarkWithHints(); |
9690 | return false; |
9691 | } |
9692 | |
9693 | // Entrance to the VPlan-native vectorization path. Outer loops are processed |
9694 | // here. They may require CFG and instruction level transformations before |
9695 | // even evaluating whether vectorization is profitable. Since we cannot modify |
9696 | // the incoming IR, we need to build VPlan upfront in the vectorization |
9697 | // pipeline. |
9698 | if (!L->isInnermost()) |
9699 | return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, |
9700 | ORE, BFI, PSI, Hints, Requirements); |
9701 | |
9702 | assert(L->isInnermost() && "Inner loop expected."); |
9703 | |
9704 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); |
9705 | bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); |
9706 | |
9707 | // If an override option has been passed in for interleaved accesses, use it. |
9708 | if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) |
9709 | UseInterleaved = EnableInterleavedMemAccesses; |
9710 | |
9711 | // Analyze interleaved memory accesses. |
9712 | if (UseInterleaved) |
9713 | IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); |
9714 | |
9715 | // Check the function attributes and profiles to find out if this function |
9716 | // should be optimized for size. |
9717 | ScalarEpilogueLowering SEL = |
9718 | getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); |
9719 | |
9720 | // Check the loop for a trip count threshold: vectorize loops with a tiny trip |
9721 | // count by optimizing for size, to minimize overheads. |
9722 | auto ExpectedTC = getSmallBestKnownTC(*SE, L); |
9723 | if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { |
9724 | LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " |
9725 | << "This loop is worth vectorizing only if no scalar " |
9726 | << "iteration overheads are incurred." ); |
9727 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
9728 | LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); |
9729 | else { |
9730 | if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { |
9731 | LLVM_DEBUG(dbgs() << "\n"); |
9732 | // Predicate tail-folded loops are efficient even when the loop |
9733 | // iteration count is low. However, setting the epilogue policy to |
9734 | // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops |
9735 | // with runtime checks. It's more effective to let |
9736 | // `areRuntimeChecksProfitable` determine if vectorization is beneficial |
9737 | // for the loop. |
9738 | if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) |
9739 | SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; |
9740 | } else { |
9741 | LLVM_DEBUG(dbgs() << " But the target considers the trip count too " |
9742 | "small to consider vectorizing.\n" ); |
9743 | reportVectorizationFailure( |
9744 | DebugMsg: "The trip count is below the minial threshold value." , |
9745 | OREMsg: "loop trip count is too low, avoiding vectorization" , |
9746 | ORETag: "LowTripCount" , ORE, TheLoop: L); |
9747 | Hints.emitRemarkWithHints(); |
9748 | return false; |
9749 | } |
9750 | } |
9751 | } |
9752 | |
9753 | // Check the function attributes to see if implicit floats or vectors are |
9754 | // allowed. |
9755 | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
9756 | reportVectorizationFailure( |
9757 | "Can't vectorize when the NoImplicitFloat attribute is used", |
9758 | "loop not vectorized due to NoImplicitFloat attribute", |
9759 | "NoImplicitFloat", ORE, L); |
9760 | Hints.emitRemarkWithHints(); |
9761 | return false; |
9762 | } |
9763 | |
9764 | // Check if the target supports potentially unsafe FP vectorization. |
9765 | // FIXME: Add a check for the type of safety issue (denormal, signaling) |
9766 | // for the target we're vectorizing for, to make sure none of the |
9767 | // additional fp-math flags can help. |
9768 | if (Hints.isPotentiallyUnsafe() && |
9769 | TTI->isFPVectorizationPotentiallyUnsafe()) { |
9770 | reportVectorizationFailure( |
9771 | "Potentially unsafe FP op prevents vectorization", |
9772 | "loop not vectorized due to unsafe FP support.", |
9773 | "UnsafeFP", ORE, L); |
9774 | Hints.emitRemarkWithHints(); |
9775 | return false; |
9776 | } |
9777 | |
9778 | bool AllowOrderedReductions; |
9779 | // If the flag is set, use that instead and override the TTI behaviour. |
9780 | if (ForceOrderedReductions.getNumOccurrences() > 0) |
9781 | AllowOrderedReductions = ForceOrderedReductions; |
9782 | else |
9783 | AllowOrderedReductions = TTI->enableOrderedReductions(); |
9784 | if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { |
9785 | ORE->emit([&]() { |
9786 | auto *ExactFPMathInst = Requirements.getExactFPInst(); |
9787 | return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", |
9788 | ExactFPMathInst->getDebugLoc(), |
9789 | ExactFPMathInst->getParent()) |
9790 | << "loop not vectorized: cannot prove it is safe to reorder " |
9791 | "floating-point operations"; |
9792 | }); |
9793 | LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " |
9794 | "reorder floating-point operations\n" ); |
9795 | Hints.emitRemarkWithHints(); |
9796 | return false; |
9797 | } |
9798 | |
9799 | // Use the cost model. |
9800 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, |
9801 | F, &Hints, IAI); |
9802 | // Use the planner for vectorization. |
9803 | LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, |
9804 | ORE); |
9805 | |
9806 | // Get user vectorization factor and interleave count. |
9807 | ElementCount UserVF = Hints.getWidth(); |
9808 | unsigned UserIC = Hints.getInterleave(); |
9809 | |
9810 | // Plan how to best vectorize, return the best VF and its cost. |
9811 | std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); |
9812 | |
9813 | VectorizationFactor VF = VectorizationFactor::Disabled(); |
9814 | unsigned IC = 1; |
9815 | |
9816 | bool AddBranchWeights = |
9817 | hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); |
9818 | GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, |
9819 | F->getDataLayout(), AddBranchWeights); |
9820 | if (MaybeVF) { |
9821 | VF = *MaybeVF; |
9822 | // Select the interleave count. |
9823 | IC = CM.selectInterleaveCount(VF.Width, VF.Cost); |
9824 | |
9825 | unsigned SelectedIC = std::max(IC, UserIC); |
9826 | // Optimistically generate runtime checks if they are needed. Drop them if |
9827 | // they turn out to not be profitable. |
9828 | if (VF.Width.isVector() || SelectedIC > 1) |
9829 | Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); |
9830 | |
9831 | // Check if it is profitable to vectorize with runtime checks. |
9832 | bool ForceVectorization = |
9833 | Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
9834 | if (!ForceVectorization && |
9835 | !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, |
9836 | *PSE.getSE(), SEL)) { |
9837 | ORE->emit([&]() { |
9838 | return OptimizationRemarkAnalysisAliasing( |
9839 | DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), |
9840 | L->getHeader()) |
9841 | << "loop not vectorized: cannot prove it is safe to reorder " |
9842 | "memory operations"; |
9843 | }); |
9844 | LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); |
9845 | Hints.emitRemarkWithHints(); |
9846 | return false; |
9847 | } |
9848 | } |
9849 | |
9850 | // Identify the diagnostic messages that should be produced. |
9851 | std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; |
9852 | bool VectorizeLoop = true, InterleaveLoop = true; |
9853 | if (VF.Width.isScalar()) { |
9854 | LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); |
9855 | VecDiagMsg = std::make_pair( |
9856 | "VectorizationNotBeneficial", |
9857 | "the cost-model indicates that vectorization is not beneficial"); |
9858 | VectorizeLoop = false; |
9859 | } |
9860 | |
9861 | if (!MaybeVF && UserIC > 1) { |
9862 | // Tell the user interleaving was avoided up-front, despite being explicitly |
9863 | // requested. |
9864 | LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " |
9865 | "interleaving should be avoided up front\n" ); |
9866 | IntDiagMsg = std::make_pair( |
9867 | x: "InterleavingAvoided" , |
9868 | y: "Ignoring UserIC, because interleaving was avoided up front" ); |
9869 | InterleaveLoop = false; |
9870 | } else if (IC == 1 && UserIC <= 1) { |
9871 | // Tell the user interleaving is not beneficial. |
9872 | LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); |
9873 | IntDiagMsg = std::make_pair( |
9874 | "InterleavingNotBeneficial", |
9875 | "the cost-model indicates that interleaving is not beneficial"); |
9876 | InterleaveLoop = false; |
9877 | if (UserIC == 1) { |
9878 | IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; |
9879 | IntDiagMsg.second += |
9880 | " and is explicitly disabled or interleave count is set to 1"; |
9881 | } |
9882 | } else if (IC > 1 && UserIC == 1) { |
9883 | // Tell the user interleaving is beneficial, but it is explicitly disabled. |
9884 | LLVM_DEBUG( |
9885 | dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); |
9886 | IntDiagMsg = std::make_pair( |
9887 | "InterleavingBeneficialButDisabled", |
9888 | "the cost-model indicates that interleaving is beneficial " |
9889 | "but is explicitly disabled or interleave count is set to 1"); |
9890 | InterleaveLoop = false; |
9891 | } |
9892 | |
9893 | // Override IC if user provided an interleave count. |
9894 | IC = UserIC > 0 ? UserIC : IC; |
9895 | |
9896 | // Emit diagnostic messages, if any. |
9897 | const char *VAPassName = Hints.vectorizeAnalysisPassName(); |
9898 | if (!VectorizeLoop && !InterleaveLoop) { |
9899 | // Do not vectorize or interleave the loop. |
9900 | ORE->emit([&]() { |
9901 | return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, |
9902 | L->getStartLoc(), L->getHeader()) |
9903 | << VecDiagMsg.second; |
9904 | }); |
9905 | ORE->emit([&]() { |
9906 | return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, |
9907 | L->getStartLoc(), L->getHeader()) |
9908 | << IntDiagMsg.second; |
9909 | }); |
9910 | return false; |
9911 | } else if (!VectorizeLoop && InterleaveLoop) { |
9912 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
9913 | ORE->emit([&]() { |
9914 | return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, |
9915 | L->getStartLoc(), L->getHeader()) |
9916 | << VecDiagMsg.second; |
9917 | }); |
9918 | } else if (VectorizeLoop && !InterleaveLoop) { |
9919 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
9920 | << ") in " << L->getLocStr() << '\n'); |
9921 | ORE->emit([&]() { |
9922 | return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, |
9923 | L->getStartLoc(), L->getHeader()) |
9924 | << IntDiagMsg.second; |
9925 | }); |
9926 | } else if (VectorizeLoop && InterleaveLoop) { |
9927 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
9928 | << ") in " << L->getLocStr() << '\n'); |
9929 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
9930 | } |
9931 | |
9932 | bool DisableRuntimeUnroll = false; |
9933 | MDNode *OrigLoopID = L->getLoopID(); |
9934 | { |
9935 | using namespace ore; |
9936 | if (!VectorizeLoop) { |
9937 | assert(IC > 1 && "interleave count should not be 1 or 0"); |
9938 | // If we decided that it is not legal to vectorize the loop, then |
9939 | // interleave it. |
9940 | InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, |
9941 | &CM, BFI, PSI, Checks); |
9942 | |
9943 | VPlan &BestPlan = |
9944 | UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan(); |
9945 | assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) && |
9946 | "VPlan cost model and legacy cost model disagreed"); |
9947 | LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); |
9948 | |
9949 | ORE->emit([&]() { |
9950 | return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), |
9951 | L->getHeader()) |
9952 | << "interleaved loop (interleaved count: " |
9953 | << NV("InterleaveCount", IC) << ")"; |
9954 | }); |
9955 | } else { |
9956 | // If we decided that it is *legal* to vectorize the loop, then do it. |
9957 | |
9958 | // Consider vectorizing the epilogue too if it's profitable. |
9959 | VectorizationFactor EpilogueVF = |
9960 | LVP.selectEpilogueVectorizationFactor(VF.Width, IC); |
9961 | if (EpilogueVF.Width.isVector()) { |
9962 | |
9963 | // The first pass vectorizes the main loop and creates a scalar epilogue |
9964 | // to be vectorized by executing the plan (potentially with a different |
9965 | // factor) again shortly afterwards. |
9966 | EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); |
9967 | EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, |
9968 | EPI, &LVL, &CM, BFI, PSI, Checks); |
9969 | |
9970 | std::unique_ptr<VPlan> BestMainPlan( |
9971 | LVP.getBestPlanFor(EPI.MainLoopVF).duplicate()); |
9972 | const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( |
9973 | EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true); |
9974 | ++LoopsVectorized; |
9975 | |
9976 | // Second pass vectorizes the epilogue and adjusts the control flow |
9977 | // edges from the first pass. |
9978 | EPI.MainLoopVF = EPI.EpilogueVF; |
9979 | EPI.MainLoopUF = EPI.EpilogueUF; |
9980 | EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, |
9981 | ORE, EPI, &LVL, &CM, BFI, PSI, |
9982 | Checks); |
9983 | |
9984 | VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); |
9985 | VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); |
9986 | VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); |
9987 | Header->setName("vec.epilog.vector.body"); |
9988 | |
9989 | // Re-use the trip count and steps expanded for the main loop, as |
9990 | // skeleton creation needs it as a value that dominates both the scalar |
9991 | // and vector epilogue loops |
9992 | // TODO: This is a workaround needed for epilogue vectorization and it |
9993 | // should be removed once induction resume value creation is done |
9994 | // directly in VPlan. |
9995 | EpilogILV.setTripCount(MainILV.getTripCount()); |
9996 | for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { |
9997 | auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); |
9998 | auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn( |
9999 | ExpandedSCEVs.find(ExpandR->getSCEV())->second); |
10000 | ExpandR->replaceAllUsesWith(ExpandedVal); |
10001 | if (BestEpiPlan.getTripCount() == ExpandR) |
10002 | BestEpiPlan.resetTripCount(ExpandedVal); |
10003 | ExpandR->eraseFromParent(); |
10004 | } |
10005 | |
10006 | // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, |
10007 | // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated |
10008 | // before vectorizing the epilogue loop. |
10009 | for (VPRecipeBase &R : Header->phis()) { |
10010 | if (isa<VPCanonicalIVPHIRecipe>(&R)) |
10011 | continue; |
10012 | |
10013 | Value *ResumeV = nullptr; |
10014 | // TODO: Move setting of resume values to prepareToExecute. |
10015 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { |
10016 | const RecurrenceDescriptor &RdxDesc = |
10017 | ReductionPhi->getRecurrenceDescriptor(); |
10018 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
10019 | ResumeV = ReductionResumeValues.find(&RdxDesc)->second; |
10020 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { |
10021 | // VPReductionPHIRecipes for AnyOf reductions expect a boolean as |
10022 | // start value; compare the final value from the main vector loop |
10023 | // to the start value. |
10024 | IRBuilder<> Builder( |
10025 | cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); |
10026 | ResumeV = Builder.CreateICmpNE(ResumeV, |
10027 | RdxDesc.getRecurrenceStartValue()); |
10028 | } |
10029 | } else { |
10030 | // Create induction resume values for both widened pointer and |
10031 | // integer/fp inductions and update the start value of the induction |
10032 | // recipes to use the resume value. |
10033 | PHINode *IndPhi = nullptr; |
10034 | const InductionDescriptor *ID; |
10035 | if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { |
10036 | IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); |
10037 | ID = &Ind->getInductionDescriptor(); |
10038 | } else { |
10039 | auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); |
10040 | IndPhi = WidenInd->getPHINode(); |
10041 | ID = &WidenInd->getInductionDescriptor(); |
10042 | } |
10043 | |
10044 | ResumeV = MainILV.createInductionResumeValue( |
10045 | IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), |
10046 | {EPI.MainLoopIterationCountCheck}); |
10047 | } |
10048 | assert(ResumeV && "Must have a resume value"); |
10049 | VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV); |
10050 | cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); |
10051 | } |
10052 | |
10053 | assert(DT->verify(DominatorTree::VerificationLevel::Fast) && |
10054 | "DT not preserved correctly" ); |
10055 | LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, |
10056 | DT, IsEpilogueVectorization: true, ExpandedSCEVs: &ExpandedSCEVs); |
10057 | ++LoopsEpilogueVectorized; |
10058 | |
10059 | if (!MainILV.areSafetyChecksAdded()) |
10060 | DisableRuntimeUnroll = true; |
10061 | } else { |
10062 | ElementCount Width = VF.Width; |
10063 | VPlan &BestPlan = |
10064 | UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan(); |
10065 | if (!UseLegacyCostModel) { |
10066 | assert(size(BestPlan.vectorFactors()) == 1 && |
10067 | "Plan should have a single VF"); |
10068 | Width = *BestPlan.vectorFactors().begin(); |
10069 | LLVM_DEBUG(dbgs() |
10070 | << "VF picked by VPlan cost model: " << Width << "\n"); |
10071 | assert(VF.Width == Width && |
10072 | "VPlan cost model and legacy cost model disagreed"); |
10073 | } |
10074 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width, |
10075 | VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, |
10076 | PSI, Checks); |
10077 | LVP.executePlan(Width, IC, BestPlan, LB, DT, false); |
10078 | ++LoopsVectorized; |
10079 | |
10080 | // Add metadata to disable runtime unrolling a scalar loop when there |
10081 | // are no runtime checks about strides and memory. A scalar loop that is |
10082 | // rarely used is not worth unrolling. |
10083 | if (!LB.areSafetyChecksAdded()) |
10084 | DisableRuntimeUnroll = true; |
10085 | } |
10086 | // Report the vectorization decision. |
10087 | reportVectorization(ORE, L, VF, IC); |
10088 | } |
10089 | |
10090 | if (ORE->allowExtraAnalysis(LV_NAME)) |
10091 | checkMixedPrecision(L, ORE); |
10092 | } |
10093 | |
10094 | std::optional<MDNode *> RemainderLoopID = |
10095 | makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, |
10096 | LLVMLoopVectorizeFollowupEpilogue}); |
10097 | if (RemainderLoopID) { |
10098 | L->setLoopID(*RemainderLoopID); |
10099 | } else { |
10100 | if (DisableRuntimeUnroll) |
10101 | AddRuntimeUnrollDisableMetaData(L); |
10102 | |
10103 | // Mark the loop as already vectorized to avoid vectorizing again. |
10104 | Hints.setAlreadyVectorized(); |
10105 | } |
10106 | |
10107 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); |
10108 | return true; |
10109 | } |
10110 | |
10111 | LoopVectorizeResult LoopVectorizePass::runImpl( |
10112 | Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, |
10113 | DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, |
10114 | DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, |
10115 | OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { |
10116 | SE = &SE_; |
10117 | LI = &LI_; |
10118 | TTI = &TTI_; |
10119 | DT = &DT_; |
10120 | BFI = BFI_; |
10121 | TLI = TLI_; |
10122 | AC = &AC_; |
10123 | LAIs = &LAIs_; |
10124 | DB = &DB_; |
10125 | ORE = &ORE_; |
10126 | PSI = PSI_; |
10127 | |
10128 | // Don't attempt if |
10129 | // 1. the target claims to have no vector registers, and |
10130 | // 2. interleaving won't help ILP. |
10131 | // |
10132 | // The second condition is necessary because, even if the target has no |
10133 | // vector registers, loop vectorization may still enable scalar |
10134 | // interleaving. |
10135 | if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && |
10136 | TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) |
10137 | return LoopVectorizeResult(false, false); |
10138 | |
10139 | bool Changed = false, CFGChanged = false; |
10140 | |
10141 | // The vectorizer requires loops to be in simplified form. |
10142 | // Since simplification may add new inner loops, it has to run before the |
10143 | // legality and profitability checks. This means running the loop vectorizer |
10144 | // will simplify all loops, regardless of whether anything ends up being |
10145 | // vectorized. |
10146 | for (const auto &L : *LI) |
10147 | Changed |= CFGChanged |= |
10148 | simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); |
10149 | |
10150 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
10151 | // the act of vectorizing or partially unrolling a loop creates new loops |
10152 | // and can invalidate iterators across the loops. |
10153 | SmallVector<Loop *, 8> Worklist; |
10154 | |
10155 | for (Loop *L : *LI) |
10156 | collectSupportedLoops(*L, LI, ORE, Worklist); |
10157 | |
10158 | LoopsAnalyzed += Worklist.size(); |
10159 | |
10160 | // Now walk the identified inner loops. |
10161 | while (!Worklist.empty()) { |
10162 | Loop *L = Worklist.pop_back_val(); |
10163 | |
10164 | // For the inner loops we actually process, form LCSSA to simplify the |
10165 | // transform. |
10166 | Changed |= formLCSSARecursively(*L, *DT, LI, SE); |
10167 | |
10168 | Changed |= CFGChanged |= processLoop(L); |
10169 | |
10170 | if (Changed) { |
10171 | LAIs->clear(); |
10172 | |
10173 | #ifndef NDEBUG |
10174 | if (VerifySCEV) |
10175 | SE->verify(); |
10176 | #endif |
10177 | } |
10178 | } |
10179 | |
10180 | // Process each loop nest in the function. |
10181 | return LoopVectorizeResult(Changed, CFGChanged); |
10182 | } |
10183 | |
10184 | PreservedAnalyses LoopVectorizePass::run(Function &F, |
10185 | FunctionAnalysisManager &AM) { |
10186 | auto &LI = AM.getResult<LoopAnalysis>(F); |
10187 | // There are no loops in the function. Return before computing other expensive |
10188 | // analyses. |
10189 | if (LI.empty()) |
10190 | return PreservedAnalyses::all(); |
10191 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); |
10192 | auto &TTI = AM.getResult<TargetIRAnalysis>(F); |
10193 | auto &DT = AM.getResult<DominatorTreeAnalysis>(F); |
10194 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); |
10195 | auto &AC = AM.getResult<AssumptionAnalysis>(F); |
10196 | auto &DB = AM.getResult<DemandedBitsAnalysis>(F); |
10197 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); |
10198 | |
10199 | LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F); |
10200 | auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); |
10201 | ProfileSummaryInfo *PSI = |
10202 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); |
10203 | BlockFrequencyInfo *BFI = nullptr; |
10204 | if (PSI && PSI->hasProfileSummary()) |
10205 | BFI = &AM.getResult<BlockFrequencyAnalysis>(F); |
10206 | LoopVectorizeResult Result = |
10207 | runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); |
10208 | if (!Result.MadeAnyChange) |
10209 | return PreservedAnalyses::all(); |
10210 | PreservedAnalyses PA; |
10211 | |
10212 | if (isAssignmentTrackingEnabled(*F.getParent())) { |
10213 | for (auto &BB : F) |
10214 | RemoveRedundantDbgInstrs(&BB); |
10215 | } |
10216 | |
10217 | PA.preserve<LoopAnalysis>(); |
10218 | PA.preserve<DominatorTreeAnalysis>(); |
10219 | PA.preserve<ScalarEvolutionAnalysis>(); |
10220 | PA.preserve<LoopAccessAnalysis>(); |
10221 | |
10222 | if (Result.MadeCFGChange) { |
10223 | // Making CFG changes likely means a loop got vectorized. Indicate that |
10224 | // extra simplification passes should be run. |
10225 | // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only |
10226 | // be run if runtime checks have been added. |
10227 | AM.getResult<ShouldRunExtraVectorPasses>(F); |
10228 | PA.preserve<ShouldRunExtraVectorPasses>(); |
10229 | } else { |
10230 | PA.preserveSet<CFGAnalyses>(); |
10231 | } |
10232 | return PA; |
10233 | } |
10234 | |
10235 | void LoopVectorizePass::printPipeline( |
10236 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
10237 | static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( |
10238 | OS, MapClassName2PassName); |
10239 | |
10240 | OS << '<'; |
10241 | OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; |
10242 | OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; |
10243 | OS << '>'; |
10244 | } |
10245 | |