1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
55#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/PointerIntPair.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
62#include "llvm/ADT/SmallBitVector.h"
63#include "llvm/ADT/SmallPtrSet.h"
64#include "llvm/ADT/SmallSet.h"
65#include "llvm/ADT/SmallVector.h"
66#include "llvm/ADT/Statistic.h"
67#include "llvm/ADT/iterator_range.h"
68#include "llvm/Analysis/AssumptionCache.h"
69#include "llvm/Analysis/DomTreeUpdater.h"
70#include "llvm/Analysis/IVUsers.h"
71#include "llvm/Analysis/LoopAnalysisManager.h"
72#include "llvm/Analysis/LoopInfo.h"
73#include "llvm/Analysis/LoopPass.h"
74#include "llvm/Analysis/MemorySSA.h"
75#include "llvm/Analysis/MemorySSAUpdater.h"
76#include "llvm/Analysis/ScalarEvolution.h"
77#include "llvm/Analysis/ScalarEvolutionExpressions.h"
78#include "llvm/Analysis/ScalarEvolutionNormalization.h"
79#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
80#include "llvm/Analysis/TargetLibraryInfo.h"
81#include "llvm/Analysis/TargetTransformInfo.h"
82#include "llvm/Analysis/ValueTracking.h"
83#include "llvm/BinaryFormat/Dwarf.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
87#include "llvm/IR/DebugInfoMetadata.h"
88#include "llvm/IR/DerivedTypes.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
94#include "llvm/IR/Instructions.h"
95#include "llvm/IR/IntrinsicInst.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
103#include "llvm/InitializePasses.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
106#include "llvm/Support/CommandLine.h"
107#include "llvm/Support/Compiler.h"
108#include "llvm/Support/Debug.h"
109#include "llvm/Support/ErrorHandling.h"
110#include "llvm/Support/MathExtras.h"
111#include "llvm/Support/raw_ostream.h"
112#include "llvm/Transforms/Scalar.h"
113#include "llvm/Transforms/Utils.h"
114#include "llvm/Transforms/Utils/BasicBlockUtils.h"
115#include "llvm/Transforms/Utils/Local.h"
116#include "llvm/Transforms/Utils/LoopUtils.h"
117#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
/// Arbitrary early bail-out threshold on the number of IV users. It is far
/// beyond the number of users that LSR can conceivably solve, so it should not
/// affect generated code; it only catches the worst cases before LSR burns too
/// much compile time and stack space.
static constexpr unsigned MaxIVUsers = 200;

/// Upper bound on the size of an expression that SCEV-based salvaging will
/// attempt to translate into a DIExpression. The bound is chosen so that
/// debuginfo is not excessively increased and the salvaging is not too
/// expensive for the compiler.
static constexpr unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Cleanup congruent phis after LSR phi expansion.
147static cl::opt<bool> EnablePhiElim(
148 "enable-lsr-phielim", cl::Hidden, cl::init(Val: true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
152static cl::opt<bool> InsnsCost(
153 "lsr-insns-cost", cl::Hidden, cl::init(Val: true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
157static cl::opt<bool> LSRExpNarrow(
158 "lsr-exp-narrow", cl::Hidden, cl::init(Val: false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
164static cl::opt<bool> FilterSameScaledReg(
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(Val: true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
169static cl::opt<TTI::AddressingModeKind> PreferredAddresingMode(
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(Val: TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
172 cl::values(clEnumValN(TTI::AMK_None,
173 "none",
174 "Don't prefer any addressing mode"),
175 clEnumValN(TTI::AMK_PreIndexed,
176 "preindexed",
177 "Prefer pre-indexed addressing mode"),
178 clEnumValN(TTI::AMK_PostIndexed,
179 "postindexed",
180 "Prefer post-indexed addressing mode")));
181
182static cl::opt<unsigned> ComplexityLimit(
183 "lsr-complexity-limit", cl::Hidden,
184 cl::init(Val: std::numeric_limits<uint16_t>::max()),
185 cl::desc("LSR search space complexity limit"));
186
187static cl::opt<unsigned> SetupCostDepthLimit(
188 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(Val: 7),
189 cl::desc("The limit on recursion depth for LSRs setup cost"));
190
191static cl::opt<cl::boolOrDefault> AllowDropSolutionIfLessProfitable(
192 "lsr-drop-solution", cl::Hidden,
193 cl::desc("Attempt to drop solution if it is less profitable"));
194
195static cl::opt<bool> EnableVScaleImmediates(
196 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(Val: true),
197 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
198
199static cl::opt<bool> DropScaledForVScale(
200 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(Val: true),
201 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
202
203#ifndef NDEBUG
204// Stress test IV chain generation.
205static cl::opt<bool> StressIVChain(
206 "stress-ivchain", cl::Hidden, cl::init(false),
207 cl::desc("Stress test LSR IV chains"));
208#else
209static bool StressIVChain = false;
210#endif
211
212namespace {
213
214struct MemAccessTy {
215 /// Used in situations where the accessed memory type is unknown.
216 static const unsigned UnknownAddressSpace =
217 std::numeric_limits<unsigned>::max();
218
219 Type *MemTy = nullptr;
220 unsigned AddrSpace = UnknownAddressSpace;
221
222 MemAccessTy() = default;
223 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
224
225 bool operator==(MemAccessTy Other) const {
226 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
227 }
228
229 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
230
231 static MemAccessTy getUnknown(LLVMContext &Ctx,
232 unsigned AS = UnknownAddressSpace) {
233 return MemAccessTy(Type::getVoidTy(C&: Ctx), AS);
234 }
235
236 Type *getType() { return MemTy; }
237};
238
239/// This class holds data which is used to order reuse candidates.
240class RegSortData {
241public:
242 /// This represents the set of LSRUse indices which reference
243 /// a particular register.
244 SmallBitVector UsedByIndices;
245
246 void print(raw_ostream &OS) const;
247 void dump() const;
248};
249
250// An offset from an address that is either scalable or fixed. Used for
251// per-target optimizations of addressing modes.
252class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
253 constexpr Immediate(ScalarTy MinVal, bool Scalable)
254 : FixedOrScalableQuantity(MinVal, Scalable) {}
255
256 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
257 : FixedOrScalableQuantity(V) {}
258
259public:
260 constexpr Immediate() = delete;
261
262 static constexpr Immediate getFixed(ScalarTy MinVal) {
263 return {MinVal, false};
264 }
265 static constexpr Immediate getScalable(ScalarTy MinVal) {
266 return {MinVal, true};
267 }
268 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
269 return {MinVal, Scalable};
270 }
271 static constexpr Immediate getZero() { return {0, false}; }
272 static constexpr Immediate getFixedMin() {
273 return {std::numeric_limits<int64_t>::min(), false};
274 }
275 static constexpr Immediate getFixedMax() {
276 return {std::numeric_limits<int64_t>::max(), false};
277 }
278 static constexpr Immediate getScalableMin() {
279 return {std::numeric_limits<int64_t>::min(), true};
280 }
281 static constexpr Immediate getScalableMax() {
282 return {std::numeric_limits<int64_t>::max(), true};
283 }
284
285 constexpr bool isLessThanZero() const { return Quantity < 0; }
286
287 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
288
289 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
290 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
291 }
292
293 constexpr bool isMin() const {
294 return Quantity == std::numeric_limits<ScalarTy>::min();
295 }
296
297 constexpr bool isMax() const {
298 return Quantity == std::numeric_limits<ScalarTy>::max();
299 }
300
301 // Arithmetic 'operators' that cast to unsigned types first.
302 constexpr Immediate addUnsigned(const Immediate &RHS) const {
303 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
304 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
305 return {Value, Scalable || RHS.isScalable()};
306 }
307
308 constexpr Immediate subUnsigned(const Immediate &RHS) const {
309 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
310 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
311 return {Value, Scalable || RHS.isScalable()};
312 }
313
314 // Scale the quantity by a constant without caring about runtime scalability.
315 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
316 ScalarTy Value = (uint64_t)Quantity * RHS;
317 return {Value, Scalable};
318 }
319
320 // Helpers for generating SCEVs with vscale terms where needed.
321 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
322 const SCEV *S = SE.getConstant(Ty, V: Quantity);
323 if (Scalable)
324 S = SE.getMulExpr(LHS: S, RHS: SE.getVScale(Ty: S->getType()));
325 return S;
326 }
327
328 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
329 const SCEV *NegS = SE.getConstant(Ty, V: -(uint64_t)Quantity);
330 if (Scalable)
331 NegS = SE.getMulExpr(LHS: NegS, RHS: SE.getVScale(Ty: NegS->getType()));
332 return NegS;
333 }
334
335 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
336 const SCEV *SU = SE.getUnknown(V: ConstantInt::getSigned(Ty, V: Quantity));
337 if (Scalable)
338 SU = SE.getMulExpr(LHS: SU, RHS: SE.getVScale(Ty: SU->getType()));
339 return SU;
340 }
341};
342
343// This is needed for the Compare type of std::map when Immediate is used
344// as a key. We don't need it to be fully correct against any value of vscale,
345// just to make sure that vscale-related terms in the map are considered against
346// each other rather than being mixed up and potentially missing opportunities.
347struct KeyOrderTargetImmediate {
348 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
349 if (LHS.isScalable() && !RHS.isScalable())
350 return false;
351 if (!LHS.isScalable() && RHS.isScalable())
352 return true;
353 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
354 }
355};
356
357// This would be nicer if we could be generic instead of directly using size_t,
358// but there doesn't seem to be a type trait for is_orderable or
359// is_lessthan_comparable or similar.
360struct KeyOrderSizeTAndImmediate {
361 bool operator()(const std::pair<size_t, Immediate> &LHS,
362 const std::pair<size_t, Immediate> &RHS) const {
363 size_t LSize = LHS.first;
364 size_t RSize = RHS.first;
365 if (LSize != RSize)
366 return LSize < RSize;
367 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
368 }
369};
370} // end anonymous namespace
371
372#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
373void RegSortData::print(raw_ostream &OS) const {
374 OS << "[NumUses=" << UsedByIndices.count() << ']';
375}
376
377LLVM_DUMP_METHOD void RegSortData::dump() const {
378 print(errs()); errs() << '\n';
379}
380#endif
381
382namespace {
383
384/// Map register candidates to information about how they are used.
385class RegUseTracker {
386 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
387
388 RegUsesTy RegUsesMap;
389 SmallVector<const SCEV *, 16> RegSequence;
390
391public:
392 void countRegister(const SCEV *Reg, size_t LUIdx);
393 void dropRegister(const SCEV *Reg, size_t LUIdx);
394 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
395
396 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
397
398 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
399
400 void clear();
401
402 using iterator = SmallVectorImpl<const SCEV *>::iterator;
403 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
404
405 iterator begin() { return RegSequence.begin(); }
406 iterator end() { return RegSequence.end(); }
407 const_iterator begin() const { return RegSequence.begin(); }
408 const_iterator end() const { return RegSequence.end(); }
409};
410
411} // end anonymous namespace
412
413void
414RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
415 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Key: Reg);
416 RegSortData &RSD = Pair.first->second;
417 if (Pair.second)
418 RegSequence.push_back(Elt: Reg);
419 RSD.UsedByIndices.resize(N: std::max(a: RSD.UsedByIndices.size(), b: LUIdx + 1));
420 RSD.UsedByIndices.set(LUIdx);
421}
422
423void
424RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
425 RegUsesTy::iterator It = RegUsesMap.find(Val: Reg);
426 assert(It != RegUsesMap.end());
427 RegSortData &RSD = It->second;
428 assert(RSD.UsedByIndices.size() > LUIdx);
429 RSD.UsedByIndices.reset(Idx: LUIdx);
430}
431
432void
433RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
434 assert(LUIdx <= LastLUIdx);
435
436 // Update RegUses. The data structure is not optimized for this purpose;
437 // we must iterate through it and update each of the bit vectors.
438 for (auto &Pair : RegUsesMap) {
439 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
440 if (LUIdx < UsedByIndices.size())
441 UsedByIndices[LUIdx] =
442 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
443 UsedByIndices.resize(N: std::min(a: UsedByIndices.size(), b: LastLUIdx));
444 }
445}
446
447bool
448RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
449 RegUsesTy::const_iterator I = RegUsesMap.find(Val: Reg);
450 if (I == RegUsesMap.end())
451 return false;
452 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
453 int i = UsedByIndices.find_first();
454 if (i == -1) return false;
455 if ((size_t)i != LUIdx) return true;
456 return UsedByIndices.find_next(Prev: i) != -1;
457}
458
459const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
460 RegUsesTy::const_iterator I = RegUsesMap.find(Val: Reg);
461 assert(I != RegUsesMap.end() && "Unknown register!");
462 return I->second.UsedByIndices;
463}
464
465void RegUseTracker::clear() {
466 RegUsesMap.clear();
467 RegSequence.clear();
468}
469
470namespace {
471
472/// This class holds information that describes a formula for computing
473/// satisfying a use. It may include broken-out immediates and scaled registers.
474struct Formula {
475 /// Global base address used for complex addressing.
476 GlobalValue *BaseGV = nullptr;
477
478 /// Base offset for complex addressing.
479 Immediate BaseOffset = Immediate::getZero();
480
481 /// Whether any complex addressing has a base register.
482 bool HasBaseReg = false;
483
484 /// The scale of any complex addressing.
485 int64_t Scale = 0;
486
487 /// The list of "base" registers for this use. When this is non-empty. The
488 /// canonical representation of a formula is
489 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
490 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
491 /// 3. The reg containing recurrent expr related with currect loop in the
492 /// formula should be put in the ScaledReg.
493 /// #1 enforces that the scaled register is always used when at least two
494 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
495 /// #2 enforces that 1 * reg is reg.
496 /// #3 ensures invariant regs with respect to current loop can be combined
497 /// together in LSR codegen.
498 /// This invariant can be temporarily broken while building a formula.
499 /// However, every formula inserted into the LSRInstance must be in canonical
500 /// form.
501 SmallVector<const SCEV *, 4> BaseRegs;
502
503 /// The 'scaled' register for this use. This should be non-null when Scale is
504 /// not zero.
505 const SCEV *ScaledReg = nullptr;
506
507 /// An additional constant offset which added near the use. This requires a
508 /// temporary register, but the offset itself can live in an add immediate
509 /// field rather than a register.
510 Immediate UnfoldedOffset = Immediate::getZero();
511
512 Formula() = default;
513
514 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
515
516 bool isCanonical(const Loop &L) const;
517
518 void canonicalize(const Loop &L);
519
520 bool unscale();
521
522 bool hasZeroEnd() const;
523
524 size_t getNumRegs() const;
525 Type *getType() const;
526
527 void deleteBaseReg(const SCEV *&S);
528
529 bool referencesReg(const SCEV *S) const;
530 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
531 const RegUseTracker &RegUses) const;
532
533 void print(raw_ostream &OS) const;
534 void dump() const;
535};
536
537} // end anonymous namespace
538
539/// Recursion helper for initialMatch.
540static void DoInitialMatch(const SCEV *S, Loop *L,
541 SmallVectorImpl<const SCEV *> &Good,
542 SmallVectorImpl<const SCEV *> &Bad,
543 ScalarEvolution &SE) {
544 // Collect expressions which properly dominate the loop header.
545 if (SE.properlyDominates(S, BB: L->getHeader())) {
546 Good.push_back(Elt: S);
547 return;
548 }
549
550 // Look at add operands.
551 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Val: S)) {
552 for (const SCEV *S : Add->operands())
553 DoInitialMatch(S, L, Good, Bad, SE);
554 return;
555 }
556
557 // Look at addrec operands.
558 const SCEV *Start, *Step;
559 const Loop *ARLoop;
560 if (match(S,
561 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: Start), Op1: m_SCEV(V&: Step), L: m_Loop(L&: ARLoop))) &&
562 !Start->isZero()) {
563 DoInitialMatch(S: Start, L, Good, Bad, SE);
564 DoInitialMatch(S: SE.getAddRecExpr(Start: SE.getConstant(Ty: S->getType(), V: 0), Step,
565 // FIXME: AR->getNoWrapFlags()
566 L: ARLoop, Flags: SCEV::FlagAnyWrap),
567 L, Good, Bad, SE);
568 return;
569 }
570
571 // Handle a multiplication by -1 (negation) if it didn't fold.
572 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Val: S))
573 if (Mul->getOperand(i: 0)->isAllOnesValue()) {
574 SmallVector<const SCEV *, 4> Ops(drop_begin(RangeOrContainer: Mul->operands()));
575 const SCEV *NewMul = SE.getMulExpr(Ops);
576
577 SmallVector<const SCEV *, 4> MyGood;
578 SmallVector<const SCEV *, 4> MyBad;
579 DoInitialMatch(S: NewMul, L, Good&: MyGood, Bad&: MyBad, SE);
580 const SCEV *NegOne = SE.getSCEV(V: ConstantInt::getAllOnesValue(
581 Ty: SE.getEffectiveSCEVType(Ty: NewMul->getType())));
582 for (const SCEV *S : MyGood)
583 Good.push_back(Elt: SE.getMulExpr(LHS: NegOne, RHS: S));
584 for (const SCEV *S : MyBad)
585 Bad.push_back(Elt: SE.getMulExpr(LHS: NegOne, RHS: S));
586 return;
587 }
588
589 // Ok, we can't do anything interesting. Just stuff the whole thing into a
590 // register and hope for the best.
591 Bad.push_back(Elt: S);
592}
593
594/// Incorporate loop-variant parts of S into this Formula, attempting to keep
595/// all loop-invariant and loop-computable values in a single base register.
596void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
597 SmallVector<const SCEV *, 4> Good;
598 SmallVector<const SCEV *, 4> Bad;
599 DoInitialMatch(S, L, Good, Bad, SE);
600 if (!Good.empty()) {
601 const SCEV *Sum = SE.getAddExpr(Ops&: Good);
602 if (!Sum->isZero())
603 BaseRegs.push_back(Elt: Sum);
604 HasBaseReg = true;
605 }
606 if (!Bad.empty()) {
607 const SCEV *Sum = SE.getAddExpr(Ops&: Bad);
608 if (!Sum->isZero())
609 BaseRegs.push_back(Elt: Sum);
610 HasBaseReg = true;
611 }
612 canonicalize(L: *L);
613}
614
615static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
616 return SCEVExprContains(Root: S, Pred: [&L](const SCEV *S) {
617 return isa<SCEVAddRecExpr>(Val: S) && (cast<SCEVAddRecExpr>(Val: S)->getLoop() == &L);
618 });
619}
620
621/// Check whether or not this formula satisfies the canonical
622/// representation.
623/// \see Formula::BaseRegs.
624bool Formula::isCanonical(const Loop &L) const {
625 assert((Scale == 0 || ScaledReg) &&
626 "ScaledReg must be non-null if Scale is non-zero");
627
628 if (!ScaledReg)
629 return BaseRegs.size() <= 1;
630
631 if (Scale != 1)
632 return true;
633
634 if (Scale == 1 && BaseRegs.empty())
635 return false;
636
637 if (containsAddRecDependentOnLoop(S: ScaledReg, L))
638 return true;
639
640 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
641 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
642 // loop, we want to swap the reg in BaseRegs with ScaledReg.
643 return none_of(Range: BaseRegs, P: [&L](const SCEV *S) {
644 return containsAddRecDependentOnLoop(S, L);
645 });
646}
647
648/// Helper method to morph a formula into its canonical representation.
649/// \see Formula::BaseRegs.
650/// Every formula having more than one base register, must use the ScaledReg
651/// field. Otherwise, we would have to do special cases everywhere in LSR
652/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
653/// On the other hand, 1*reg should be canonicalized into reg.
654void Formula::canonicalize(const Loop &L) {
655 if (isCanonical(L))
656 return;
657
658 if (BaseRegs.empty()) {
659 // No base reg? Use scale reg with scale = 1 as such.
660 assert(ScaledReg && "Expected 1*reg => reg");
661 assert(Scale == 1 && "Expected 1*reg => reg");
662 BaseRegs.push_back(Elt: ScaledReg);
663 Scale = 0;
664 ScaledReg = nullptr;
665 return;
666 }
667
668 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
669 if (!ScaledReg) {
670 ScaledReg = BaseRegs.pop_back_val();
671 Scale = 1;
672 }
673
674 // If ScaledReg is an invariant with respect to L, find the reg from
675 // BaseRegs containing the recurrent expr related with Loop L. Swap the
676 // reg with ScaledReg.
677 if (!containsAddRecDependentOnLoop(S: ScaledReg, L)) {
678 auto I = find_if(Range&: BaseRegs, P: [&L](const SCEV *S) {
679 return containsAddRecDependentOnLoop(S, L);
680 });
681 if (I != BaseRegs.end())
682 std::swap(a&: ScaledReg, b&: *I);
683 }
684 assert(isCanonical(L) && "Failed to canonicalize?");
685}
686
687/// Get rid of the scale in the formula.
688/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
689/// \return true if it was possible to get rid of the scale, false otherwise.
690/// \note After this operation the formula may not be in the canonical form.
691bool Formula::unscale() {
692 if (Scale != 1)
693 return false;
694 Scale = 0;
695 BaseRegs.push_back(Elt: ScaledReg);
696 ScaledReg = nullptr;
697 return true;
698}
699
700bool Formula::hasZeroEnd() const {
701 if (UnfoldedOffset || BaseOffset)
702 return false;
703 if (BaseRegs.size() != 1 || ScaledReg)
704 return false;
705 return true;
706}
707
708/// Return the total number of register operands used by this formula. This does
709/// not include register uses implied by non-constant addrec strides.
710size_t Formula::getNumRegs() const {
711 return !!ScaledReg + BaseRegs.size();
712}
713
714/// Return the type of this formula, if it has one, or null otherwise. This type
715/// is meaningless except for the bit size.
716Type *Formula::getType() const {
717 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
718 ScaledReg ? ScaledReg->getType() :
719 BaseGV ? BaseGV->getType() :
720 nullptr;
721}
722
723/// Delete the given base reg from the BaseRegs list.
724void Formula::deleteBaseReg(const SCEV *&S) {
725 if (&S != &BaseRegs.back())
726 std::swap(a&: S, b&: BaseRegs.back());
727 BaseRegs.pop_back();
728}
729
730/// Test if this formula references the given register.
731bool Formula::referencesReg(const SCEV *S) const {
732 return S == ScaledReg || is_contained(Range: BaseRegs, Element: S);
733}
734
735/// Test whether this formula uses registers which are used by uses other than
736/// the use with the given index.
737bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
738 const RegUseTracker &RegUses) const {
739 if (ScaledReg)
740 if (RegUses.isRegUsedByUsesOtherThan(Reg: ScaledReg, LUIdx))
741 return true;
742 for (const SCEV *BaseReg : BaseRegs)
743 if (RegUses.isRegUsedByUsesOtherThan(Reg: BaseReg, LUIdx))
744 return true;
745 return false;
746}
747
748#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
749void Formula::print(raw_ostream &OS) const {
750 bool First = true;
751 if (BaseGV) {
752 if (!First) OS << " + "; else First = false;
753 BaseGV->printAsOperand(OS, /*PrintType=*/false);
754 }
755 if (BaseOffset.isNonZero()) {
756 if (!First) OS << " + "; else First = false;
757 OS << BaseOffset;
758 }
759 for (const SCEV *BaseReg : BaseRegs) {
760 if (!First) OS << " + "; else First = false;
761 OS << "reg(" << *BaseReg << ')';
762 }
763 if (HasBaseReg && BaseRegs.empty()) {
764 if (!First) OS << " + "; else First = false;
765 OS << "**error: HasBaseReg**";
766 } else if (!HasBaseReg && !BaseRegs.empty()) {
767 if (!First) OS << " + "; else First = false;
768 OS << "**error: !HasBaseReg**";
769 }
770 if (Scale != 0) {
771 if (!First) OS << " + "; else First = false;
772 OS << Scale << "*reg(";
773 if (ScaledReg)
774 OS << *ScaledReg;
775 else
776 OS << "<unknown>";
777 OS << ')';
778 }
779 if (UnfoldedOffset.isNonZero()) {
780 if (!First) OS << " + ";
781 OS << "imm(" << UnfoldedOffset << ')';
782 }
783}
784
785LLVM_DUMP_METHOD void Formula::dump() const {
786 print(errs()); errs() << '\n';
787}
788#endif
789
790/// Return true if the given addrec can be sign-extended without changing its
791/// value.
792static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
793 Type *WideTy =
794 IntegerType::get(C&: SE.getContext(), NumBits: SE.getTypeSizeInBits(Ty: AR->getType()) + 1);
795 return isa<SCEVAddRecExpr>(Val: SE.getSignExtendExpr(Op: AR, Ty: WideTy));
796}
797
798/// Return true if the given add can be sign-extended without changing its
799/// value.
800static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
801 Type *WideTy =
802 IntegerType::get(C&: SE.getContext(), NumBits: SE.getTypeSizeInBits(Ty: A->getType()) + 1);
803 return isa<SCEVAddExpr>(Val: SE.getSignExtendExpr(Op: A, Ty: WideTy));
804}
805
806/// Return true if the given mul can be sign-extended without changing its
807/// value.
808static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
809 Type *WideTy =
810 IntegerType::get(C&: SE.getContext(),
811 NumBits: SE.getTypeSizeInBits(Ty: M->getType()) * M->getNumOperands());
812 return isa<SCEVMulExpr>(Val: SE.getSignExtendExpr(Op: M, Ty: WideTy));
813}
814
815/// Return an expression for LHS /s RHS, if it can be determined and if the
816/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
817/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
818/// the multiplication may overflow, which is useful when the result will be
819/// used in a context where the most significant bits are ignored.
820static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
821 ScalarEvolution &SE,
822 bool IgnoreSignificantBits = false) {
823 // Handle the trivial case, which works for any SCEV type.
824 if (LHS == RHS)
825 return SE.getConstant(Ty: LHS->getType(), V: 1);
826
827 // Handle a few RHS special cases.
828 const SCEVConstant *RC = dyn_cast<SCEVConstant>(Val: RHS);
829 if (RC) {
830 const APInt &RA = RC->getAPInt();
831 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
832 // some folding.
833 if (RA.isAllOnes()) {
834 if (LHS->getType()->isPointerTy())
835 return nullptr;
836 return SE.getMulExpr(LHS, RHS: RC);
837 }
838 // Handle x /s 1 as x.
839 if (RA == 1)
840 return LHS;
841 }
842
843 // Check for a division of a constant by a constant.
844 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: LHS)) {
845 if (!RC)
846 return nullptr;
847 const APInt &LA = C->getAPInt();
848 const APInt &RA = RC->getAPInt();
849 if (LA.srem(RHS: RA) != 0)
850 return nullptr;
851 return SE.getConstant(Val: LA.sdiv(RHS: RA));
852 }
853
854 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
855 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: LHS)) {
856 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
857 const SCEV *Step = getExactSDiv(LHS: AR->getStepRecurrence(SE), RHS, SE,
858 IgnoreSignificantBits);
859 if (!Step) return nullptr;
860 const SCEV *Start = getExactSDiv(LHS: AR->getStart(), RHS, SE,
861 IgnoreSignificantBits);
862 if (!Start) return nullptr;
863 // FlagNW is independent of the start value, step direction, and is
864 // preserved with smaller magnitude steps.
865 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
866 return SE.getAddRecExpr(Start, Step, L: AR->getLoop(), Flags: SCEV::FlagAnyWrap);
867 }
868 return nullptr;
869 }
870
871 // Distribute the sdiv over add operands, if the add doesn't overflow.
872 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Val: LHS)) {
873 if (IgnoreSignificantBits || isAddSExtable(A: Add, SE)) {
874 SmallVector<const SCEV *, 8> Ops;
875 for (const SCEV *S : Add->operands()) {
876 const SCEV *Op = getExactSDiv(LHS: S, RHS, SE, IgnoreSignificantBits);
877 if (!Op) return nullptr;
878 Ops.push_back(Elt: Op);
879 }
880 return SE.getAddExpr(Ops);
881 }
882 return nullptr;
883 }
884
885 // Check for a multiply operand that we can pull RHS out of.
886 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Val: LHS)) {
887 if (IgnoreSignificantBits || isMulSExtable(M: Mul, SE)) {
888 // Handle special case C1*X*Y /s C2*X*Y.
889 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(Val: RHS)) {
890 if (IgnoreSignificantBits || isMulSExtable(M: MulRHS, SE)) {
891 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Val: Mul->getOperand(i: 0));
892 const SCEVConstant *RC =
893 dyn_cast<SCEVConstant>(Val: MulRHS->getOperand(i: 0));
894 if (LC && RC) {
895 SmallVector<const SCEV *, 4> LOps(drop_begin(RangeOrContainer: Mul->operands()));
896 SmallVector<const SCEV *, 4> ROps(drop_begin(RangeOrContainer: MulRHS->operands()));
897 if (LOps == ROps)
898 return getExactSDiv(LHS: LC, RHS: RC, SE, IgnoreSignificantBits);
899 }
900 }
901 }
902
903 SmallVector<const SCEV *, 4> Ops;
904 bool Found = false;
905 for (const SCEV *S : Mul->operands()) {
906 if (!Found)
907 if (const SCEV *Q = getExactSDiv(LHS: S, RHS, SE,
908 IgnoreSignificantBits)) {
909 S = Q;
910 Found = true;
911 }
912 Ops.push_back(Elt: S);
913 }
914 return Found ? SE.getMulExpr(Ops) : nullptr;
915 }
916 return nullptr;
917 }
918
919 // Otherwise we don't know.
920 return nullptr;
921}
922
923/// If S involves the addition of a constant integer value, return that integer
924/// value, and mutate S to point to a new SCEV with that value excluded.
925static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
926 const APInt *C;
927 if (match(S, P: m_scev_APInt(C))) {
928 if (C->getSignificantBits() <= 64) {
929 S = SE.getConstant(Ty: S->getType(), V: 0);
930 return Immediate::getFixed(MinVal: C->getSExtValue());
931 }
932 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Val: S)) {
933 SmallVector<const SCEV *, 8> NewOps(Add->operands());
934 Immediate Result = ExtractImmediate(S&: NewOps.front(), SE);
935 if (Result.isNonZero())
936 S = SE.getAddExpr(Ops&: NewOps);
937 return Result;
938 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: S)) {
939 SmallVector<const SCEV *, 8> NewOps(AR->operands());
940 Immediate Result = ExtractImmediate(S&: NewOps.front(), SE);
941 if (Result.isNonZero())
942 S = SE.getAddRecExpr(Operands&: NewOps, L: AR->getLoop(),
943 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
944 Flags: SCEV::FlagAnyWrap);
945 return Result;
946 } else if (EnableVScaleImmediates &&
947 match(S, P: m_scev_Mul(Op0: m_scev_APInt(C), Op1: m_SCEVVScale()))) {
948 S = SE.getConstant(Ty: S->getType(), V: 0);
949 return Immediate::getScalable(MinVal: C->getSExtValue());
950 }
951 return Immediate::getZero();
952}
953
954/// If S involves the addition of a GlobalValue address, return that symbol, and
955/// mutate S to point to a new SCEV with that value excluded.
956static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
957 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Val: S)) {
958 if (GlobalValue *GV = dyn_cast<GlobalValue>(Val: U->getValue())) {
959 S = SE.getConstant(Ty: GV->getType(), V: 0);
960 return GV;
961 }
962 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Val: S)) {
963 SmallVector<const SCEV *, 8> NewOps(Add->operands());
964 GlobalValue *Result = ExtractSymbol(S&: NewOps.back(), SE);
965 if (Result)
966 S = SE.getAddExpr(Ops&: NewOps);
967 return Result;
968 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: S)) {
969 SmallVector<const SCEV *, 8> NewOps(AR->operands());
970 GlobalValue *Result = ExtractSymbol(S&: NewOps.front(), SE);
971 if (Result)
972 S = SE.getAddRecExpr(Operands&: NewOps, L: AR->getLoop(),
973 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
974 Flags: SCEV::FlagAnyWrap);
975 return Result;
976 }
977 return nullptr;
978}
979
980/// Returns true if the specified instruction is using the specified value as an
981/// address.
982static bool isAddressUse(const TargetTransformInfo &TTI,
983 Instruction *Inst, Value *OperandVal) {
984 bool isAddress = isa<LoadInst>(Val: Inst);
985 if (StoreInst *SI = dyn_cast<StoreInst>(Val: Inst)) {
986 if (SI->getPointerOperand() == OperandVal)
987 isAddress = true;
988 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Inst)) {
989 // Addressing modes can also be folded into prefetches and a variety
990 // of intrinsics.
991 switch (II->getIntrinsicID()) {
992 case Intrinsic::memset:
993 case Intrinsic::prefetch:
994 case Intrinsic::masked_load:
995 if (II->getArgOperand(i: 0) == OperandVal)
996 isAddress = true;
997 break;
998 case Intrinsic::masked_store:
999 if (II->getArgOperand(i: 1) == OperandVal)
1000 isAddress = true;
1001 break;
1002 case Intrinsic::memmove:
1003 case Intrinsic::memcpy:
1004 if (II->getArgOperand(i: 0) == OperandVal ||
1005 II->getArgOperand(i: 1) == OperandVal)
1006 isAddress = true;
1007 break;
1008 default: {
1009 MemIntrinsicInfo IntrInfo;
1010 if (TTI.getTgtMemIntrinsic(Inst: II, Info&: IntrInfo)) {
1011 if (IntrInfo.PtrVal == OperandVal)
1012 isAddress = true;
1013 }
1014 }
1015 }
1016 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: Inst)) {
1017 if (RMW->getPointerOperand() == OperandVal)
1018 isAddress = true;
1019 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: Inst)) {
1020 if (CmpX->getPointerOperand() == OperandVal)
1021 isAddress = true;
1022 }
1023 return isAddress;
1024}
1025
1026/// Return the type of the memory being accessed.
1027static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1028 Instruction *Inst, Value *OperandVal) {
1029 MemAccessTy AccessTy = MemAccessTy::getUnknown(Ctx&: Inst->getContext());
1030
1031 // First get the type of memory being accessed.
1032 if (Type *Ty = Inst->getAccessType())
1033 AccessTy.MemTy = Ty;
1034
1035 // Then get the pointer address space.
1036 if (const StoreInst *SI = dyn_cast<StoreInst>(Val: Inst)) {
1037 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1038 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Val: Inst)) {
1039 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1040 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: Inst)) {
1041 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1042 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: Inst)) {
1043 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1044 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Inst)) {
1045 switch (II->getIntrinsicID()) {
1046 case Intrinsic::prefetch:
1047 case Intrinsic::memset:
1048 AccessTy.AddrSpace = II->getArgOperand(i: 0)->getType()->getPointerAddressSpace();
1049 AccessTy.MemTy = OperandVal->getType();
1050 break;
1051 case Intrinsic::memmove:
1052 case Intrinsic::memcpy:
1053 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1054 AccessTy.MemTy = OperandVal->getType();
1055 break;
1056 case Intrinsic::masked_load:
1057 AccessTy.AddrSpace =
1058 II->getArgOperand(i: 0)->getType()->getPointerAddressSpace();
1059 break;
1060 case Intrinsic::masked_store:
1061 AccessTy.AddrSpace =
1062 II->getArgOperand(i: 1)->getType()->getPointerAddressSpace();
1063 break;
1064 default: {
1065 MemIntrinsicInfo IntrInfo;
1066 if (TTI.getTgtMemIntrinsic(Inst: II, Info&: IntrInfo) && IntrInfo.PtrVal) {
1067 AccessTy.AddrSpace
1068 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1069 }
1070
1071 break;
1072 }
1073 }
1074 }
1075
1076 return AccessTy;
1077}
1078
1079/// Return true if this AddRec is already a phi in its loop.
1080static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1081 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1082 if (SE.isSCEVable(Ty: PN.getType()) &&
1083 (SE.getEffectiveSCEVType(Ty: PN.getType()) ==
1084 SE.getEffectiveSCEVType(Ty: AR->getType())) &&
1085 SE.getSCEV(V: &PN) == AR)
1086 return true;
1087 }
1088 return false;
1089}
1090
1091/// Check if expanding this expression is likely to incur significant cost. This
1092/// is tricky because SCEV doesn't track which expressions are actually computed
1093/// by the current IR.
1094///
1095/// We currently allow expansion of IV increments that involve adds,
1096/// multiplication by constants, and AddRecs from existing phis.
1097///
1098/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1099/// obvious multiple of the UDivExpr.
1100static bool isHighCostExpansion(const SCEV *S,
1101 SmallPtrSetImpl<const SCEV*> &Processed,
1102 ScalarEvolution &SE) {
1103 // Zero/One operand expressions
1104 switch (S->getSCEVType()) {
1105 case scUnknown:
1106 case scConstant:
1107 case scVScale:
1108 return false;
1109 case scTruncate:
1110 return isHighCostExpansion(S: cast<SCEVTruncateExpr>(Val: S)->getOperand(),
1111 Processed, SE);
1112 case scZeroExtend:
1113 return isHighCostExpansion(S: cast<SCEVZeroExtendExpr>(Val: S)->getOperand(),
1114 Processed, SE);
1115 case scSignExtend:
1116 return isHighCostExpansion(S: cast<SCEVSignExtendExpr>(Val: S)->getOperand(),
1117 Processed, SE);
1118 default:
1119 break;
1120 }
1121
1122 if (!Processed.insert(Ptr: S).second)
1123 return false;
1124
1125 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Val: S)) {
1126 for (const SCEV *S : Add->operands()) {
1127 if (isHighCostExpansion(S, Processed, SE))
1128 return true;
1129 }
1130 return false;
1131 }
1132
1133 const SCEV *Op0, *Op1;
1134 if (match(S, P: m_scev_Mul(Op0: m_SCEV(V&: Op0), Op1: m_SCEV(V&: Op1)))) {
1135 // Multiplication by a constant is ok
1136 if (isa<SCEVConstant>(Val: Op0))
1137 return isHighCostExpansion(S: Op1, Processed, SE);
1138
1139 // If we have the value of one operand, check if an existing
1140 // multiplication already generates this expression.
1141 if (const auto *U = dyn_cast<SCEVUnknown>(Val: Op1)) {
1142 Value *UVal = U->getValue();
1143 for (User *UR : UVal->users()) {
1144 // If U is a constant, it may be used by a ConstantExpr.
1145 Instruction *UI = dyn_cast<Instruction>(Val: UR);
1146 if (UI && UI->getOpcode() == Instruction::Mul &&
1147 SE.isSCEVable(Ty: UI->getType())) {
1148 return SE.getSCEV(V: UI) == S;
1149 }
1150 }
1151 }
1152 }
1153
1154 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: S)) {
1155 if (isExistingPhi(AR, SE))
1156 return false;
1157 }
1158
1159 // Fow now, consider any other type of expression (div/mul/min/max) high cost.
1160 return true;
1161}
1162
namespace {

// Forward declaration, so that the prototypes below can take an LSRUse by
// reference ahead of the class definition.
class LSRUse;

} // end anonymous namespace
1168
1169/// Check if the addressing mode defined by \p F is completely
1170/// folded in \p LU at isel time.
1171/// This includes address-mode folding and special icmp tricks.
1172/// This function returns true if \p LU can accommodate what \p F
1173/// defines and up to 1 base + 1 scaled + offset.
1174/// In other words, if \p F has several base registers, this function may
1175/// still return true. Therefore, users still need to account for
1176/// additional base registers and/or unfolded offsets to derive an
1177/// accurate cost model.
1178static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1179 const LSRUse &LU, const Formula &F);
1180
1181// Get the cost of the scaling factor used in F for LU.
1182static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1183 const LSRUse &LU, const Formula &F,
1184 const Loop &L);
1185
1186namespace {
1187
1188/// This class is used to measure and compare candidate formulae.
1189class Cost {
1190 const Loop *L = nullptr;
1191 ScalarEvolution *SE = nullptr;
1192 const TargetTransformInfo *TTI = nullptr;
1193 TargetTransformInfo::LSRCost C;
1194 TTI::AddressingModeKind AMK = TTI::AMK_None;
1195
1196public:
1197 Cost() = delete;
1198 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1199 TTI::AddressingModeKind AMK) :
1200 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1201 C.Insns = 0;
1202 C.NumRegs = 0;
1203 C.AddRecCost = 0;
1204 C.NumIVMuls = 0;
1205 C.NumBaseAdds = 0;
1206 C.ImmCost = 0;
1207 C.SetupCost = 0;
1208 C.ScaleCost = 0;
1209 }
1210
1211 bool isLess(const Cost &Other) const;
1212
1213 void Lose();
1214
1215#ifndef NDEBUG
1216 // Once any of the metrics loses, they must all remain losers.
1217 bool isValid() {
1218 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1219 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1220 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1221 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1222 }
1223#endif
1224
1225 bool isLoser() {
1226 assert(isValid() && "invalid cost");
1227 return C.NumRegs == ~0u;
1228 }
1229
1230 void RateFormula(const Formula &F,
1231 SmallPtrSetImpl<const SCEV *> &Regs,
1232 const DenseSet<const SCEV *> &VisitedRegs,
1233 const LSRUse &LU,
1234 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1235
1236 void print(raw_ostream &OS) const;
1237 void dump() const;
1238
1239private:
1240 void RateRegister(const Formula &F, const SCEV *Reg,
1241 SmallPtrSetImpl<const SCEV *> &Regs);
1242 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1243 SmallPtrSetImpl<const SCEV *> &Regs,
1244 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1245};
1246
1247/// An operand value in an instruction which is to be replaced with some
1248/// equivalent, possibly strength-reduced, replacement.
1249struct LSRFixup {
1250 /// The instruction which will be updated.
1251 Instruction *UserInst = nullptr;
1252
1253 /// The operand of the instruction which will be replaced. The operand may be
1254 /// used more than once; every instance will be replaced.
1255 Value *OperandValToReplace = nullptr;
1256
1257 /// If this user is to use the post-incremented value of an induction
1258 /// variable, this set is non-empty and holds the loops associated with the
1259 /// induction variable.
1260 PostIncLoopSet PostIncLoops;
1261
1262 /// A constant offset to be added to the LSRUse expression. This allows
1263 /// multiple fixups to share the same LSRUse with different offsets, for
1264 /// example in an unrolled loop.
1265 Immediate Offset = Immediate::getZero();
1266
1267 LSRFixup() = default;
1268
1269 bool isUseFullyOutsideLoop(const Loop *L) const;
1270
1271 void print(raw_ostream &OS) const;
1272 void dump() const;
1273};
1274
1275/// This class holds the state that LSR keeps for each use in IVUsers, as well
1276/// as uses invented by LSR itself. It includes information about what kinds of
1277/// things can be folded into the user, information about the user itself, and
1278/// information about how the use may be satisfied. TODO: Represent multiple
1279/// users of the same expression in common?
1280class LSRUse {
1281 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1282
1283public:
1284 /// An enum for a kind of use, indicating what types of scaled and immediate
1285 /// operands it might support.
1286 enum KindType {
1287 Basic, ///< A normal use, with no folding.
1288 Special, ///< A special case of basic, allowing -1 scales.
1289 Address, ///< An address use; folding according to TargetLowering
1290 ICmpZero ///< An equality icmp with both operands folded into one.
1291 // TODO: Add a generic icmp too?
1292 };
1293
1294 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1295
1296 KindType Kind;
1297 MemAccessTy AccessTy;
1298
1299 /// The list of operands which are to be replaced.
1300 SmallVector<LSRFixup, 8> Fixups;
1301
1302 /// Keep track of the min and max offsets of the fixups.
1303 Immediate MinOffset = Immediate::getFixedMax();
1304 Immediate MaxOffset = Immediate::getFixedMin();
1305
1306 /// This records whether all of the fixups using this LSRUse are outside of
1307 /// the loop, in which case some special-case heuristics may be used.
1308 bool AllFixupsOutsideLoop = true;
1309
1310 /// RigidFormula is set to true to guarantee that this use will be associated
1311 /// with a single formula--the one that initially matched. Some SCEV
1312 /// expressions cannot be expanded. This allows LSR to consider the registers
1313 /// used by those expressions without the need to expand them later after
1314 /// changing the formula.
1315 bool RigidFormula = false;
1316
1317 /// This records the widest use type for any fixup using this
1318 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1319 /// fixup widths to be equivalent, because the narrower one may be relying on
1320 /// the implicit truncation to truncate away bogus bits.
1321 Type *WidestFixupType = nullptr;
1322
1323 /// A list of ways to build a value that can satisfy this user. After the
1324 /// list is populated, one of these is selected heuristically and used to
1325 /// formulate a replacement for OperandValToReplace in UserInst.
1326 SmallVector<Formula, 12> Formulae;
1327
1328 /// The set of register candidates used by all formulae in this LSRUse.
1329 SmallPtrSet<const SCEV *, 4> Regs;
1330
1331 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1332
1333 LSRFixup &getNewFixup() {
1334 Fixups.push_back(Elt: LSRFixup());
1335 return Fixups.back();
1336 }
1337
1338 void pushFixup(LSRFixup &f) {
1339 Fixups.push_back(Elt: f);
1340 if (Immediate::isKnownGT(LHS: f.Offset, RHS: MaxOffset))
1341 MaxOffset = f.Offset;
1342 if (Immediate::isKnownLT(LHS: f.Offset, RHS: MinOffset))
1343 MinOffset = f.Offset;
1344 }
1345
1346 bool HasFormulaWithSameRegs(const Formula &F) const;
1347 float getNotSelectedProbability(const SCEV *Reg) const;
1348 bool InsertFormula(const Formula &F, const Loop &L);
1349 void DeleteFormula(Formula &F);
1350 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1351
1352 void print(raw_ostream &OS) const;
1353 void dump() const;
1354};
1355
1356} // end anonymous namespace
1357
1358static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1359 LSRUse::KindType Kind, MemAccessTy AccessTy,
1360 GlobalValue *BaseGV, Immediate BaseOffset,
1361 bool HasBaseReg, int64_t Scale,
1362 Instruction *Fixup = nullptr);
1363
1364static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1365 if (isa<SCEVUnknown>(Val: Reg) || isa<SCEVConstant>(Val: Reg))
1366 return 1;
1367 if (Depth == 0)
1368 return 0;
1369 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Val: Reg))
1370 return getSetupCost(Reg: S->getStart(), Depth: Depth - 1);
1371 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Val: Reg))
1372 return getSetupCost(Reg: S->getOperand(), Depth: Depth - 1);
1373 if (auto S = dyn_cast<SCEVNAryExpr>(Val: Reg))
1374 return std::accumulate(first: S->operands().begin(), last: S->operands().end(), init: 0,
1375 binary_op: [&](unsigned i, const SCEV *Reg) {
1376 return i + getSetupCost(Reg, Depth: Depth - 1);
1377 });
1378 if (auto S = dyn_cast<SCEVUDivExpr>(Val: Reg))
1379 return getSetupCost(Reg: S->getLHS(), Depth: Depth - 1) +
1380 getSetupCost(Reg: S->getRHS(), Depth: Depth - 1);
1381 return 0;
1382}
1383
1384/// Tally up interesting quantities from the given register.
1385void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1386 SmallPtrSetImpl<const SCEV *> &Regs) {
1387 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: Reg)) {
1388 // If this is an addrec for another loop, it should be an invariant
1389 // with respect to L since L is the innermost loop (at least
1390 // for now LSR only handles innermost loops).
1391 if (AR->getLoop() != L) {
1392 // If the AddRec exists, consider it's register free and leave it alone.
1393 if (isExistingPhi(AR, SE&: *SE) && AMK != TTI::AMK_PostIndexed)
1394 return;
1395
1396 // It is bad to allow LSR for current loop to add induction variables
1397 // for its sibling loops.
1398 if (!AR->getLoop()->contains(L)) {
1399 Lose();
1400 return;
1401 }
1402
1403 // Otherwise, it will be an invariant with respect to Loop L.
1404 ++C.NumRegs;
1405 return;
1406 }
1407
1408 unsigned LoopCost = 1;
1409 if (TTI->isIndexedLoadLegal(Mode: TTI->MIM_PostInc, Ty: AR->getType()) ||
1410 TTI->isIndexedStoreLegal(Mode: TTI->MIM_PostInc, Ty: AR->getType())) {
1411 const SCEV *Start;
1412 const SCEVConstant *Step;
1413 if (match(S: AR, P: m_scev_AffineAddRec(Op0: m_SCEV(V&: Start), Op1: m_SCEVConstant(V&: Step))))
1414 // If the step size matches the base offset, we could use pre-indexed
1415 // addressing.
1416 if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
1417 Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
1418 (AMK == TTI::AMK_PostIndexed && !isa<SCEVConstant>(Val: Start) &&
1419 SE->isLoopInvariant(S: Start, L)))
1420 LoopCost = 0;
1421 }
1422 C.AddRecCost += LoopCost;
1423
1424 // Add the step value register, if it needs one.
1425 // TODO: The non-affine case isn't precisely modeled here.
1426 if (!AR->isAffine() || !isa<SCEVConstant>(Val: AR->getOperand(i: 1))) {
1427 if (!Regs.count(Ptr: AR->getOperand(i: 1))) {
1428 RateRegister(F, Reg: AR->getOperand(i: 1), Regs);
1429 if (isLoser())
1430 return;
1431 }
1432 }
1433 }
1434 ++C.NumRegs;
1435
1436 // Rough heuristic; favor registers which don't require extra setup
1437 // instructions in the preheader.
1438 C.SetupCost += getSetupCost(Reg, Depth: SetupCostDepthLimit);
1439 // Ensure we don't, even with the recusion limit, produce invalid costs.
1440 C.SetupCost = std::min<unsigned>(a: C.SetupCost, b: 1 << 16);
1441
1442 C.NumIVMuls += isa<SCEVMulExpr>(Val: Reg) &&
1443 SE->hasComputableLoopEvolution(S: Reg, L);
1444}
1445
1446/// Record this register in the set. If we haven't seen it before, rate
1447/// it. Optional LoserRegs provides a way to declare any formula that refers to
1448/// one of those regs an instant loser.
1449void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1450 SmallPtrSetImpl<const SCEV *> &Regs,
1451 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1452 if (LoserRegs && LoserRegs->count(Ptr: Reg)) {
1453 Lose();
1454 return;
1455 }
1456 if (Regs.insert(Ptr: Reg).second) {
1457 RateRegister(F, Reg, Regs);
1458 if (LoserRegs && isLoser())
1459 LoserRegs->insert(Ptr: Reg);
1460 }
1461}
1462
1463void Cost::RateFormula(const Formula &F,
1464 SmallPtrSetImpl<const SCEV *> &Regs,
1465 const DenseSet<const SCEV *> &VisitedRegs,
1466 const LSRUse &LU,
1467 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1468 if (isLoser())
1469 return;
1470 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1471 // Tally up the registers.
1472 unsigned PrevAddRecCost = C.AddRecCost;
1473 unsigned PrevNumRegs = C.NumRegs;
1474 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1475 if (const SCEV *ScaledReg = F.ScaledReg) {
1476 if (VisitedRegs.count(V: ScaledReg)) {
1477 Lose();
1478 return;
1479 }
1480 RatePrimaryRegister(F, Reg: ScaledReg, Regs, LoserRegs);
1481 if (isLoser())
1482 return;
1483 }
1484 for (const SCEV *BaseReg : F.BaseRegs) {
1485 if (VisitedRegs.count(V: BaseReg)) {
1486 Lose();
1487 return;
1488 }
1489 RatePrimaryRegister(F, Reg: BaseReg, Regs, LoserRegs);
1490 if (isLoser())
1491 return;
1492 }
1493
1494 // Determine how many (unfolded) adds we'll need inside the loop.
1495 size_t NumBaseParts = F.getNumRegs();
1496 if (NumBaseParts > 1)
1497 // Do not count the base and a possible second register if the target
1498 // allows to fold 2 registers.
1499 C.NumBaseAdds +=
1500 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI: *TTI, LU, F)));
1501 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1502
1503 // Accumulate non-free scaling amounts.
1504 C.ScaleCost += getScalingFactorCost(TTI: *TTI, LU, F, L: *L).getValue();
1505
1506 // Tally up the non-zero immediates.
1507 for (const LSRFixup &Fixup : LU.Fixups) {
1508 if (Fixup.Offset.isCompatibleImmediate(Imm: F.BaseOffset)) {
1509 Immediate Offset = Fixup.Offset.addUnsigned(RHS: F.BaseOffset);
1510 if (F.BaseGV)
1511 C.ImmCost += 64; // Handle symbolic values conservatively.
1512 // TODO: This should probably be the pointer size.
1513 else if (Offset.isNonZero())
1514 C.ImmCost +=
1515 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1516
1517 // Check with target if this offset with this instruction is
1518 // specifically not supported.
1519 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1520 !isAMCompletelyFolded(TTI: *TTI, Kind: LSRUse::Address, AccessTy: LU.AccessTy, BaseGV: F.BaseGV,
1521 BaseOffset: Offset, HasBaseReg: F.HasBaseReg, Scale: F.Scale, Fixup: Fixup.UserInst))
1522 C.NumBaseAdds++;
1523 } else {
1524 // Incompatible immediate type, increase cost to avoid using
1525 C.ImmCost += 2048;
1526 }
1527 }
1528
1529 // If we don't count instruction cost exit here.
1530 if (!InsnsCost) {
1531 assert(isValid() && "invalid cost");
1532 return;
1533 }
1534
1535 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1536 // additional instruction (at least fill).
1537 // TODO: Need distinguish register class?
1538 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1539 ClassID: TTI->getRegisterClassForType(Vector: false, Ty: F.getType())) - 1;
1540 if (C.NumRegs > TTIRegNum) {
1541 // Cost already exceeded TTIRegNum, then only newly added register can add
1542 // new instructions.
1543 if (PrevNumRegs > TTIRegNum)
1544 C.Insns += (C.NumRegs - PrevNumRegs);
1545 else
1546 C.Insns += (C.NumRegs - TTIRegNum);
1547 }
1548
1549 // If ICmpZero formula ends with not 0, it could not be replaced by
1550 // just add or sub. We'll need to compare final result of AddRec.
1551 // That means we'll need an additional instruction. But if the target can
1552 // macro-fuse a compare with a branch, don't count this extra instruction.
1553 // For -10 + {0, +, 1}:
1554 // i = i + 1;
1555 // cmp i, 10
1556 //
1557 // For {-10, +, 1}:
1558 // i = i + 1;
1559 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1560 !TTI->canMacroFuseCmp())
1561 C.Insns++;
1562 // Each new AddRec adds 1 instruction to calculation.
1563 C.Insns += (C.AddRecCost - PrevAddRecCost);
1564
1565 // BaseAdds adds instructions for unfolded registers.
1566 if (LU.Kind != LSRUse::ICmpZero)
1567 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1568 assert(isValid() && "invalid cost");
1569}
1570
1571/// Set this cost to a losing value.
1572void Cost::Lose() {
1573 C.Insns = std::numeric_limits<unsigned>::max();
1574 C.NumRegs = std::numeric_limits<unsigned>::max();
1575 C.AddRecCost = std::numeric_limits<unsigned>::max();
1576 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1577 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1578 C.ImmCost = std::numeric_limits<unsigned>::max();
1579 C.SetupCost = std::numeric_limits<unsigned>::max();
1580 C.ScaleCost = std::numeric_limits<unsigned>::max();
1581}
1582
1583/// Choose the lower cost.
1584bool Cost::isLess(const Cost &Other) const {
1585 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1586 C.Insns != Other.C.Insns)
1587 return C.Insns < Other.C.Insns;
1588 return TTI->isLSRCostLess(C1: C, C2: Other.C);
1589}
1590
1591#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1592void Cost::print(raw_ostream &OS) const {
1593 if (InsnsCost)
1594 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1595 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1596 if (C.AddRecCost != 0)
1597 OS << ", with addrec cost " << C.AddRecCost;
1598 if (C.NumIVMuls != 0)
1599 OS << ", plus " << C.NumIVMuls << " IV mul"
1600 << (C.NumIVMuls == 1 ? "" : "s");
1601 if (C.NumBaseAdds != 0)
1602 OS << ", plus " << C.NumBaseAdds << " base add"
1603 << (C.NumBaseAdds == 1 ? "" : "s");
1604 if (C.ScaleCost != 0)
1605 OS << ", plus " << C.ScaleCost << " scale cost";
1606 if (C.ImmCost != 0)
1607 OS << ", plus " << C.ImmCost << " imm cost";
1608 if (C.SetupCost != 0)
1609 OS << ", plus " << C.SetupCost << " setup cost";
1610}
1611
1612LLVM_DUMP_METHOD void Cost::dump() const {
1613 print(errs()); errs() << '\n';
1614}
1615#endif
1616
1617/// Test whether this fixup always uses its value outside of the given loop.
1618bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1619 // PHI nodes use their value in their incoming blocks.
1620 if (const PHINode *PN = dyn_cast<PHINode>(Val: UserInst)) {
1621 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1622 if (PN->getIncomingValue(i) == OperandValToReplace &&
1623 L->contains(BB: PN->getIncomingBlock(i)))
1624 return false;
1625 return true;
1626 }
1627
1628 return !L->contains(Inst: UserInst);
1629}
1630
1631#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1632void LSRFixup::print(raw_ostream &OS) const {
1633 OS << "UserInst=";
1634 // Store is common and interesting enough to be worth special-casing.
1635 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1636 OS << "store ";
1637 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1638 } else if (UserInst->getType()->isVoidTy())
1639 OS << UserInst->getOpcodeName();
1640 else
1641 UserInst->printAsOperand(OS, /*PrintType=*/false);
1642
1643 OS << ", OperandValToReplace=";
1644 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1645
1646 for (const Loop *PIL : PostIncLoops) {
1647 OS << ", PostIncLoop=";
1648 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1649 }
1650
1651 if (Offset.isNonZero())
1652 OS << ", Offset=" << Offset;
1653}
1654
1655LLVM_DUMP_METHOD void LSRFixup::dump() const {
1656 print(errs()); errs() << '\n';
1657}
1658#endif
1659
1660/// Test whether this use as a formula which has the same registers as the given
1661/// formula.
1662bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1663 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1664 if (F.ScaledReg) Key.push_back(Elt: F.ScaledReg);
1665 // Unstable sort by host order ok, because this is only used for uniquifying.
1666 llvm::sort(C&: Key);
1667 return Uniquifier.count(V: Key);
1668}
1669
1670/// The function returns a probability of selecting formula without Reg.
1671float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1672 unsigned FNum = 0;
1673 for (const Formula &F : Formulae)
1674 if (F.referencesReg(S: Reg))
1675 FNum++;
1676 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1677}
1678
1679/// If the given formula has not yet been inserted, add it to the list, and
1680/// return true. Return false otherwise. The formula must be in canonical form.
1681bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1682 assert(F.isCanonical(L) && "Invalid canonical representation");
1683
1684 if (!Formulae.empty() && RigidFormula)
1685 return false;
1686
1687 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1688 if (F.ScaledReg) Key.push_back(Elt: F.ScaledReg);
1689 // Unstable sort by host order ok, because this is only used for uniquifying.
1690 llvm::sort(C&: Key);
1691
1692 if (!Uniquifier.insert(V: Key).second)
1693 return false;
1694
1695 // Using a register to hold the value of 0 is not profitable.
1696 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1697 "Zero allocated in a scaled register!");
1698#ifndef NDEBUG
1699 for (const SCEV *BaseReg : F.BaseRegs)
1700 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1701#endif
1702
1703 // Add the formula to the list.
1704 Formulae.push_back(Elt: F);
1705
1706 // Record registers now being used by this use.
1707 Regs.insert_range(R: F.BaseRegs);
1708 if (F.ScaledReg)
1709 Regs.insert(Ptr: F.ScaledReg);
1710
1711 return true;
1712}
1713
1714/// Remove the given formula from this use's list.
1715void LSRUse::DeleteFormula(Formula &F) {
1716 if (&F != &Formulae.back())
1717 std::swap(a&: F, b&: Formulae.back());
1718 Formulae.pop_back();
1719}
1720
1721/// Recompute the Regs field, and update RegUses.
1722void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1723 // Now that we've filtered out some formulae, recompute the Regs set.
1724 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1725 Regs.clear();
1726 for (const Formula &F : Formulae) {
1727 if (F.ScaledReg) Regs.insert(Ptr: F.ScaledReg);
1728 Regs.insert_range(R: F.BaseRegs);
1729 }
1730
1731 // Update the RegTracker.
1732 for (const SCEV *S : OldRegs)
1733 if (!Regs.count(Ptr: S))
1734 RegUses.dropRegister(Reg: S, LUIdx);
1735}
1736
1737#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1738void LSRUse::print(raw_ostream &OS) const {
1739 OS << "LSR Use: Kind=";
1740 switch (Kind) {
1741 case Basic: OS << "Basic"; break;
1742 case Special: OS << "Special"; break;
1743 case ICmpZero: OS << "ICmpZero"; break;
1744 case Address:
1745 OS << "Address of ";
1746 if (AccessTy.MemTy->isPointerTy())
1747 OS << "pointer"; // the full pointer type could be really verbose
1748 else {
1749 OS << *AccessTy.MemTy;
1750 }
1751
1752 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1753 }
1754
1755 OS << ", Offsets={";
1756 bool NeedComma = false;
1757 for (const LSRFixup &Fixup : Fixups) {
1758 if (NeedComma) OS << ',';
1759 OS << Fixup.Offset;
1760 NeedComma = true;
1761 }
1762 OS << '}';
1763
1764 if (AllFixupsOutsideLoop)
1765 OS << ", all-fixups-outside-loop";
1766
1767 if (WidestFixupType)
1768 OS << ", widest fixup type: " << *WidestFixupType;
1769}
1770
1771LLVM_DUMP_METHOD void LSRUse::dump() const {
1772 print(errs()); errs() << '\n';
1773}
1774#endif
1775
1776static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1777 LSRUse::KindType Kind, MemAccessTy AccessTy,
1778 GlobalValue *BaseGV, Immediate BaseOffset,
1779 bool HasBaseReg, int64_t Scale,
1780 Instruction *Fixup /* = nullptr */) {
1781 switch (Kind) {
1782 case LSRUse::Address: {
1783 int64_t FixedOffset =
1784 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1785 int64_t ScalableOffset =
1786 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1787 return TTI.isLegalAddressingMode(Ty: AccessTy.MemTy, BaseGV, BaseOffset: FixedOffset,
1788 HasBaseReg, Scale, AddrSpace: AccessTy.AddrSpace,
1789 I: Fixup, ScalableOffset);
1790 }
1791 case LSRUse::ICmpZero:
1792 // There's not even a target hook for querying whether it would be legal to
1793 // fold a GV into an ICmp.
1794 if (BaseGV)
1795 return false;
1796
1797 // ICmp only has two operands; don't allow more than two non-trivial parts.
1798 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1799 return false;
1800
1801 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1802 // putting the scaled register in the other operand of the icmp.
1803 if (Scale != 0 && Scale != -1)
1804 return false;
1805
1806 // If we have low-level target information, ask the target if it can fold an
1807 // integer immediate on an icmp.
1808 if (BaseOffset.isNonZero()) {
1809 // We don't have an interface to query whether the target supports
1810 // icmpzero against scalable quantities yet.
1811 if (BaseOffset.isScalable())
1812 return false;
1813
1814 // We have one of:
1815 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1816 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1817 // Offs is the ICmp immediate.
1818 if (Scale == 0)
1819 // The cast does the right thing with
1820 // std::numeric_limits<int64_t>::min().
1821 BaseOffset = BaseOffset.getFixed(MinVal: -(uint64_t)BaseOffset.getFixedValue());
1822 return TTI.isLegalICmpImmediate(Imm: BaseOffset.getFixedValue());
1823 }
1824
1825 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1826 return true;
1827
1828 case LSRUse::Basic:
1829 // Only handle single-register values.
1830 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1831
1832 case LSRUse::Special:
1833 // Special case Basic to handle -1 scales.
1834 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1835 }
1836
1837 llvm_unreachable("Invalid LSRUse Kind!");
1838}
1839
1840static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1841 Immediate MinOffset, Immediate MaxOffset,
1842 LSRUse::KindType Kind, MemAccessTy AccessTy,
1843 GlobalValue *BaseGV, Immediate BaseOffset,
1844 bool HasBaseReg, int64_t Scale) {
1845 if (BaseOffset.isNonZero() &&
1846 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1847 BaseOffset.isScalable() != MaxOffset.isScalable()))
1848 return false;
1849 // Check for overflow.
1850 int64_t Base = BaseOffset.getKnownMinValue();
1851 int64_t Min = MinOffset.getKnownMinValue();
1852 int64_t Max = MaxOffset.getKnownMinValue();
1853 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1854 return false;
1855 MinOffset = Immediate::get(MinVal: (uint64_t)Base + Min, Scalable: MinOffset.isScalable());
1856 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1857 return false;
1858 MaxOffset = Immediate::get(MinVal: (uint64_t)Base + Max, Scalable: MaxOffset.isScalable());
1859
1860 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset: MinOffset,
1861 HasBaseReg, Scale) &&
1862 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset: MaxOffset,
1863 HasBaseReg, Scale);
1864}
1865
1866static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1867 Immediate MinOffset, Immediate MaxOffset,
1868 LSRUse::KindType Kind, MemAccessTy AccessTy,
1869 const Formula &F, const Loop &L) {
1870 // For the purpose of isAMCompletelyFolded either having a canonical formula
1871 // or a scale not equal to zero is correct.
1872 // Problems may arise from non canonical formulae having a scale == 0.
1873 // Strictly speaking it would best to just rely on canonical formulae.
1874 // However, when we generate the scaled formulae, we first check that the
1875 // scaling factor is profitable before computing the actual ScaledReg for
1876 // compile time sake.
1877 assert((F.isCanonical(L) || F.Scale != 0));
1878 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1879 BaseGV: F.BaseGV, BaseOffset: F.BaseOffset, HasBaseReg: F.HasBaseReg, Scale: F.Scale);
1880}
1881
1882/// Test whether we know how to expand the current formula.
1883static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1884 Immediate MaxOffset, LSRUse::KindType Kind,
1885 MemAccessTy AccessTy, GlobalValue *BaseGV,
1886 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1887 // We know how to expand completely foldable formulae.
1888 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1889 BaseOffset, HasBaseReg, Scale) ||
1890 // Or formulae that use a base register produced by a sum of base
1891 // registers.
1892 (Scale == 1 &&
1893 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1894 BaseGV, BaseOffset, HasBaseReg: true, Scale: 0));
1895}
1896
1897static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1898 Immediate MaxOffset, LSRUse::KindType Kind,
1899 MemAccessTy AccessTy, const Formula &F) {
1900 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV: F.BaseGV,
1901 BaseOffset: F.BaseOffset, HasBaseReg: F.HasBaseReg, Scale: F.Scale);
1902}
1903
1904static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1905 Immediate Offset) {
1906 if (Offset.isScalable())
1907 return TTI.isLegalAddScalableImmediate(Imm: Offset.getKnownMinValue());
1908
1909 return TTI.isLegalAddImmediate(Imm: Offset.getFixedValue());
1910}
1911
1912static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1913 const LSRUse &LU, const Formula &F) {
1914 // Target may want to look at the user instructions.
1915 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1916 for (const LSRFixup &Fixup : LU.Fixups)
1917 if (!isAMCompletelyFolded(TTI, Kind: LSRUse::Address, AccessTy: LU.AccessTy, BaseGV: F.BaseGV,
1918 BaseOffset: (F.BaseOffset + Fixup.Offset), HasBaseReg: F.HasBaseReg,
1919 Scale: F.Scale, Fixup: Fixup.UserInst))
1920 return false;
1921 return true;
1922 }
1923
1924 return isAMCompletelyFolded(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LU.Kind,
1925 AccessTy: LU.AccessTy, BaseGV: F.BaseGV, BaseOffset: F.BaseOffset, HasBaseReg: F.HasBaseReg,
1926 Scale: F.Scale);
1927}
1928
1929static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1930 const LSRUse &LU, const Formula &F,
1931 const Loop &L) {
1932 if (!F.Scale)
1933 return 0;
1934
1935 // If the use is not completely folded in that instruction, we will have to
1936 // pay an extra cost only for scale != 1.
1937 if (!isAMCompletelyFolded(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LU.Kind,
1938 AccessTy: LU.AccessTy, F, L))
1939 return F.Scale != 1;
1940
1941 switch (LU.Kind) {
1942 case LSRUse::Address: {
1943 // Check the scaling factor cost with both the min and max offsets.
1944 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1945 if (F.BaseOffset.isScalable()) {
1946 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1947 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1948 } else {
1949 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1950 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1951 }
1952 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1953 Ty: LU.AccessTy.MemTy, BaseGV: F.BaseGV, BaseOffset: StackOffset::get(Fixed: FixedMin, Scalable: ScalableMin),
1954 HasBaseReg: F.HasBaseReg, Scale: F.Scale, AddrSpace: LU.AccessTy.AddrSpace);
1955 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1956 Ty: LU.AccessTy.MemTy, BaseGV: F.BaseGV, BaseOffset: StackOffset::get(Fixed: FixedMax, Scalable: ScalableMax),
1957 HasBaseReg: F.HasBaseReg, Scale: F.Scale, AddrSpace: LU.AccessTy.AddrSpace);
1958
1959 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1960 "Legal addressing mode has an illegal cost!");
1961 return std::max(a: ScaleCostMinOffset, b: ScaleCostMaxOffset);
1962 }
1963 case LSRUse::ICmpZero:
1964 case LSRUse::Basic:
1965 case LSRUse::Special:
1966 // The use is completely folded, i.e., everything is folded into the
1967 // instruction.
1968 return 0;
1969 }
1970
1971 llvm_unreachable("Invalid LSRUse Kind!");
1972}
1973
1974static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1975 LSRUse::KindType Kind, MemAccessTy AccessTy,
1976 GlobalValue *BaseGV, Immediate BaseOffset,
1977 bool HasBaseReg) {
1978 // Fast-path: zero is always foldable.
1979 if (BaseOffset.isZero() && !BaseGV)
1980 return true;
1981
1982 // Conservatively, create an address with an immediate and a
1983 // base and a scale.
1984 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1985
1986 // Canonicalize a scale of 1 to a base register if the formula doesn't
1987 // already have a base register.
1988 if (!HasBaseReg && Scale == 1) {
1989 Scale = 0;
1990 HasBaseReg = true;
1991 }
1992
1993 // FIXME: Try with + without a scale? Maybe based on TTI?
1994 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
1995 // default for many architectures, not just AArch64 SVE. More investigation
1996 // needed later to determine if this should be used more widely than just
1997 // on scalable types.
1998 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
1999 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2000 Scale = 0;
2001
2002 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2003 HasBaseReg, Scale);
2004}
2005
2006static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2007 ScalarEvolution &SE, Immediate MinOffset,
2008 Immediate MaxOffset, LSRUse::KindType Kind,
2009 MemAccessTy AccessTy, const SCEV *S,
2010 bool HasBaseReg) {
2011 // Fast-path: zero is always foldable.
2012 if (S->isZero()) return true;
2013
2014 // Conservatively, create an address with an immediate and a
2015 // base and a scale.
2016 Immediate BaseOffset = ExtractImmediate(S, SE);
2017 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2018
2019 // If there's anything else involved, it's not foldable.
2020 if (!S->isZero()) return false;
2021
2022 // Fast-path: zero is always foldable.
2023 if (BaseOffset.isZero() && !BaseGV)
2024 return true;
2025
2026 if (BaseOffset.isScalable())
2027 return false;
2028
2029 // Conservatively, create an address with an immediate and a
2030 // base and a scale.
2031 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2032
2033 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2034 BaseOffset, HasBaseReg, Scale);
2035}
2036
2037namespace {
2038
2039/// An individual increment in a Chain of IV increments. Relate an IV user to
2040/// an expression that computes the IV it uses from the IV used by the previous
2041/// link in the Chain.
2042///
2043/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2044/// original IVOperand. The head of the chain's IVOperand is only valid during
2045/// chain collection, before LSR replaces IV users. During chain generation,
2046/// IncExpr can be used to find the new IVOperand that computes the same
2047/// expression.
2048struct IVInc {
2049 Instruction *UserInst;
2050 Value* IVOperand;
2051 const SCEV *IncExpr;
2052
2053 IVInc(Instruction *U, Value *O, const SCEV *E)
2054 : UserInst(U), IVOperand(O), IncExpr(E) {}
2055};
2056
2057// The list of IV increments in program order. We typically add the head of a
2058// chain without finding subsequent links.
2059struct IVChain {
2060 SmallVector<IVInc, 1> Incs;
2061 const SCEV *ExprBase = nullptr;
2062
2063 IVChain() = default;
2064 IVChain(const IVInc &Head, const SCEV *Base)
2065 : Incs(1, Head), ExprBase(Base) {}
2066
2067 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2068
2069 // Return the first increment in the chain.
2070 const_iterator begin() const {
2071 assert(!Incs.empty());
2072 return std::next(x: Incs.begin());
2073 }
2074 const_iterator end() const {
2075 return Incs.end();
2076 }
2077
2078 // Returns true if this chain contains any increments.
2079 bool hasIncs() const { return Incs.size() >= 2; }
2080
2081 // Add an IVInc to the end of this chain.
2082 void add(const IVInc &X) { Incs.push_back(Elt: X); }
2083
2084 // Returns the last UserInst in the chain.
2085 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2086
2087 // Returns true if IncExpr can be profitably added to this chain.
2088 bool isProfitableIncrement(const SCEV *OperExpr,
2089 const SCEV *IncExpr,
2090 ScalarEvolution&);
2091};
2092
2093/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2094/// between FarUsers that definitely cross IV increments and NearUsers that may
2095/// be used between IV increments.
2096struct ChainUsers {
2097 SmallPtrSet<Instruction*, 4> FarUsers;
2098 SmallPtrSet<Instruction*, 4> NearUsers;
2099};
2100
/// This class holds state for the main loop strength reduction logic.
///
/// One instance is constructed for a single loop (see the constructor's Loop
/// parameter). It owns the collected uses, IV chains, factors and types, and
/// records in Changed whether the IR was modified.
class LSRInstance {
  // Analyses and context handed to the constructor.
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  LoopInfo &LI;
  AssumptionCache &AC;
  TargetLibraryInfo &TLI;
  const TargetTransformInfo &TTI;
  // The loop this instance works on.
  Loop *const L;
  MemorySSAUpdater *MSSAU;
  TTI::AddressingModeKind AMK;
  mutable SCEVExpander Rewriter;
  // Set to true once the IR has been modified; exposed through getChanged().
  bool Changed = false;

  /// This is the insert position that the current loop's induction variable
  /// increment should be placed. In simple loops, this is the latch block's
  /// terminator. But in more complicated cases, this is a position which will
  /// dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos = nullptr;

  /// Interesting factors between use strides.
  ///
  /// We explicitly use a SetVector which contains a SmallSet, instead of the
  /// default, a SmallDenseSet, because we need to use the full range of
  /// int64_ts, and there's currently no good way of doing that with
  /// SmallDenseSet.
  SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;

  /// The cost of the current SCEV, the best solution by LSR will be dropped if
  /// the solution is not profitable.
  Cost BaselineCost;

  /// Interesting use types, to facilitate truncation reuse.
  SmallSetVector<Type *, 4> Types;

  /// The list of interesting uses.
  mutable SmallVector<LSRUse, 16> Uses;

  /// Track which uses use which register candidates.
  RegUseTracker RegUses;

  // Limit the number of chains to avoid quadratic behavior. We don't expect to
  // have more than a few IV increment chains in a loop. Missing a Chain falls
  // back to normal LSR behavior for those uses.
  static const unsigned MaxChains = 8;

  /// IV users can form a chain of IV increments.
  SmallVector<IVChain, MaxChains> IVChainVec;

  /// IV users that belong to profitable IVChains.
  SmallPtrSet<Use*, MaxChains> IVIncSet;

  /// Induction variables that were generated and inserted by the SCEV Expander.
  SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;

  // Inserting instructions in the loop and using them as PHI's input could
  // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
  // corresponding incoming block is not loop exiting). So collect all such
  // instructions to form LCSSA for them later.
  SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;

  // Shadow-IV and loop-condition optimizations.
  void OptimizeShadowIV();
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
  void OptimizeLoopTermCond();

  // IV chain collection and generation (see IVChain / ChainUsers).
  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                        SmallVectorImpl<ChainUsers> &ChainUsersVec);
  void FinalizeChain(IVChain &Chain);
  void CollectChains();
  void GenerateIVChain(const IVChain &Chain,
                       SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  // Support for sharing of LSRUses between LSRFixups.
  using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
                          LSRUse::KindType Kind, MemAccessTy AccessTy);

  std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
                                      MemAccessTy AccessTy);

  void DeleteUse(LSRUse &LU, size_t LUIdx);

  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);

  void CollectLoopInvariantFixupsAndFormulae();

  // Formula generators. NOTE(review): judging by the names, each derives
  // further candidate formulae for LU from Base -- confirm against the
  // definitions, which are outside this declaration.
  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);

  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
                                  const Formula &Base, unsigned Depth,
                                  size_t Idx, bool IsScaledReg = false);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                   const Formula &Base, size_t Idx,
                                   bool IsScaledReg = false);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                   const Formula &Base,
                                   const SmallVectorImpl<Immediate> &Worklist,
                                   size_t Idx, bool IsScaledReg = false);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();

  // Search-space estimation and narrowing heuristics.
  size_t EstimateSearchSpaceComplexity() const;
  void NarrowSearchSpaceByDetectingSupersets();
  void NarrowSearchSpaceByCollapsingUnrolledCode();
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
  void NarrowSearchSpaceByFilterPostInc();
  void NarrowSearchSpaceByDeletingCostlyFormulas();
  void NarrowSearchSpaceByPickingWinnerRegs();
  void NarrowSearchSpaceUsingHeuristics();

  // Solver: fills Solution with the chosen formulae (const pointers into the
  // uses' formula lists, by signature).
  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  // Expansion and rewriting of the IR for a chosen solution.
  BasicBlock::iterator
  HoistInsertPosition(BasicBlock::iterator IP,
                      const SmallVectorImpl<Instruction *> &Inputs) const;
  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                                     const LSRFixup &LF,
                                                     const LSRUse &LU) const;

  Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
                BasicBlock::iterator IP,
                SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
                     const Formula &F,
                     SmallVectorImpl<WeakTrackingVH> &DeadInsts);
  void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
               SmallVectorImpl<WeakTrackingVH> &DeadInsts);
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);

public:
  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
              LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
              TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);

  /// Returns true if this instance modified the IR.
  bool getChanged() const { return Changed; }
  /// Returns the induction variables that were generated and inserted by the
  /// SCEV Expander.
  const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
    return ScalarEvolutionIVs;
  }

  // Debug printing helpers.
  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};
2274
2275} // end anonymous namespace
2276
2277/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2278/// the cast operation.
2279void LSRInstance::OptimizeShadowIV() {
2280 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2281 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount))
2282 return;
2283
2284 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2285 UI != E; /* empty */) {
2286 IVUsers::const_iterator CandidateUI = UI;
2287 ++UI;
2288 Instruction *ShadowUse = CandidateUI->getUser();
2289 Type *DestTy = nullptr;
2290 bool IsSigned = false;
2291
2292 /* If shadow use is a int->float cast then insert a second IV
2293 to eliminate this cast.
2294
2295 for (unsigned i = 0; i < n; ++i)
2296 foo((double)i);
2297
2298 is transformed into
2299
2300 double d = 0.0;
2301 for (unsigned i = 0; i < n; ++i, ++d)
2302 foo(d);
2303 */
2304 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(Val: CandidateUI->getUser())) {
2305 IsSigned = false;
2306 DestTy = UCast->getDestTy();
2307 }
2308 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(Val: CandidateUI->getUser())) {
2309 IsSigned = true;
2310 DestTy = SCast->getDestTy();
2311 }
2312 if (!DestTy) continue;
2313
2314 // If target does not support DestTy natively then do not apply
2315 // this transformation.
2316 if (!TTI.isTypeLegal(Ty: DestTy)) continue;
2317
2318 PHINode *PH = dyn_cast<PHINode>(Val: ShadowUse->getOperand(i: 0));
2319 if (!PH) continue;
2320 if (PH->getNumIncomingValues() != 2) continue;
2321
2322 // If the calculation in integers overflows, the result in FP type will
2323 // differ. So we only can do this transformation if we are guaranteed to not
2324 // deal with overflowing values
2325 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: SE.getSCEV(V: PH));
2326 if (!AR) continue;
2327 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2328 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2329
2330 Type *SrcTy = PH->getType();
2331 int Mantissa = DestTy->getFPMantissaWidth();
2332 if (Mantissa == -1) continue;
2333 if ((int)SE.getTypeSizeInBits(Ty: SrcTy) > Mantissa)
2334 continue;
2335
2336 unsigned Entry, Latch;
2337 if (PH->getIncomingBlock(i: 0) == L->getLoopPreheader()) {
2338 Entry = 0;
2339 Latch = 1;
2340 } else {
2341 Entry = 1;
2342 Latch = 0;
2343 }
2344
2345 ConstantInt *Init = dyn_cast<ConstantInt>(Val: PH->getIncomingValue(i: Entry));
2346 if (!Init) continue;
2347 Constant *NewInit = ConstantFP::get(Ty: DestTy, V: IsSigned ?
2348 (double)Init->getSExtValue() :
2349 (double)Init->getZExtValue());
2350
2351 BinaryOperator *Incr =
2352 dyn_cast<BinaryOperator>(Val: PH->getIncomingValue(i: Latch));
2353 if (!Incr) continue;
2354 if (Incr->getOpcode() != Instruction::Add
2355 && Incr->getOpcode() != Instruction::Sub)
2356 continue;
2357
2358 /* Initialize new IV, double d = 0.0 in above example. */
2359 ConstantInt *C = nullptr;
2360 if (Incr->getOperand(i_nocapture: 0) == PH)
2361 C = dyn_cast<ConstantInt>(Val: Incr->getOperand(i_nocapture: 1));
2362 else if (Incr->getOperand(i_nocapture: 1) == PH)
2363 C = dyn_cast<ConstantInt>(Val: Incr->getOperand(i_nocapture: 0));
2364 else
2365 continue;
2366
2367 if (!C) continue;
2368
2369 // Ignore negative constants, as the code below doesn't handle them
2370 // correctly. TODO: Remove this restriction.
2371 if (!C->getValue().isStrictlyPositive())
2372 continue;
2373
2374 /* Add new PHINode. */
2375 PHINode *NewPH = PHINode::Create(Ty: DestTy, NumReservedValues: 2, NameStr: "IV.S.", InsertBefore: PH->getIterator());
2376 NewPH->setDebugLoc(PH->getDebugLoc());
2377
2378 /* create new increment. '++d' in above example. */
2379 Constant *CFP = ConstantFP::get(Ty: DestTy, V: C->getZExtValue());
2380 BinaryOperator *NewIncr = BinaryOperator::Create(
2381 Op: Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2382 : Instruction::FSub,
2383 S1: NewPH, S2: CFP, Name: "IV.S.next.", InsertBefore: Incr->getIterator());
2384 NewIncr->setDebugLoc(Incr->getDebugLoc());
2385
2386 NewPH->addIncoming(V: NewInit, BB: PH->getIncomingBlock(i: Entry));
2387 NewPH->addIncoming(V: NewIncr, BB: PH->getIncomingBlock(i: Latch));
2388
2389 /* Remove cast operation */
2390 ShadowUse->replaceAllUsesWith(V: NewPH);
2391 ShadowUse->eraseFromParent();
2392 Changed = true;
2393 break;
2394 }
2395}
2396
2397/// If Cond has an operand that is an expression of an IV, set the IV user and
2398/// stride information and return true, otherwise return false.
2399bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2400 for (IVStrideUse &U : IU)
2401 if (U.getUser() == Cond) {
2402 // NOTE: we could handle setcc instructions with multiple uses here, but
2403 // InstCombine does it as well for simple uses, it's not clear that it
2404 // occurs enough in real life to handle.
2405 CondUse = &U;
2406 return true;
2407 }
2408 return false;
2409}
2410
2411/// Rewrite the loop's terminating condition if it uses a max computation.
2412///
2413/// This is a narrow solution to a specific, but acute, problem. For loops
2414/// like this:
2415///
2416/// i = 0;
2417/// do {
2418/// p[i] = 0.0;
2419/// } while (++i < n);
2420///
2421/// the trip count isn't just 'n', because 'n' might not be positive. And
2422/// unfortunately this can come up even for loops where the user didn't use
2423/// a C do-while loop. For example, seemingly well-behaved top-test loops
2424/// will commonly be lowered like this:
2425///
2426/// if (n > 0) {
2427/// i = 0;
2428/// do {
2429/// p[i] = 0.0;
2430/// } while (++i < n);
2431/// }
2432///
2433/// and then it's possible for subsequent optimization to obscure the if
2434/// test in such a way that indvars can't find it.
2435///
2436/// When indvars can't find the if test in loops like this, it creates a
2437/// max expression, which allows it to give the loop a canonical
2438/// induction variable:
2439///
2440/// i = 0;
2441/// max = n < 1 ? 1 : n;
2442/// do {
2443/// p[i] = 0.0;
2444/// } while (++i != max);
2445///
2446/// Canonical induction variables are necessary because the loop passes
2447/// are designed around them. The most obvious example of this is the
2448/// LoopInfo analysis, which doesn't remember trip count values. It
2449/// expects to be able to rediscover the trip count each time it is
2450/// needed, and it does this using a simple analysis that only succeeds if
2451/// the loop has a canonical induction variable.
2452///
2453/// However, when it comes time to generate code, the maximum operation
2454/// can be quite costly, especially if it's inside of an outer loop.
2455///
2456/// This function solves this problem by detecting this type of loop and
2457/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2458/// the instructions for the maximum computation.
2459ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2460 // Check that the loop matches the pattern we're looking for.
2461 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2462 Cond->getPredicate() != CmpInst::ICMP_NE)
2463 return Cond;
2464
2465 SelectInst *Sel = dyn_cast<SelectInst>(Val: Cond->getOperand(i_nocapture: 1));
2466 if (!Sel || !Sel->hasOneUse()) return Cond;
2467
2468 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2469 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount))
2470 return Cond;
2471 const SCEV *One = SE.getConstant(Ty: BackedgeTakenCount->getType(), V: 1);
2472
2473 // Add one to the backedge-taken count to get the trip count.
2474 const SCEV *IterationCount = SE.getAddExpr(LHS: One, RHS: BackedgeTakenCount);
2475 if (IterationCount != SE.getSCEV(V: Sel)) return Cond;
2476
2477 // Check for a max calculation that matches the pattern. There's no check
2478 // for ICMP_ULE here because the comparison would be with zero, which
2479 // isn't interesting.
2480 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2481 const SCEVNAryExpr *Max = nullptr;
2482 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(Val: BackedgeTakenCount)) {
2483 Pred = ICmpInst::ICMP_SLE;
2484 Max = S;
2485 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(Val: IterationCount)) {
2486 Pred = ICmpInst::ICMP_SLT;
2487 Max = S;
2488 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(Val: IterationCount)) {
2489 Pred = ICmpInst::ICMP_ULT;
2490 Max = U;
2491 } else {
2492 // No match; bail.
2493 return Cond;
2494 }
2495
2496 // To handle a max with more than two operands, this optimization would
2497 // require additional checking and setup.
2498 if (Max->getNumOperands() != 2)
2499 return Cond;
2500
2501 const SCEV *MaxLHS = Max->getOperand(i: 0);
2502 const SCEV *MaxRHS = Max->getOperand(i: 1);
2503
2504 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2505 // for a comparison with 1. For <= and >=, a comparison with zero.
2506 if (!MaxLHS ||
2507 (ICmpInst::isTrueWhenEqual(predicate: Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2508 return Cond;
2509
2510 // Check the relevant induction variable for conformance to
2511 // the pattern.
2512 const SCEV *IV = SE.getSCEV(V: Cond->getOperand(i_nocapture: 0));
2513 if (!match(S: IV,
2514 P: m_scev_AffineAddRec(Op0: m_scev_SpecificInt(V: 1), Op1: m_scev_SpecificInt(V: 1))))
2515 return Cond;
2516
2517 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2518 "Loop condition operand is an addrec in a different loop!");
2519
2520 // Check the right operand of the select, and remember it, as it will
2521 // be used in the new comparison instruction.
2522 Value *NewRHS = nullptr;
2523 if (ICmpInst::isTrueWhenEqual(predicate: Pred)) {
2524 // Look for n+1, and grab n.
2525 if (AddOperator *BO = dyn_cast<AddOperator>(Val: Sel->getOperand(i_nocapture: 1)))
2526 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(Val: BO->getOperand(i_nocapture: 1)))
2527 if (BO1->isOne() && SE.getSCEV(V: BO->getOperand(i_nocapture: 0)) == MaxRHS)
2528 NewRHS = BO->getOperand(i_nocapture: 0);
2529 if (AddOperator *BO = dyn_cast<AddOperator>(Val: Sel->getOperand(i_nocapture: 2)))
2530 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(Val: BO->getOperand(i_nocapture: 1)))
2531 if (BO1->isOne() && SE.getSCEV(V: BO->getOperand(i_nocapture: 0)) == MaxRHS)
2532 NewRHS = BO->getOperand(i_nocapture: 0);
2533 if (!NewRHS)
2534 return Cond;
2535 } else if (SE.getSCEV(V: Sel->getOperand(i_nocapture: 1)) == MaxRHS)
2536 NewRHS = Sel->getOperand(i_nocapture: 1);
2537 else if (SE.getSCEV(V: Sel->getOperand(i_nocapture: 2)) == MaxRHS)
2538 NewRHS = Sel->getOperand(i_nocapture: 2);
2539 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(Val: MaxRHS))
2540 NewRHS = SU->getValue();
2541 else
2542 // Max doesn't match expected pattern.
2543 return Cond;
2544
2545 // Determine the new comparison opcode. It may be signed or unsigned,
2546 // and the original comparison may be either equality or inequality.
2547 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2548 Pred = CmpInst::getInversePredicate(pred: Pred);
2549
2550 // Ok, everything looks ok to change the condition into an SLT or SGE and
2551 // delete the max calculation.
2552 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2553 Cond->getOperand(i_nocapture: 0), NewRHS, "scmp");
2554
2555 // Delete the max calculation instructions.
2556 NewCond->setDebugLoc(Cond->getDebugLoc());
2557 Cond->replaceAllUsesWith(V: NewCond);
2558 CondUse->setUser(NewCond);
2559 Instruction *Cmp = cast<Instruction>(Val: Sel->getOperand(i_nocapture: 0));
2560 Cond->eraseFromParent();
2561 Sel->eraseFromParent();
2562 if (Cmp->use_empty())
2563 Cmp->eraseFromParent();
2564 return NewCond;
2565}
2566
2567/// Change loop terminating condition to use the postinc iv when possible.
2568void
2569LSRInstance::OptimizeLoopTermCond() {
2570 SmallPtrSet<Instruction *, 4> PostIncs;
2571
2572 // We need a different set of heuristics for rotated and non-rotated loops.
2573 // If a loop is rotated then the latch is also the backedge, so inserting
2574 // post-inc expressions just before the latch is ideal. To reduce live ranges
2575 // it also makes sense to rewrite terminating conditions to use post-inc
2576 // expressions.
2577 //
2578 // If the loop is not rotated then the latch is not a backedge; the latch
2579 // check is done in the loop head. Adding post-inc expressions before the
2580 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2581 // in the loop body. In this case we do *not* want to use post-inc expressions
2582 // in the latch check, and we want to insert post-inc expressions before
2583 // the backedge.
2584 BasicBlock *LatchBlock = L->getLoopLatch();
2585 SmallVector<BasicBlock*, 8> ExitingBlocks;
2586 L->getExitingBlocks(ExitingBlocks);
2587 if (!llvm::is_contained(Range&: ExitingBlocks, Element: LatchBlock)) {
2588 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2589 IVIncInsertPos = LatchBlock->getTerminator();
2590 return;
2591 }
2592
2593 // Otherwise treat this as a rotated loop.
2594 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2595 // Get the terminating condition for the loop if possible. If we
2596 // can, we want to change it to use a post-incremented version of its
2597 // induction variable, to allow coalescing the live ranges for the IV into
2598 // one register value.
2599
2600 BranchInst *TermBr = dyn_cast<BranchInst>(Val: ExitingBlock->getTerminator());
2601 if (!TermBr)
2602 continue;
2603 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2604 if (TermBr->isUnconditional() || !isa<ICmpInst>(Val: TermBr->getCondition()))
2605 continue;
2606
2607 // Search IVUsesByStride to find Cond's IVUse if there is one.
2608 IVStrideUse *CondUse = nullptr;
2609 ICmpInst *Cond = cast<ICmpInst>(Val: TermBr->getCondition());
2610 if (!FindIVUserForCond(Cond, CondUse))
2611 continue;
2612
2613 // If the trip count is computed in terms of a max (due to ScalarEvolution
2614 // being unable to find a sufficient guard, for example), change the loop
2615 // comparison to use SLT or ULT instead of NE.
2616 // One consequence of doing this now is that it disrupts the count-down
2617 // optimization. That's not always a bad thing though, because in such
2618 // cases it may still be worthwhile to avoid a max.
2619 Cond = OptimizeMax(Cond, CondUse);
2620
2621 // If this exiting block dominates the latch block, it may also use
2622 // the post-inc value if it won't be shared with other uses.
2623 // Check for dominance.
2624 if (!DT.dominates(A: ExitingBlock, B: LatchBlock))
2625 continue;
2626
2627 // Conservatively avoid trying to use the post-inc value in non-latch
2628 // exits if there may be pre-inc users in intervening blocks.
2629 if (LatchBlock != ExitingBlock)
2630 for (const IVStrideUse &UI : IU)
2631 // Test if the use is reachable from the exiting block. This dominator
2632 // query is a conservative approximation of reachability.
2633 if (&UI != CondUse &&
2634 !DT.properlyDominates(A: UI.getUser()->getParent(), B: ExitingBlock)) {
2635 // Conservatively assume there may be reuse if the quotient of their
2636 // strides could be a legal scale.
2637 const SCEV *A = IU.getStride(IU: *CondUse, L);
2638 const SCEV *B = IU.getStride(IU: UI, L);
2639 if (!A || !B) continue;
2640 if (SE.getTypeSizeInBits(Ty: A->getType()) !=
2641 SE.getTypeSizeInBits(Ty: B->getType())) {
2642 if (SE.getTypeSizeInBits(Ty: A->getType()) >
2643 SE.getTypeSizeInBits(Ty: B->getType()))
2644 B = SE.getSignExtendExpr(Op: B, Ty: A->getType());
2645 else
2646 A = SE.getSignExtendExpr(Op: A, Ty: B->getType());
2647 }
2648 if (const SCEVConstant *D =
2649 dyn_cast_or_null<SCEVConstant>(Val: getExactSDiv(LHS: B, RHS: A, SE))) {
2650 const ConstantInt *C = D->getValue();
2651 // Stride of one or negative one can have reuse with non-addresses.
2652 if (C->isOne() || C->isMinusOne())
2653 goto decline_post_inc;
2654 // Avoid weird situations.
2655 if (C->getValue().getSignificantBits() >= 64 ||
2656 C->getValue().isMinSignedValue())
2657 goto decline_post_inc;
2658 // Check for possible scaled-address reuse.
2659 if (isAddressUse(TTI, Inst: UI.getUser(), OperandVal: UI.getOperandValToReplace())) {
2660 MemAccessTy AccessTy =
2661 getAccessType(TTI, Inst: UI.getUser(), OperandVal: UI.getOperandValToReplace());
2662 int64_t Scale = C->getSExtValue();
2663 if (TTI.isLegalAddressingMode(Ty: AccessTy.MemTy, /*BaseGV=*/nullptr,
2664 /*BaseOffset=*/0,
2665 /*HasBaseReg=*/true, Scale,
2666 AddrSpace: AccessTy.AddrSpace))
2667 goto decline_post_inc;
2668 Scale = -Scale;
2669 if (TTI.isLegalAddressingMode(Ty: AccessTy.MemTy, /*BaseGV=*/nullptr,
2670 /*BaseOffset=*/0,
2671 /*HasBaseReg=*/true, Scale,
2672 AddrSpace: AccessTy.AddrSpace))
2673 goto decline_post_inc;
2674 }
2675 }
2676 }
2677
2678 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2679 << *Cond << '\n');
2680
2681 // It's possible for the setcc instruction to be anywhere in the loop, and
2682 // possible for it to have multiple users. If it is not immediately before
2683 // the exiting block branch, move it.
2684 if (Cond->getNextNonDebugInstruction() != TermBr) {
2685 if (Cond->hasOneUse()) {
2686 Cond->moveBefore(InsertPos: TermBr->getIterator());
2687 } else {
2688 // Clone the terminating condition and insert into the loopend.
2689 ICmpInst *OldCond = Cond;
2690 Cond = cast<ICmpInst>(Val: Cond->clone());
2691 Cond->setName(L->getHeader()->getName() + ".termcond");
2692 Cond->insertInto(ParentBB: ExitingBlock, It: TermBr->getIterator());
2693
2694 // Clone the IVUse, as the old use still exists!
2695 CondUse = &IU.AddUser(User: Cond, Operand: CondUse->getOperandValToReplace());
2696 TermBr->replaceUsesOfWith(From: OldCond, To: Cond);
2697 }
2698 }
2699
2700 // If we get to here, we know that we can transform the setcc instruction to
2701 // use the post-incremented version of the IV, allowing us to coalesce the
2702 // live ranges for the IV correctly.
2703 CondUse->transformToPostInc(L);
2704 Changed = true;
2705
2706 PostIncs.insert(Ptr: Cond);
2707 decline_post_inc:;
2708 }
2709
2710 // Determine an insertion point for the loop induction variable increment. It
2711 // must dominate all the post-inc comparisons we just set up, and it must
2712 // dominate the loop latch edge.
2713 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2714 for (Instruction *Inst : PostIncs)
2715 IVIncInsertPos = DT.findNearestCommonDominator(I1: IVIncInsertPos, I2: Inst);
2716}
2717
2718/// Determine if the given use can accommodate a fixup at the given offset and
2719/// other details. If so, update the use and return true.
2720bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2721 bool HasBaseReg, LSRUse::KindType Kind,
2722 MemAccessTy AccessTy) {
2723 Immediate NewMinOffset = LU.MinOffset;
2724 Immediate NewMaxOffset = LU.MaxOffset;
2725 MemAccessTy NewAccessTy = AccessTy;
2726
2727 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2728 // something conservative, however this can pessimize in the case that one of
2729 // the uses will have all its uses outside the loop, for example.
2730 if (LU.Kind != Kind)
2731 return false;
2732
2733 // Check for a mismatched access type, and fall back conservatively as needed.
2734 // TODO: Be less conservative when the type is similar and can use the same
2735 // addressing modes.
2736 if (Kind == LSRUse::Address) {
2737 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2738 NewAccessTy = MemAccessTy::getUnknown(Ctx&: AccessTy.MemTy->getContext(),
2739 AS: AccessTy.AddrSpace);
2740 }
2741 }
2742
2743 // Conservatively assume HasBaseReg is true for now.
2744 if (Immediate::isKnownLT(LHS: NewOffset, RHS: LU.MinOffset)) {
2745 if (!isAlwaysFoldable(TTI, Kind, AccessTy: NewAccessTy, /*BaseGV=*/nullptr,
2746 BaseOffset: LU.MaxOffset - NewOffset, HasBaseReg))
2747 return false;
2748 NewMinOffset = NewOffset;
2749 } else if (Immediate::isKnownGT(LHS: NewOffset, RHS: LU.MaxOffset)) {
2750 if (!isAlwaysFoldable(TTI, Kind, AccessTy: NewAccessTy, /*BaseGV=*/nullptr,
2751 BaseOffset: NewOffset - LU.MinOffset, HasBaseReg))
2752 return false;
2753 NewMaxOffset = NewOffset;
2754 }
2755
2756 // FIXME: We should be able to handle some level of scalable offset support
2757 // for 'void', but in order to get basic support up and running this is
2758 // being left out.
2759 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2760 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2761 return false;
2762
2763 // Update the use.
2764 LU.MinOffset = NewMinOffset;
2765 LU.MaxOffset = NewMaxOffset;
2766 LU.AccessTy = NewAccessTy;
2767 return true;
2768}
2769
2770/// Return an LSRUse index and an offset value for a fixup which needs the given
2771/// expression, with the given kind and optional access type. Either reuse an
2772/// existing use or create a new one, as needed.
2773std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2774 LSRUse::KindType Kind,
2775 MemAccessTy AccessTy) {
2776 const SCEV *Copy = Expr;
2777 Immediate Offset = ExtractImmediate(S&: Expr, SE);
2778
2779 // Basic uses can't accept any offset, for example.
2780 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2781 BaseOffset: Offset, /*HasBaseReg=*/ true)) {
2782 Expr = Copy;
2783 Offset = Immediate::getFixed(MinVal: 0);
2784 }
2785
2786 std::pair<UseMapTy::iterator, bool> P =
2787 UseMap.try_emplace(Key: LSRUse::SCEVUseKindPair(Expr, Kind));
2788 if (!P.second) {
2789 // A use already existed with this base.
2790 size_t LUIdx = P.first->second;
2791 LSRUse &LU = Uses[LUIdx];
2792 if (reconcileNewOffset(LU, NewOffset: Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2793 // Reuse this use.
2794 return std::make_pair(x&: LUIdx, y&: Offset);
2795 }
2796
2797 // Create a new use.
2798 size_t LUIdx = Uses.size();
2799 P.first->second = LUIdx;
2800 Uses.push_back(Elt: LSRUse(Kind, AccessTy));
2801 LSRUse &LU = Uses[LUIdx];
2802
2803 LU.MinOffset = Offset;
2804 LU.MaxOffset = Offset;
2805 return std::make_pair(x&: LUIdx, y&: Offset);
2806}
2807
2808/// Delete the given use from the Uses list.
2809void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2810 if (&LU != &Uses.back())
2811 std::swap(a&: LU, b&: Uses.back());
2812 Uses.pop_back();
2813
2814 // Update RegUses.
2815 RegUses.swapAndDropUse(LUIdx, LastLUIdx: Uses.size());
2816}
2817
2818/// Look for a use distinct from OrigLU which is has a formula that has the same
2819/// registers as the given formula.
2820LSRUse *
2821LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2822 const LSRUse &OrigLU) {
2823 // Search all uses for the formula. This could be more clever.
2824 for (LSRUse &LU : Uses) {
2825 // Check whether this use is close enough to OrigLU, to see whether it's
2826 // worthwhile looking through its formulae.
2827 // Ignore ICmpZero uses because they may contain formulae generated by
2828 // GenerateICmpZeroScales, in which case adding fixup offsets may
2829 // be invalid.
2830 if (&LU != &OrigLU &&
2831 LU.Kind != LSRUse::ICmpZero &&
2832 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2833 LU.WidestFixupType == OrigLU.WidestFixupType &&
2834 LU.HasFormulaWithSameRegs(F: OrigF)) {
2835 // Scan through this use's formulae.
2836 for (const Formula &F : LU.Formulae) {
2837 // Check to see if this formula has the same registers and symbols
2838 // as OrigF.
2839 if (F.BaseRegs == OrigF.BaseRegs &&
2840 F.ScaledReg == OrigF.ScaledReg &&
2841 F.BaseGV == OrigF.BaseGV &&
2842 F.Scale == OrigF.Scale &&
2843 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2844 if (F.BaseOffset.isZero())
2845 return &LU;
2846 // This is the formula where all the registers and symbols matched;
2847 // there aren't going to be any others. Since we declined it, we
2848 // can skip the rest of the formulae and proceed to the next LSRUse.
2849 break;
2850 }
2851 }
2852 }
2853 }
2854
2855 // Nothing looked good.
2856 return nullptr;
2857}
2858
2859void LSRInstance::CollectInterestingTypesAndFactors() {
2860 SmallSetVector<const SCEV *, 4> Strides;
2861
2862 // Collect interesting types and strides.
2863 SmallVector<const SCEV *, 4> Worklist;
2864 for (const IVStrideUse &U : IU) {
2865 const SCEV *Expr = IU.getExpr(IU: U);
2866 if (!Expr)
2867 continue;
2868
2869 // Collect interesting types.
2870 Types.insert(X: SE.getEffectiveSCEVType(Ty: Expr->getType()));
2871
2872 // Add strides for mentioned loops.
2873 Worklist.push_back(Elt: Expr);
2874 do {
2875 const SCEV *S = Worklist.pop_back_val();
2876 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: S)) {
2877 if (AR->getLoop() == L)
2878 Strides.insert(X: AR->getStepRecurrence(SE));
2879 Worklist.push_back(Elt: AR->getStart());
2880 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Val: S)) {
2881 append_range(C&: Worklist, R: Add->operands());
2882 }
2883 } while (!Worklist.empty());
2884 }
2885
2886 // Compute interesting factors from the set of interesting strides.
2887 for (SmallSetVector<const SCEV *, 4>::const_iterator
2888 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2889 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2890 std::next(x: I); NewStrideIter != E; ++NewStrideIter) {
2891 const SCEV *OldStride = *I;
2892 const SCEV *NewStride = *NewStrideIter;
2893
2894 if (SE.getTypeSizeInBits(Ty: OldStride->getType()) !=
2895 SE.getTypeSizeInBits(Ty: NewStride->getType())) {
2896 if (SE.getTypeSizeInBits(Ty: OldStride->getType()) >
2897 SE.getTypeSizeInBits(Ty: NewStride->getType()))
2898 NewStride = SE.getSignExtendExpr(Op: NewStride, Ty: OldStride->getType());
2899 else
2900 OldStride = SE.getSignExtendExpr(Op: OldStride, Ty: NewStride->getType());
2901 }
2902 if (const SCEVConstant *Factor =
2903 dyn_cast_or_null<SCEVConstant>(Val: getExactSDiv(LHS: NewStride, RHS: OldStride,
2904 SE, IgnoreSignificantBits: true))) {
2905 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2906 Factors.insert(X: Factor->getAPInt().getSExtValue());
2907 } else if (const SCEVConstant *Factor =
2908 dyn_cast_or_null<SCEVConstant>(Val: getExactSDiv(LHS: OldStride,
2909 RHS: NewStride,
2910 SE, IgnoreSignificantBits: true))) {
2911 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2912 Factors.insert(X: Factor->getAPInt().getSExtValue());
2913 }
2914 }
2915
2916 // If all uses use the same type, don't bother looking for truncation-based
2917 // reuse.
2918 if (Types.size() == 1)
2919 Types.clear();
2920
2921 LLVM_DEBUG(print_factors_and_types(dbgs()));
2922}
2923
2924/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2925/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2926/// IVStrideUses, we could partially skip this.
2927static User::op_iterator
2928findIVOperand(User::op_iterator OI, User::op_iterator OE,
2929 Loop *L, ScalarEvolution &SE) {
2930 for(; OI != OE; ++OI) {
2931 if (Instruction *Oper = dyn_cast<Instruction>(Val&: *OI)) {
2932 if (!SE.isSCEVable(Ty: Oper->getType()))
2933 continue;
2934
2935 if (const SCEVAddRecExpr *AR =
2936 dyn_cast<SCEVAddRecExpr>(Val: SE.getSCEV(V: Oper))) {
2937 if (AR->getLoop() == L)
2938 break;
2939 }
2940 }
2941 }
2942 return OI;
2943}
2944
2945/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2946/// a convenient helper.
2947static Value *getWideOperand(Value *Oper) {
2948 if (TruncInst *Trunc = dyn_cast<TruncInst>(Val: Oper))
2949 return Trunc->getOperand(i_nocapture: 0);
2950 return Oper;
2951}
2952
2953/// Return an approximation of this SCEV expression's "base", or NULL for any
2954/// constant. Returning the expression itself is conservative. Returning a
2955/// deeper subexpression is more precise and valid as long as it isn't less
2956/// complex than another subexpression. For expressions involving multiple
2957/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2958/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2959/// IVInc==b-a.
2960///
2961/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2962/// SCEVUnknown, we simply return the rightmost SCEV operand.
2963static const SCEV *getExprBase(const SCEV *S) {
2964 switch (S->getSCEVType()) {
2965 default: // including scUnknown.
2966 return S;
2967 case scConstant:
2968 case scVScale:
2969 return nullptr;
2970 case scTruncate:
2971 return getExprBase(S: cast<SCEVTruncateExpr>(Val: S)->getOperand());
2972 case scZeroExtend:
2973 return getExprBase(S: cast<SCEVZeroExtendExpr>(Val: S)->getOperand());
2974 case scSignExtend:
2975 return getExprBase(S: cast<SCEVSignExtendExpr>(Val: S)->getOperand());
2976 case scAddExpr: {
2977 // Skip over scaled operands (scMulExpr) to follow add operands as long as
2978 // there's nothing more complex.
2979 // FIXME: not sure if we want to recognize negation.
2980 const SCEVAddExpr *Add = cast<SCEVAddExpr>(Val: S);
2981 for (const SCEV *SubExpr : reverse(C: Add->operands())) {
2982 if (SubExpr->getSCEVType() == scAddExpr)
2983 return getExprBase(S: SubExpr);
2984
2985 if (SubExpr->getSCEVType() != scMulExpr)
2986 return SubExpr;
2987 }
2988 return S; // all operands are scaled, be conservative.
2989 }
2990 case scAddRecExpr:
2991 return getExprBase(S: cast<SCEVAddRecExpr>(Val: S)->getStart());
2992 }
2993 llvm_unreachable("Unknown SCEV kind!");
2994}
2995
2996/// Return true if the chain increment is profitable to expand into a loop
2997/// invariant value, which may require its own register. A profitable chain
2998/// increment will be an offset relative to the same base. We allow such offsets
2999/// to potentially be used as chain increment as long as it's not obviously
3000/// expensive to expand using real instructions.
3001bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3002 const SCEV *IncExpr,
3003 ScalarEvolution &SE) {
3004 // Aggressively form chains when -stress-ivchain.
3005 if (StressIVChain)
3006 return true;
3007
3008 // Do not replace a constant offset from IV head with a nonconstant IV
3009 // increment.
3010 if (!isa<SCEVConstant>(Val: IncExpr)) {
3011 const SCEV *HeadExpr = SE.getSCEV(V: getWideOperand(Oper: Incs[0].IVOperand));
3012 if (isa<SCEVConstant>(Val: SE.getMinusSCEV(LHS: OperExpr, RHS: HeadExpr)))
3013 return false;
3014 }
3015
3016 SmallPtrSet<const SCEV*, 8> Processed;
3017 return !isHighCostExpansion(S: IncExpr, Processed, SE);
3018}
3019
3020/// Return true if the number of registers needed for the chain is estimated to
3021/// be less than the number required for the individual IV users. First prohibit
3022/// any IV users that keep the IV live across increments (the Users set should
3023/// be empty). Next count the number and type of increments in the chain.
3024///
3025/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3026/// effectively use postinc addressing modes. Only consider it profitable it the
3027/// increments can be computed in fewer registers when chained.
3028///
3029/// TODO: Consider IVInc free if it's already used in another chains.
3030static bool isProfitableChain(IVChain &Chain,
3031 SmallPtrSetImpl<Instruction *> &Users,
3032 ScalarEvolution &SE,
3033 const TargetTransformInfo &TTI) {
3034 if (StressIVChain)
3035 return true;
3036
3037 if (!Chain.hasIncs())
3038 return false;
3039
3040 if (!Users.empty()) {
3041 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3042 for (Instruction *Inst
3043 : Users) { dbgs() << " " << *Inst << "\n"; });
3044 return false;
3045 }
3046 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3047
3048 // The chain itself may require a register, so intialize cost to 1.
3049 int cost = 1;
3050
3051 // A complete chain likely eliminates the need for keeping the original IV in
3052 // a register. LSR does not currently know how to form a complete chain unless
3053 // the header phi already exists.
3054 if (isa<PHINode>(Val: Chain.tailUserInst())
3055 && SE.getSCEV(V: Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3056 --cost;
3057 }
3058 const SCEV *LastIncExpr = nullptr;
3059 unsigned NumConstIncrements = 0;
3060 unsigned NumVarIncrements = 0;
3061 unsigned NumReusedIncrements = 0;
3062
3063 if (TTI.isProfitableLSRChainElement(I: Chain.Incs[0].UserInst))
3064 return true;
3065
3066 for (const IVInc &Inc : Chain) {
3067 if (TTI.isProfitableLSRChainElement(I: Inc.UserInst))
3068 return true;
3069 if (Inc.IncExpr->isZero())
3070 continue;
3071
3072 // Incrementing by zero or some constant is neutral. We assume constants can
3073 // be folded into an addressing mode or an add's immediate operand.
3074 if (isa<SCEVConstant>(Val: Inc.IncExpr)) {
3075 ++NumConstIncrements;
3076 continue;
3077 }
3078
3079 if (Inc.IncExpr == LastIncExpr)
3080 ++NumReusedIncrements;
3081 else
3082 ++NumVarIncrements;
3083
3084 LastIncExpr = Inc.IncExpr;
3085 }
3086 // An IV chain with a single increment is handled by LSR's postinc
3087 // uses. However, a chain with multiple increments requires keeping the IV's
3088 // value live longer than it needs to be if chained.
3089 if (NumConstIncrements > 1)
3090 --cost;
3091
3092 // Materializing increment expressions in the preheader that didn't exist in
3093 // the original code may cost a register. For example, sign-extended array
3094 // indices can produce ridiculous increments like this:
3095 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3096 cost += NumVarIncrements;
3097
3098 // Reusing variable increments likely saves a register to hold the multiple of
3099 // the stride.
3100 cost -= NumReusedIncrements;
3101
3102 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3103 << "\n");
3104
3105 return cost < 0;
3106}
3107
3108/// Add this IV user to an existing chain or make it the head of a new chain.
3109void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3110 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3111 // When IVs are used as types of varying widths, they are generally converted
3112 // to a wider type with some uses remaining narrow under a (free) trunc.
3113 Value *const NextIV = getWideOperand(Oper: IVOper);
3114 const SCEV *const OperExpr = SE.getSCEV(V: NextIV);
3115 const SCEV *const OperExprBase = getExprBase(S: OperExpr);
3116
3117 // Visit all existing chains. Check if its IVOper can be computed as a
3118 // profitable loop invariant increment from the last link in the Chain.
3119 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3120 const SCEV *LastIncExpr = nullptr;
3121 for (; ChainIdx < NChains; ++ChainIdx) {
3122 IVChain &Chain = IVChainVec[ChainIdx];
3123
3124 // Prune the solution space aggressively by checking that both IV operands
3125 // are expressions that operate on the same unscaled SCEVUnknown. This
3126 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3127 // first avoids creating extra SCEV expressions.
3128 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3129 continue;
3130
3131 Value *PrevIV = getWideOperand(Oper: Chain.Incs.back().IVOperand);
3132 if (PrevIV->getType() != NextIV->getType())
3133 continue;
3134
3135 // A phi node terminates a chain.
3136 if (isa<PHINode>(Val: UserInst) && isa<PHINode>(Val: Chain.tailUserInst()))
3137 continue;
3138
3139 // The increment must be loop-invariant so it can be kept in a register.
3140 const SCEV *PrevExpr = SE.getSCEV(V: PrevIV);
3141 const SCEV *IncExpr = SE.getMinusSCEV(LHS: OperExpr, RHS: PrevExpr);
3142 if (isa<SCEVCouldNotCompute>(Val: IncExpr) || !SE.isLoopInvariant(S: IncExpr, L))
3143 continue;
3144
3145 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3146 LastIncExpr = IncExpr;
3147 break;
3148 }
3149 }
3150 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3151 // bother for phi nodes, because they must be last in the chain.
3152 if (ChainIdx == NChains) {
3153 if (isa<PHINode>(Val: UserInst))
3154 return;
3155 if (NChains >= MaxChains && !StressIVChain) {
3156 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3157 return;
3158 }
3159 LastIncExpr = OperExpr;
3160 // IVUsers may have skipped over sign/zero extensions. We don't currently
3161 // attempt to form chains involving extensions unless they can be hoisted
3162 // into this loop's AddRec.
3163 if (!isa<SCEVAddRecExpr>(Val: LastIncExpr))
3164 return;
3165 ++NChains;
3166 IVChainVec.push_back(Elt: IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3167 OperExprBase));
3168 ChainUsersVec.resize(N: NChains);
3169 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3170 << ") IV=" << *LastIncExpr << "\n");
3171 } else {
3172 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3173 << ") IV+" << *LastIncExpr << "\n");
3174 // Add this IV user to the end of the chain.
3175 IVChainVec[ChainIdx].add(X: IVInc(UserInst, IVOper, LastIncExpr));
3176 }
3177 IVChain &Chain = IVChainVec[ChainIdx];
3178
3179 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3180 // This chain's NearUsers become FarUsers.
3181 if (!LastIncExpr->isZero()) {
3182 ChainUsersVec[ChainIdx].FarUsers.insert_range(R&: NearUsers);
3183 NearUsers.clear();
3184 }
3185
3186 // All other uses of IVOperand become near uses of the chain.
3187 // We currently ignore intermediate values within SCEV expressions, assuming
3188 // they will eventually be used be the current chain, or can be computed
3189 // from one of the chain increments. To be more precise we could
3190 // transitively follow its user and only add leaf IV users to the set.
3191 for (User *U : IVOper->users()) {
3192 Instruction *OtherUse = dyn_cast<Instruction>(Val: U);
3193 if (!OtherUse)
3194 continue;
3195 // Uses in the chain will no longer be uses if the chain is formed.
3196 // Include the head of the chain in this iteration (not Chain.begin()).
3197 IVChain::const_iterator IncIter = Chain.Incs.begin();
3198 IVChain::const_iterator IncEnd = Chain.Incs.end();
3199 for( ; IncIter != IncEnd; ++IncIter) {
3200 if (IncIter->UserInst == OtherUse)
3201 break;
3202 }
3203 if (IncIter != IncEnd)
3204 continue;
3205
3206 if (SE.isSCEVable(Ty: OtherUse->getType())
3207 && !isa<SCEVUnknown>(Val: SE.getSCEV(V: OtherUse))
3208 && IU.isIVUserOrOperand(Inst: OtherUse)) {
3209 continue;
3210 }
3211 NearUsers.insert(Ptr: OtherUse);
3212 }
3213
3214 // Since this user is part of the chain, it's no longer considered a use
3215 // of the chain.
3216 ChainUsersVec[ChainIdx].FarUsers.erase(Ptr: UserInst);
3217}
3218
3219/// Populate the vector of Chains.
3220///
3221/// This decreases ILP at the architecture level. Targets with ample registers,
3222/// multiple memory ports, and no register renaming probably don't want
3223/// this. However, such targets should probably disable LSR altogether.
3224///
3225/// The job of LSR is to make a reasonable choice of induction variables across
3226/// the loop. Subsequent passes can easily "unchain" computation exposing more
3227/// ILP *within the loop* if the target wants it.
3228///
3229/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3230/// will not reorder memory operations, it will recognize this as a chain, but
3231/// will generate redundant IV increments. Ideally this would be corrected later
3232/// by a smart scheduler:
3233/// = A[i]
3234/// = A[i+x]
3235/// A[i] =
3236/// A[i+x] =
3237///
3238/// TODO: Walk the entire domtree within this loop, not just the path to the
3239/// loop latch. This will discover chains on side paths, but requires
3240/// maintaining multiple copies of the Chains state.
3241void LSRInstance::CollectChains() {
3242 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3243 SmallVector<ChainUsers, 8> ChainUsersVec;
3244
3245 SmallVector<BasicBlock *,8> LatchPath;
3246 BasicBlock *LoopHeader = L->getHeader();
3247 for (DomTreeNode *Rung = DT.getNode(BB: L->getLoopLatch());
3248 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3249 LatchPath.push_back(Elt: Rung->getBlock());
3250 }
3251 LatchPath.push_back(Elt: LoopHeader);
3252
3253 // Walk the instruction stream from the loop header to the loop latch.
3254 for (BasicBlock *BB : reverse(C&: LatchPath)) {
3255 for (Instruction &I : *BB) {
3256 // Skip instructions that weren't seen by IVUsers analysis.
3257 if (isa<PHINode>(Val: I) || !IU.isIVUserOrOperand(Inst: &I))
3258 continue;
3259
3260 // Ignore users that are part of a SCEV expression. This way we only
3261 // consider leaf IV Users. This effectively rediscovers a portion of
3262 // IVUsers analysis but in program order this time.
3263 if (SE.isSCEVable(Ty: I.getType()) && !isa<SCEVUnknown>(Val: SE.getSCEV(V: &I)))
3264 continue;
3265
3266 // Remove this instruction from any NearUsers set it may be in.
3267 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3268 ChainIdx < NChains; ++ChainIdx) {
3269 ChainUsersVec[ChainIdx].NearUsers.erase(Ptr: &I);
3270 }
3271 // Search for operands that can be chained.
3272 SmallPtrSet<Instruction*, 4> UniqueOperands;
3273 User::op_iterator IVOpEnd = I.op_end();
3274 User::op_iterator IVOpIter = findIVOperand(OI: I.op_begin(), OE: IVOpEnd, L, SE);
3275 while (IVOpIter != IVOpEnd) {
3276 Instruction *IVOpInst = cast<Instruction>(Val&: *IVOpIter);
3277 if (UniqueOperands.insert(Ptr: IVOpInst).second)
3278 ChainInstruction(UserInst: &I, IVOper: IVOpInst, ChainUsersVec);
3279 IVOpIter = findIVOperand(OI: std::next(x: IVOpIter), OE: IVOpEnd, L, SE);
3280 }
3281 } // Continue walking down the instructions.
3282 } // Continue walking down the domtree.
3283 // Visit phi backedges to determine if the chain can generate the IV postinc.
3284 for (PHINode &PN : L->getHeader()->phis()) {
3285 if (!SE.isSCEVable(Ty: PN.getType()))
3286 continue;
3287
3288 Instruction *IncV =
3289 dyn_cast<Instruction>(Val: PN.getIncomingValueForBlock(BB: L->getLoopLatch()));
3290 if (IncV)
3291 ChainInstruction(UserInst: &PN, IVOper: IncV, ChainUsersVec);
3292 }
3293 // Remove any unprofitable chains.
3294 unsigned ChainIdx = 0;
3295 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3296 UsersIdx < NChains; ++UsersIdx) {
3297 if (!isProfitableChain(Chain&: IVChainVec[UsersIdx],
3298 Users&: ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3299 continue;
3300 // Preserve the chain at UsesIdx.
3301 if (ChainIdx != UsersIdx)
3302 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3303 FinalizeChain(Chain&: IVChainVec[ChainIdx]);
3304 ++ChainIdx;
3305 }
3306 IVChainVec.resize(N: ChainIdx);
3307}
3308
3309void LSRInstance::FinalizeChain(IVChain &Chain) {
3310 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3311 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3312
3313 for (const IVInc &Inc : Chain) {
3314 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3315 auto UseI = find(Range: Inc.UserInst->operands(), Val: Inc.IVOperand);
3316 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3317 IVIncSet.insert(Ptr: UseI);
3318 }
3319}
3320
3321/// Return true if the IVInc can be folded into an addressing mode.
3322static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3323 Value *Operand, const TargetTransformInfo &TTI) {
3324 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(Val: IncExpr);
3325 Immediate IncOffset = Immediate::getZero();
3326 if (IncConst) {
3327 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3328 return false;
3329 IncOffset = Immediate::getFixed(MinVal: IncConst->getValue()->getSExtValue());
3330 } else {
3331 // Look for mul(vscale, constant), to detect a scalable offset.
3332 const APInt *C;
3333 if (!match(S: IncExpr, P: m_scev_Mul(Op0: m_scev_APInt(C), Op1: m_SCEVVScale())) ||
3334 C->getSignificantBits() > 64)
3335 return false;
3336 IncOffset = Immediate::getScalable(MinVal: C->getSExtValue());
3337 }
3338
3339 if (!isAddressUse(TTI, Inst: UserInst, OperandVal: Operand))
3340 return false;
3341
3342 MemAccessTy AccessTy = getAccessType(TTI, Inst: UserInst, OperandVal: Operand);
3343 if (!isAlwaysFoldable(TTI, Kind: LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3344 BaseOffset: IncOffset, /*HasBaseReg=*/false))
3345 return false;
3346
3347 return true;
3348}
3349
3350/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3351/// user's operand from the previous IV user's operand.
3352void LSRInstance::GenerateIVChain(const IVChain &Chain,
3353 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3354 // Find the new IVOperand for the head of the chain. It may have been replaced
3355 // by LSR.
3356 const IVInc &Head = Chain.Incs[0];
3357 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3358 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3359 User::op_iterator IVOpIter = findIVOperand(OI: Head.UserInst->op_begin(),
3360 OE: IVOpEnd, L, SE);
3361 Value *IVSrc = nullptr;
3362 while (IVOpIter != IVOpEnd) {
3363 IVSrc = getWideOperand(Oper: *IVOpIter);
3364
3365 // If this operand computes the expression that the chain needs, we may use
3366 // it. (Check this after setting IVSrc which is used below.)
3367 //
3368 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3369 // narrow for the chain, so we can no longer use it. We do allow using a
3370 // wider phi, assuming the LSR checked for free truncation. In that case we
3371 // should already have a truncate on this operand such that
3372 // getSCEV(IVSrc) == IncExpr.
3373 if (SE.getSCEV(V: *IVOpIter) == Head.IncExpr
3374 || SE.getSCEV(V: IVSrc) == Head.IncExpr) {
3375 break;
3376 }
3377 IVOpIter = findIVOperand(OI: std::next(x: IVOpIter), OE: IVOpEnd, L, SE);
3378 }
3379 if (IVOpIter == IVOpEnd) {
3380 // Gracefully give up on this chain.
3381 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3382 return;
3383 }
3384 assert(IVSrc && "Failed to find IV chain source");
3385
3386 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3387 Type *IVTy = IVSrc->getType();
3388 Type *IntTy = SE.getEffectiveSCEVType(Ty: IVTy);
3389 const SCEV *LeftOverExpr = nullptr;
3390 const SCEV *Accum = SE.getZero(Ty: IntTy);
3391 SmallVector<std::pair<const SCEV *, Value *>> Bases;
3392 Bases.emplace_back(Args&: Accum, Args&: IVSrc);
3393
3394 for (const IVInc &Inc : Chain) {
3395 Instruction *InsertPt = Inc.UserInst;
3396 if (isa<PHINode>(Val: InsertPt))
3397 InsertPt = L->getLoopLatch()->getTerminator();
3398
3399 // IVOper will replace the current IV User's operand. IVSrc is the IV
3400 // value currently held in a register.
3401 Value *IVOper = IVSrc;
3402 if (!Inc.IncExpr->isZero()) {
3403 // IncExpr was the result of subtraction of two narrow values, so must
3404 // be signed.
3405 const SCEV *IncExpr = SE.getNoopOrSignExtend(V: Inc.IncExpr, Ty: IntTy);
3406 Accum = SE.getAddExpr(LHS: Accum, RHS: IncExpr);
3407 LeftOverExpr = LeftOverExpr ?
3408 SE.getAddExpr(LHS: LeftOverExpr, RHS: IncExpr) : IncExpr;
3409 }
3410
3411 // Look through each base to see if any can produce a nice addressing mode.
3412 bool FoundBase = false;
3413 for (auto [MapScev, MapIVOper] : reverse(C&: Bases)) {
3414 const SCEV *Remainder = SE.getMinusSCEV(LHS: Accum, RHS: MapScev);
3415 if (canFoldIVIncExpr(IncExpr: Remainder, UserInst: Inc.UserInst, Operand: Inc.IVOperand, TTI)) {
3416 if (!Remainder->isZero()) {
3417 Rewriter.clearPostInc();
3418 Value *IncV = Rewriter.expandCodeFor(SH: Remainder, Ty: IntTy, I: InsertPt);
3419 const SCEV *IVOperExpr =
3420 SE.getAddExpr(LHS: SE.getUnknown(V: MapIVOper), RHS: SE.getUnknown(V: IncV));
3421 IVOper = Rewriter.expandCodeFor(SH: IVOperExpr, Ty: IVTy, I: InsertPt);
3422 } else {
3423 IVOper = MapIVOper;
3424 }
3425
3426 FoundBase = true;
3427 break;
3428 }
3429 }
3430 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3431 // Expand the IV increment.
3432 Rewriter.clearPostInc();
3433 Value *IncV = Rewriter.expandCodeFor(SH: LeftOverExpr, Ty: IntTy, I: InsertPt);
3434 const SCEV *IVOperExpr = SE.getAddExpr(LHS: SE.getUnknown(V: IVSrc),
3435 RHS: SE.getUnknown(V: IncV));
3436 IVOper = Rewriter.expandCodeFor(SH: IVOperExpr, Ty: IVTy, I: InsertPt);
3437
3438 // If an IV increment can't be folded, use it as the next IV value.
3439 if (!canFoldIVIncExpr(IncExpr: LeftOverExpr, UserInst: Inc.UserInst, Operand: Inc.IVOperand, TTI)) {
3440 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3441 Bases.emplace_back(Args&: Accum, Args&: IVOper);
3442 IVSrc = IVOper;
3443 LeftOverExpr = nullptr;
3444 }
3445 }
3446 Type *OperTy = Inc.IVOperand->getType();
3447 if (IVTy != OperTy) {
3448 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3449 "cannot extend a chained IV");
3450 IRBuilder<> Builder(InsertPt);
3451 IVOper = Builder.CreateTruncOrBitCast(V: IVOper, DestTy: OperTy, Name: "lsr.chain");
3452 }
3453 Inc.UserInst->replaceUsesOfWith(From: Inc.IVOperand, To: IVOper);
3454 if (auto *OperandIsInstr = dyn_cast<Instruction>(Val: Inc.IVOperand))
3455 DeadInsts.emplace_back(Args&: OperandIsInstr);
3456 }
3457 // If LSR created a new, wider phi, we may also replace its postinc. We only
3458 // do this if we also found a wide value for the head of the chain.
3459 if (isa<PHINode>(Val: Chain.tailUserInst())) {
3460 for (PHINode &Phi : L->getHeader()->phis()) {
3461 if (Phi.getType() != IVSrc->getType())
3462 continue;
3463 Instruction *PostIncV = dyn_cast<Instruction>(
3464 Val: Phi.getIncomingValueForBlock(BB: L->getLoopLatch()));
3465 if (!PostIncV || (SE.getSCEV(V: PostIncV) != SE.getSCEV(V: IVSrc)))
3466 continue;
3467 Value *IVOper = IVSrc;
3468 Type *PostIncTy = PostIncV->getType();
3469 if (IVTy != PostIncTy) {
3470 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3471 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3472 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3473 IVOper = Builder.CreatePointerCast(V: IVSrc, DestTy: PostIncTy, Name: "lsr.chain");
3474 }
3475 Phi.replaceUsesOfWith(From: PostIncV, To: IVOper);
3476 DeadInsts.emplace_back(Args&: PostIncV);
3477 }
3478 }
3479}
3480
3481void LSRInstance::CollectFixupsAndInitialFormulae() {
3482 BranchInst *ExitBranch = nullptr;
3483 bool SaveCmp = TTI.canSaveCmp(L, BI: &ExitBranch, SE: &SE, LI: &LI, DT: &DT, AC: &AC, LibInfo: &TLI);
3484
3485 // For calculating baseline cost
3486 SmallPtrSet<const SCEV *, 16> Regs;
3487 DenseSet<const SCEV *> VisitedRegs;
3488 DenseSet<size_t> VisitedLSRUse;
3489
3490 for (const IVStrideUse &U : IU) {
3491 Instruction *UserInst = U.getUser();
3492 // Skip IV users that are part of profitable IV Chains.
3493 User::op_iterator UseI =
3494 find(Range: UserInst->operands(), Val: U.getOperandValToReplace());
3495 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3496 if (IVIncSet.count(Ptr: UseI)) {
3497 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3498 continue;
3499 }
3500
3501 LSRUse::KindType Kind = LSRUse::Basic;
3502 MemAccessTy AccessTy;
3503 if (isAddressUse(TTI, Inst: UserInst, OperandVal: U.getOperandValToReplace())) {
3504 Kind = LSRUse::Address;
3505 AccessTy = getAccessType(TTI, Inst: UserInst, OperandVal: U.getOperandValToReplace());
3506 }
3507
3508 const SCEV *S = IU.getExpr(IU: U);
3509 if (!S)
3510 continue;
3511 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3512
3513 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3514 // (N - i == 0), and this allows (N - i) to be the expression that we work
3515 // with rather than just N or i, so we can consider the register
3516 // requirements for both N and i at the same time. Limiting this code to
3517 // equality icmps is not a problem because all interesting loops use
3518 // equality icmps, thanks to IndVarSimplify.
3519 if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: UserInst)) {
3520 // If CI can be saved in some target, like replaced inside hardware loop
3521 // in PowerPC, no need to generate initial formulae for it.
3522 if (SaveCmp && CI == dyn_cast<ICmpInst>(Val: ExitBranch->getCondition()))
3523 continue;
3524 if (CI->isEquality()) {
3525 // Swap the operands if needed to put the OperandValToReplace on the
3526 // left, for consistency.
3527 Value *NV = CI->getOperand(i_nocapture: 1);
3528 if (NV == U.getOperandValToReplace()) {
3529 CI->setOperand(i_nocapture: 1, Val_nocapture: CI->getOperand(i_nocapture: 0));
3530 CI->setOperand(i_nocapture: 0, Val_nocapture: NV);
3531 NV = CI->getOperand(i_nocapture: 1);
3532 Changed = true;
3533 }
3534
3535 // x == y --> x - y == 0
3536 const SCEV *N = SE.getSCEV(V: NV);
3537 if (SE.isLoopInvariant(S: N, L) && Rewriter.isSafeToExpand(S: N) &&
3538 (!NV->getType()->isPointerTy() ||
3539 SE.getPointerBase(V: N) == SE.getPointerBase(V: S))) {
3540 // S is normalized, so normalize N before folding it into S
3541 // to keep the result normalized.
3542 N = normalizeForPostIncUse(S: N, Loops: TmpPostIncLoops, SE);
3543 if (!N)
3544 continue;
3545 Kind = LSRUse::ICmpZero;
3546 S = SE.getMinusSCEV(LHS: N, RHS: S);
3547 } else if (L->isLoopInvariant(V: NV) &&
3548 (!isa<Instruction>(Val: NV) ||
3549 DT.dominates(Def: cast<Instruction>(Val: NV), BB: L->getHeader())) &&
3550 !NV->getType()->isPointerTy()) {
3551 // If we can't generally expand the expression (e.g. it contains
3552 // a divide), but it is already at a loop invariant point before the
3553 // loop, wrap it in an unknown (to prevent the expander from trying
3554 // to re-expand in a potentially unsafe way.) The restriction to
3555 // integer types is required because the unknown hides the base, and
3556 // SCEV can't compute the difference of two unknown pointers.
3557 N = SE.getUnknown(V: NV);
3558 N = normalizeForPostIncUse(S: N, Loops: TmpPostIncLoops, SE);
3559 if (!N)
3560 continue;
3561 Kind = LSRUse::ICmpZero;
3562 S = SE.getMinusSCEV(LHS: N, RHS: S);
3563 assert(!isa<SCEVCouldNotCompute>(S));
3564 }
3565
3566 // -1 and the negations of all interesting strides (except the negation
3567 // of -1) are now also interesting.
3568 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3569 if (Factors[i] != -1)
3570 Factors.insert(X: -(uint64_t)Factors[i]);
3571 Factors.insert(X: -1);
3572 }
3573 }
3574
3575 // Get or create an LSRUse.
3576 std::pair<size_t, Immediate> P = getUse(Expr&: S, Kind, AccessTy);
3577 size_t LUIdx = P.first;
3578 Immediate Offset = P.second;
3579 LSRUse &LU = Uses[LUIdx];
3580
3581 // Record the fixup.
3582 LSRFixup &LF = LU.getNewFixup();
3583 LF.UserInst = UserInst;
3584 LF.OperandValToReplace = U.getOperandValToReplace();
3585 LF.PostIncLoops = TmpPostIncLoops;
3586 LF.Offset = Offset;
3587 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3588
3589 // Create SCEV as Formula for calculating baseline cost
3590 if (!VisitedLSRUse.count(V: LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3591 Formula F;
3592 F.initialMatch(S, L, SE);
3593 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3594 VisitedLSRUse.insert(V: LUIdx);
3595 }
3596
3597 if (!LU.WidestFixupType ||
3598 SE.getTypeSizeInBits(Ty: LU.WidestFixupType) <
3599 SE.getTypeSizeInBits(Ty: LF.OperandValToReplace->getType()))
3600 LU.WidestFixupType = LF.OperandValToReplace->getType();
3601
3602 // If this is the first use of this LSRUse, give it a formula.
3603 if (LU.Formulae.empty()) {
3604 InsertInitialFormula(S, LU, LUIdx);
3605 CountRegisters(F: LU.Formulae.back(), LUIdx);
3606 }
3607 }
3608
3609 LLVM_DEBUG(print_fixups(dbgs()));
3610}
3611
3612/// Insert a formula for the given expression into the given use, separating out
3613/// loop-variant portions from loop-invariant and loop-computable portions.
3614void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3615 size_t LUIdx) {
3616 // Mark uses whose expressions cannot be expanded.
3617 if (!Rewriter.isSafeToExpand(S))
3618 LU.RigidFormula = true;
3619
3620 Formula F;
3621 F.initialMatch(S, L, SE);
3622 bool Inserted = InsertFormula(LU, LUIdx, F);
3623 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3624}
3625
3626/// Insert a simple single-register formula for the given expression into the
3627/// given use.
3628void
3629LSRInstance::InsertSupplementalFormula(const SCEV *S,
3630 LSRUse &LU, size_t LUIdx) {
3631 Formula F;
3632 F.BaseRegs.push_back(Elt: S);
3633 F.HasBaseReg = true;
3634 bool Inserted = InsertFormula(LU, LUIdx, F);
3635 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3636}
3637
3638/// Note which registers are used by the given formula, updating RegUses.
3639void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3640 if (F.ScaledReg)
3641 RegUses.countRegister(Reg: F.ScaledReg, LUIdx);
3642 for (const SCEV *BaseReg : F.BaseRegs)
3643 RegUses.countRegister(Reg: BaseReg, LUIdx);
3644}
3645
3646/// If the given formula has not yet been inserted, add it to the list, and
3647/// return true. Return false otherwise.
3648bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3649 // Do not insert formula that we will not be able to expand.
3650 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3651 "Formula is illegal");
3652
3653 if (!LU.InsertFormula(F, L: *L))
3654 return false;
3655
3656 CountRegisters(F, LUIdx);
3657 return true;
3658}
3659
3660/// Check for other uses of loop-invariant values which we're tracking. These
3661/// other uses will pin these values in registers, making them less profitable
3662/// for elimination.
3663/// TODO: This currently misses non-constant addrec step registers.
3664/// TODO: Should this give more weight to users inside the loop?
3665void
3666LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3667 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3668 SmallPtrSet<const SCEV *, 32> Visited;
3669
3670 // Don't collect outside uses if we are favoring postinc - the instructions in
3671 // the loop are more important than the ones outside of it.
3672 if (AMK == TTI::AMK_PostIndexed)
3673 return;
3674
3675 while (!Worklist.empty()) {
3676 const SCEV *S = Worklist.pop_back_val();
3677
3678 // Don't process the same SCEV twice
3679 if (!Visited.insert(Ptr: S).second)
3680 continue;
3681
3682 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(Val: S))
3683 append_range(C&: Worklist, R: N->operands());
3684 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(Val: S))
3685 Worklist.push_back(Elt: C->getOperand());
3686 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(Val: S)) {
3687 Worklist.push_back(Elt: D->getLHS());
3688 Worklist.push_back(Elt: D->getRHS());
3689 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(Val: S)) {
3690 const Value *V = US->getValue();
3691 if (const Instruction *Inst = dyn_cast<Instruction>(Val: V)) {
3692 // Look for instructions defined outside the loop.
3693 if (L->contains(Inst)) continue;
3694 } else if (isa<Constant>(Val: V))
3695 // Constants can be re-materialized.
3696 continue;
3697 for (const Use &U : V->uses()) {
3698 const Instruction *UserInst = dyn_cast<Instruction>(Val: U.getUser());
3699 // Ignore non-instructions.
3700 if (!UserInst)
3701 continue;
3702 // Don't bother if the instruction is an EHPad.
3703 if (UserInst->isEHPad())
3704 continue;
3705 // Ignore instructions in other functions (as can happen with
3706 // Constants).
3707 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3708 continue;
3709 // Ignore instructions not dominated by the loop.
3710 const BasicBlock *UseBB = !isa<PHINode>(Val: UserInst) ?
3711 UserInst->getParent() :
3712 cast<PHINode>(Val: UserInst)->getIncomingBlock(
3713 i: PHINode::getIncomingValueNumForOperand(i: U.getOperandNo()));
3714 if (!DT.dominates(A: L->getHeader(), B: UseBB))
3715 continue;
3716 // Don't bother if the instruction is in a BB which ends in an EHPad.
3717 if (UseBB->getTerminator()->isEHPad())
3718 continue;
3719
3720 // Ignore cases in which the currently-examined value could come from
3721 // a basic block terminated with an EHPad. This checks all incoming
3722 // blocks of the phi node since it is possible that the same incoming
3723 // value comes from multiple basic blocks, only some of which may end
3724 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3725 // pass would try to insert instructions into an EHPad, hitting an
3726 // assertion.
3727 if (isa<PHINode>(Val: UserInst)) {
3728 const auto *PhiNode = cast<PHINode>(Val: UserInst);
3729 bool HasIncompatibleEHPTerminatedBlock = false;
3730 llvm::Value *ExpectedValue = U;
3731 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3732 if (PhiNode->getIncomingValue(i: I) == ExpectedValue) {
3733 if (PhiNode->getIncomingBlock(i: I)->getTerminator()->isEHPad()) {
3734 HasIncompatibleEHPTerminatedBlock = true;
3735 break;
3736 }
3737 }
3738 }
3739 if (HasIncompatibleEHPTerminatedBlock) {
3740 continue;
3741 }
3742 }
3743
3744 // Don't bother rewriting PHIs in catchswitch blocks.
3745 if (isa<CatchSwitchInst>(Val: UserInst->getParent()->getTerminator()))
3746 continue;
3747 // Ignore uses which are part of other SCEV expressions, to avoid
3748 // analyzing them multiple times.
3749 if (SE.isSCEVable(Ty: UserInst->getType())) {
3750 const SCEV *UserS = SE.getSCEV(V: const_cast<Instruction *>(UserInst));
3751 // If the user is a no-op, look through to its uses.
3752 if (!isa<SCEVUnknown>(Val: UserS))
3753 continue;
3754 if (UserS == US) {
3755 Worklist.push_back(
3756 Elt: SE.getUnknown(V: const_cast<Instruction *>(UserInst)));
3757 continue;
3758 }
3759 }
3760 // Ignore icmp instructions which are already being analyzed.
3761 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Val: UserInst)) {
3762 unsigned OtherIdx = !U.getOperandNo();
3763 Value *OtherOp = const_cast<Value *>(ICI->getOperand(i_nocapture: OtherIdx));
3764 if (SE.hasComputableLoopEvolution(S: SE.getSCEV(V: OtherOp), L))
3765 continue;
3766 }
3767
3768 std::pair<size_t, Immediate> P =
3769 getUse(Expr&: S, Kind: LSRUse::Basic, AccessTy: MemAccessTy());
3770 size_t LUIdx = P.first;
3771 Immediate Offset = P.second;
3772 LSRUse &LU = Uses[LUIdx];
3773 LSRFixup &LF = LU.getNewFixup();
3774 LF.UserInst = const_cast<Instruction *>(UserInst);
3775 LF.OperandValToReplace = U;
3776 LF.Offset = Offset;
3777 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3778 if (!LU.WidestFixupType ||
3779 SE.getTypeSizeInBits(Ty: LU.WidestFixupType) <
3780 SE.getTypeSizeInBits(Ty: LF.OperandValToReplace->getType()))
3781 LU.WidestFixupType = LF.OperandValToReplace->getType();
3782 InsertSupplementalFormula(S: US, LU, LUIdx);
3783 CountRegisters(F: LU.Formulae.back(), LUIdx: Uses.size() - 1);
3784 break;
3785 }
3786 }
3787 }
3788}
3789
3790/// Split S into subexpressions which can be pulled out into separate
3791/// registers. If C is non-null, multiply each subexpression by C.
3792///
3793/// Return remainder expression after factoring the subexpressions captured by
3794/// Ops. If Ops is complete, return NULL.
3795static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3796 SmallVectorImpl<const SCEV *> &Ops,
3797 const Loop *L,
3798 ScalarEvolution &SE,
3799 unsigned Depth = 0) {
3800 // Arbitrarily cap recursion to protect compile time.
3801 if (Depth >= 3)
3802 return S;
3803
3804 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Val: S)) {
3805 // Break out add operands.
3806 for (const SCEV *S : Add->operands()) {
3807 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth: Depth+1);
3808 if (Remainder)
3809 Ops.push_back(Elt: C ? SE.getMulExpr(LHS: C, RHS: Remainder) : Remainder);
3810 }
3811 return nullptr;
3812 }
3813 const SCEV *Start, *Step;
3814 const SCEVConstant *Op0;
3815 const SCEV *Op1;
3816 if (match(S, P: m_scev_AffineAddRec(Op0: m_SCEV(V&: Start), Op1: m_SCEV(V&: Step)))) {
3817 // Split a non-zero base out of an addrec.
3818 if (Start->isZero())
3819 return S;
3820
3821 const SCEV *Remainder = CollectSubexprs(S: Start, C, Ops, L, SE, Depth: Depth + 1);
3822 // Split the non-zero AddRec unless it is part of a nested recurrence that
3823 // does not pertain to this loop.
3824 if (Remainder && (cast<SCEVAddRecExpr>(Val: S)->getLoop() == L ||
3825 !isa<SCEVAddRecExpr>(Val: Remainder))) {
3826 Ops.push_back(Elt: C ? SE.getMulExpr(LHS: C, RHS: Remainder) : Remainder);
3827 Remainder = nullptr;
3828 }
3829 if (Remainder != Start) {
3830 if (!Remainder)
3831 Remainder = SE.getConstant(Ty: S->getType(), V: 0);
3832 return SE.getAddRecExpr(Start: Remainder, Step,
3833 L: cast<SCEVAddRecExpr>(Val: S)->getLoop(),
3834 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3835 Flags: SCEV::FlagAnyWrap);
3836 }
3837 } else if (match(S, P: m_scev_Mul(Op0: m_SCEVConstant(V&: Op0), Op1: m_SCEV(V&: Op1)))) {
3838 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3839 C = C ? cast<SCEVConstant>(Val: SE.getMulExpr(LHS: C, RHS: Op0)) : Op0;
3840 const SCEV *Remainder = CollectSubexprs(S: Op1, C, Ops, L, SE, Depth: Depth + 1);
3841 if (Remainder)
3842 Ops.push_back(Elt: SE.getMulExpr(LHS: C, RHS: Remainder));
3843 return nullptr;
3844 }
3845 return S;
3846}
3847
3848/// Return true if the SCEV represents a value that may end up as a
3849/// post-increment operation.
3850static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3851 LSRUse &LU, const SCEV *S, const Loop *L,
3852 ScalarEvolution &SE) {
3853 if (LU.Kind != LSRUse::Address ||
3854 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3855 return false;
3856 const SCEV *Start;
3857 if (!match(S, P: m_scev_AffineAddRec(Op0: m_SCEV(V&: Start), Op1: m_SCEVConstant())))
3858 return false;
3859 // Check if a post-indexed load/store can be used.
3860 if (TTI.isIndexedLoadLegal(Mode: TTI.MIM_PostInc, Ty: S->getType()) ||
3861 TTI.isIndexedStoreLegal(Mode: TTI.MIM_PostInc, Ty: S->getType())) {
3862 if (!isa<SCEVConstant>(Val: Start) && SE.isLoopInvariant(S: Start, L))
3863 return true;
3864 }
3865 return false;
3866}
3867
3868/// Helper function for LSRInstance::GenerateReassociations.
3869void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3870 const Formula &Base,
3871 unsigned Depth, size_t Idx,
3872 bool IsScaledReg) {
3873 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3874 // Don't generate reassociations for the base register of a value that
3875 // may generate a post-increment operator. The reason is that the
3876 // reassociations cause extra base+register formula to be created,
3877 // and possibly chosen, but the post-increment is more efficient.
3878 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, S: BaseReg, L, SE))
3879 return;
3880 SmallVector<const SCEV *, 8> AddOps;
3881 const SCEV *Remainder = CollectSubexprs(S: BaseReg, C: nullptr, Ops&: AddOps, L, SE);
3882 if (Remainder)
3883 AddOps.push_back(Elt: Remainder);
3884
3885 if (AddOps.size() == 1)
3886 return;
3887
3888 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3889 JE = AddOps.end();
3890 J != JE; ++J) {
3891 // Loop-variant "unknown" values are uninteresting; we won't be able to
3892 // do anything meaningful with them.
3893 if (isa<SCEVUnknown>(Val: *J) && !SE.isLoopInvariant(S: *J, L))
3894 continue;
3895
3896 // Don't pull a constant into a register if the constant could be folded
3897 // into an immediate field.
3898 if (isAlwaysFoldable(TTI, SE, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LU.Kind,
3899 AccessTy: LU.AccessTy, S: *J, HasBaseReg: Base.getNumRegs() > 1))
3900 continue;
3901
3902 // Collect all operands except *J.
3903 SmallVector<const SCEV *, 8> InnerAddOps(std::as_const(t&: AddOps).begin(), J);
3904 InnerAddOps.append(in_start: std::next(x: J), in_end: std::as_const(t&: AddOps).end());
3905
3906 // Don't leave just a constant behind in a register if the constant could
3907 // be folded into an immediate field.
3908 if (InnerAddOps.size() == 1 &&
3909 isAlwaysFoldable(TTI, SE, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LU.Kind,
3910 AccessTy: LU.AccessTy, S: InnerAddOps[0], HasBaseReg: Base.getNumRegs() > 1))
3911 continue;
3912
3913 const SCEV *InnerSum = SE.getAddExpr(Ops&: InnerAddOps);
3914 if (InnerSum->isZero())
3915 continue;
3916 Formula F = Base;
3917
3918 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3919 continue;
3920
3921 // Add the remaining pieces of the add back into the new formula.
3922 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(Val: InnerSum);
3923 if (InnerSumSC && SE.getTypeSizeInBits(Ty: InnerSumSC->getType()) <= 64 &&
3924 TTI.isLegalAddImmediate(Imm: (uint64_t)F.UnfoldedOffset.getFixedValue() +
3925 InnerSumSC->getValue()->getZExtValue())) {
3926 F.UnfoldedOffset =
3927 Immediate::getFixed(MinVal: (uint64_t)F.UnfoldedOffset.getFixedValue() +
3928 InnerSumSC->getValue()->getZExtValue());
3929 if (IsScaledReg) {
3930 F.ScaledReg = nullptr;
3931 F.Scale = 0;
3932 } else
3933 F.BaseRegs.erase(CI: F.BaseRegs.begin() + Idx);
3934 } else if (IsScaledReg)
3935 F.ScaledReg = InnerSum;
3936 else
3937 F.BaseRegs[Idx] = InnerSum;
3938
3939 // Add J as its own register, or an unfolded immediate.
3940 const SCEVConstant *SC = dyn_cast<SCEVConstant>(Val: *J);
3941 if (SC && SE.getTypeSizeInBits(Ty: SC->getType()) <= 64 &&
3942 TTI.isLegalAddImmediate(Imm: (uint64_t)F.UnfoldedOffset.getFixedValue() +
3943 SC->getValue()->getZExtValue()))
3944 F.UnfoldedOffset =
3945 Immediate::getFixed(MinVal: (uint64_t)F.UnfoldedOffset.getFixedValue() +
3946 SC->getValue()->getZExtValue());
3947 else
3948 F.BaseRegs.push_back(Elt: *J);
3949 // We may have changed the number of register in base regs, adjust the
3950 // formula accordingly.
3951 F.canonicalize(L: *L);
3952
3953 if (InsertFormula(LU, LUIdx, F))
3954 // If that formula hadn't been seen before, recurse to find more like
3955 // it.
3956 // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
3957 // Because just Depth is not enough to bound compile time.
3958 // This means that every time AddOps.size() is greater 16^x we will add
3959 // x to Depth.
3960 GenerateReassociations(LU, LUIdx, Base: LU.Formulae.back(),
3961 Depth: Depth + 1 + (Log2_32(Value: AddOps.size()) >> 2));
3962 }
3963}
3964
3965/// Split out subexpressions from adds and the bases of addrecs.
3966void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3967 Formula Base, unsigned Depth) {
3968 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3969 // Arbitrarily cap recursion to protect compile time.
3970 if (Depth >= 3)
3971 return;
3972
3973 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3974 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, Idx: i);
3975
3976 if (Base.Scale == 1)
3977 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
3978 /* Idx */ -1, /* IsScaledReg */ true);
3979}
3980
3981/// Generate a formula consisting of all of the loop-dominating registers added
3982/// into a single register.
3983void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3984 Formula Base) {
3985 // This method is only interesting on a plurality of registers.
3986 if (Base.BaseRegs.size() + (Base.Scale == 1) +
3987 (Base.UnfoldedOffset.isNonZero()) <=
3988 1)
3989 return;
3990
3991 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
3992 // processing the formula.
3993 Base.unscale();
3994 SmallVector<const SCEV *, 4> Ops;
3995 Formula NewBase = Base;
3996 NewBase.BaseRegs.clear();
3997 Type *CombinedIntegerType = nullptr;
3998 for (const SCEV *BaseReg : Base.BaseRegs) {
3999 if (SE.properlyDominates(S: BaseReg, BB: L->getHeader()) &&
4000 !SE.hasComputableLoopEvolution(S: BaseReg, L)) {
4001 if (!CombinedIntegerType)
4002 CombinedIntegerType = SE.getEffectiveSCEVType(Ty: BaseReg->getType());
4003 Ops.push_back(Elt: BaseReg);
4004 }
4005 else
4006 NewBase.BaseRegs.push_back(Elt: BaseReg);
4007 }
4008
4009 // If no register is relevant, we're done.
4010 if (Ops.size() == 0)
4011 return;
4012
4013 // Utility function for generating the required variants of the combined
4014 // registers.
4015 auto GenerateFormula = [&](const SCEV *Sum) {
4016 Formula F = NewBase;
4017
4018 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4019 // opportunity to fold something. For now, just ignore such cases
4020 // rather than proceed with zero in a register.
4021 if (Sum->isZero())
4022 return;
4023
4024 F.BaseRegs.push_back(Elt: Sum);
4025 F.canonicalize(L: *L);
4026 (void)InsertFormula(LU, LUIdx, F);
4027 };
4028
4029 // If we collected at least two registers, generate a formula combining them.
4030 if (Ops.size() > 1) {
4031 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4032 GenerateFormula(SE.getAddExpr(Ops&: OpsCopy));
4033 }
4034
4035 // If we have an unfolded offset, generate a formula combining it with the
4036 // registers collected.
4037 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4038 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4039 Ops.push_back(Elt: SE.getConstant(Ty: CombinedIntegerType,
4040 V: NewBase.UnfoldedOffset.getFixedValue(), isSigned: true));
4041 NewBase.UnfoldedOffset = Immediate::getFixed(MinVal: 0);
4042 GenerateFormula(SE.getAddExpr(Ops));
4043 }
4044}
4045
4046/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4047void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4048 const Formula &Base, size_t Idx,
4049 bool IsScaledReg) {
4050 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4051 GlobalValue *GV = ExtractSymbol(S&: G, SE);
4052 if (G->isZero() || !GV)
4053 return;
4054 Formula F = Base;
4055 F.BaseGV = GV;
4056 if (!isLegalUse(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LU.Kind, AccessTy: LU.AccessTy, F))
4057 return;
4058 if (IsScaledReg)
4059 F.ScaledReg = G;
4060 else
4061 F.BaseRegs[Idx] = G;
4062 (void)InsertFormula(LU, LUIdx, F);
4063}
4064
4065/// Generate reuse formulae using symbolic offsets.
4066void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4067 Formula Base) {
4068 // We can't add a symbolic offset if the address already contains one.
4069 if (Base.BaseGV) return;
4070
4071 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4072 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, Idx: i);
4073 if (Base.Scale == 1)
4074 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4075 /* IsScaledReg */ true);
4076}
4077
/// Helper function for LSRInstance::GenerateConstantOffsets.
///
/// Works on one register of \p Base: Base.ScaledReg when \p IsScaledReg is
/// set, Base.BaseRegs[Idx] otherwise. It proposes new formulae for \p LU in
/// three steps:
///  1. For pre-indexed address uses whose register is an affine add
///     recurrence with a constant step, one formula per fixed worklist
///     offset, using (offset - step).
///  2. One formula per offset in \p Worklist.
///  3. One formula that moves an immediate extracted from the register into
///     BaseOffset.
/// Every candidate is checked with isLegalUse before it is inserted.
void LSRInstance::GenerateConstantOffsetsImpl(
    LSRUse &LU, unsigned LUIdx, const Formula &Base,
    const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {

  // Build a variant of Base in which Offset is subtracted from BaseOffset and
  // added to register G, and insert it if the result is legal for the use.
  auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
    Formula F = Base;
    if (!Base.BaseOffset.isCompatibleImmediate(Imm: Offset))
      return;
    F.BaseOffset = Base.BaseOffset.subUnsigned(RHS: Offset);

    if (isLegalUse(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset,
                   Kind: LU.Kind, AccessTy: LU.AccessTy, F)) {
      // Add the offset to the base register.
      const SCEV *NewOffset = Offset.getSCEV(SE, Ty: G->getType());
      const SCEV *NewG = SE.getAddExpr(LHS: NewOffset, RHS: G);
      // If it cancelled out, drop the base register, otherwise update it.
      if (NewG->isZero()) {
        if (IsScaledReg) {
          F.Scale = 0;
          F.ScaledReg = nullptr;
        } else
          F.deleteBaseReg(S&: F.BaseRegs[Idx]);
        // A register was removed, so bring the formula back to canonical form.
        F.canonicalize(L: *L);
      } else if (IsScaledReg)
        F.ScaledReg = NewG;
      else
        F.BaseRegs[Idx] = NewG;

      (void)InsertFormula(LU, LUIdx, F);
    }
  };

  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];

  // With constant offsets and constant steps, we can generate pre-inc
  // accesses by having the offset equal the step. So, for access #0 with a
  // step of 8, we generate a G - 8 base which would require the first access
  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
  // for itself and hopefully becomes the base for other accesses. This means
  // that a single pre-indexed access can be generated to become the new
  // base pointer for each iteration of the loop, resulting in no extra add/sub
  // instructions for pointer updating.
  if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
    const APInt *StepInt;
    if (match(S: G, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_scev_APInt(C&: StepInt)))) {
      // Read the step as signed when negative, as unsigned otherwise.
      int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
                                           : StepInt->getZExtValue();

      // Only fixed (non-scalable) offsets take part in this variant.
      for (Immediate Offset : Worklist) {
        if (Offset.isFixed()) {
          Offset = Immediate::getFixed(MinVal: Offset.getFixedValue() - Step);
          GenerateOffset(G, Offset);
        }
      }
    }
  }
  for (Immediate Offset : Worklist)
    GenerateOffset(G, Offset);

  // From here on G is whatever ExtractImmediate leaves in it (it is passed by
  // reference), so this step must stay after the worklist loops above.
  Immediate Imm = ExtractImmediate(S&: G, SE);
  if (G->isZero() || Imm.isZero() ||
      !Base.BaseOffset.isCompatibleImmediate(Imm))
    return;
  Formula F = Base;
  F.BaseOffset = F.BaseOffset.addUnsigned(RHS: Imm);
  if (!isLegalUse(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset,
                  Kind: LU.Kind, AccessTy: LU.AccessTy, F))
    return;
  if (IsScaledReg) {
    F.ScaledReg = G;
  } else {
    F.BaseRegs[Idx] = G;
    // We may generate non canonical Formula if G is a recurrent expr reg
    // related with current loop while F.ScaledReg is not.
    F.canonicalize(L: *L);
  }
  (void)InsertFormula(LU, LUIdx, F);
}
4155
4156/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
4157void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4158 Formula Base) {
4159 // TODO: For now, just add the min and max offset, because it usually isn't
4160 // worthwhile looking at everything inbetween.
4161 SmallVector<Immediate, 2> Worklist;
4162 Worklist.push_back(Elt: LU.MinOffset);
4163 if (LU.MaxOffset != LU.MinOffset)
4164 Worklist.push_back(Elt: LU.MaxOffset);
4165
4166 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4167 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, Idx: i);
4168 if (Base.Scale == 1)
4169 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4170 /* IsScaledReg */ true);
4171}
4172
/// For ICmpZero, check to see if we can scale up the comparison. For example, x
/// == y -> x*c == y*c.
///
/// For each factor in Factors, every component of \p Base (base offset, use
/// offset, base registers, scaled register, unfolded offset) is multiplied by
/// the factor. A factor is abandoned as soon as one multiplication cannot be
/// shown to be exact (dividing the product by the factor must give back the
/// original) or the scaled formula is not legal for the use. Formulae that
/// survive every check are inserted into \p LU.
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
                                         Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;
  // The arithmetic below is done on 64-bit values, so wider types are skipped.
  if (SE.getTypeSizeInBits(Ty: IntTy) > 64) return;

  // Don't do this if there is more than one offset.
  if (LU.MinOffset != LU.MaxOffset) return;

  // Check if transformation is valid. It is illegal to multiply pointer.
  if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
    return;
  for (const SCEV *BaseReg : Base.BaseRegs)
    if (BaseReg->getType()->isPointerTy())
      return;
  assert(!Base.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    // Check that Factor can be represented by IntTy
    if (!ConstantInt::isValueValidForType(Ty: IntTy, V: Factor))
      continue;
    // Check that the multiplication doesn't overflow.
    if (Base.BaseOffset.isMin() && Factor == -1)
      continue;
    // Not supporting scalable immediates.
    if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
      continue;
    Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(RHS: Factor);
    assert(Factor != 0 && "Zero factor not expected!");
    // Dividing back must reproduce the original offset, otherwise the
    // multiplication wrapped.
    if (NewBaseOffset.getFixedValue() / Factor !=
        Base.BaseOffset.getFixedValue())
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(Ty: IntTy, V: NewBaseOffset.getFixedValue()))
      continue;

    // Check that multiplying with the use offset doesn't overflow.
    Immediate Offset = LU.MinOffset;
    if (Offset.isMin() && Factor == -1)
      continue;
    Offset = Offset.mulUnsigned(RHS: Factor);
    if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(Ty: IntTy, V: Offset.getFixedValue()))
      continue;

    Formula F = Base;
    F.BaseOffset = NewBaseOffset;

    // Check that this scale is legal.
    if (!isLegalUse(TTI, MinOffset: Offset, MaxOffset: Offset, Kind: LU.Kind, AccessTy: LU.AccessTy, F))
      continue;

    // Compensate for the use having MinOffset built into it.
    F.BaseOffset = F.BaseOffset.addUnsigned(RHS: Offset).subUnsigned(RHS: LU.MinOffset);

    const SCEV *FactorS = SE.getConstant(Ty: IntTy, V: Factor);

    // Check that multiplying with each base register doesn't overflow.
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
      F.BaseRegs[i] = SE.getMulExpr(LHS: F.BaseRegs[i], RHS: FactorS);
      // Leave the inner loop and move on to the next factor.
      if (getExactSDiv(LHS: F.BaseRegs[i], RHS: FactorS, SE) != Base.BaseRegs[i])
        goto next;
    }

    // Check that multiplying with the scaled register doesn't overflow.
    if (F.ScaledReg) {
      F.ScaledReg = SE.getMulExpr(LHS: F.ScaledReg, RHS: FactorS);
      if (getExactSDiv(LHS: F.ScaledReg, RHS: FactorS, SE) != Base.ScaledReg)
        continue;
    }

    // Check that multiplying with the unfolded offset doesn't overflow.
    if (F.UnfoldedOffset.isNonZero()) {
      if (F.UnfoldedOffset.isMin() && Factor == -1)
        continue;
      F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(RHS: Factor);
      if (F.UnfoldedOffset.getFixedValue() / Factor !=
          Base.UnfoldedOffset.getFixedValue())
        continue;
      // If the offset will be truncated, check that it is in bounds.
      if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
                                       Ty: IntTy, V: F.UnfoldedOffset.getFixedValue()))
        continue;
    }

    // If we make it here and it's legal, add it.
    (void)InsertFormula(LU, LUIdx, F);
  // Target of the goto in the base-register loop; equivalent to `continue`.
  next:;
  }
}
4273
/// Generate stride factor reuse formulae by making use of scaled-offset address
/// modes, for example.
///
/// For each factor in Factors that is legal as a scale for \p LU, each base
/// register of \p Base that is an add recurrence (on the current loop, or on
/// any loop when all fixups are outside the loop) is divided by the factor;
/// a non-zero exact quotient becomes the scaled register of a new formula.
///
/// Note: \p Base is a by-value copy that is modified while iterating, and
/// LU.Kind may be switched from Basic to Special as a side effect.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;

  // If this Formula already has a scaled register, we can't add another one.
  // Try to unscale the formula to generate a better scale.
  if (Base.Scale != 0 && !Base.unscale())
    return;

  assert(Base.Scale == 0 && "unscale did not did its job!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    Base.Scale = Factor;
    // NOTE(review): presumably one base register is expected to move into
    // ScaledReg, so a base register remains only if there were at least two.
    Base.HasBaseReg = Base.BaseRegs.size() > 1;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LU.Kind, AccessTy: LU.AccessTy,
                    F: Base)) {
      // As a special-case, handle special out-of-loop Basic users specially.
      // TODO: Reconsider this special case.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LSRUse::Special,
                     AccessTy: LU.AccessTy, F: Base) &&
          LU.AllFixupsOutsideLoop)
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For an ICmpZero, negating a solitary base register won't lead to
    // new solutions.
    if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
        Base.BaseOffset.isZero() && !Base.BaseGV)
      continue;
    // For each addrec base reg, if its loop is current loop, apply the scale.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: Base.BaseRegs[i]);
      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
        const SCEV *FactorS = SE.getConstant(Ty: IntTy, V: Factor);
        // A factor that becomes zero in IntTy cannot be divided by.
        if (FactorS->isZero())
          continue;
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getExactSDiv(LHS: AR, RHS: FactorS, SE, IgnoreSignificantBits: true))
          if (!Quotient->isZero()) {
            // TODO: This could be optimized to avoid all the copying.
            Formula F = Base;
            F.ScaledReg = Quotient;
            F.deleteBaseReg(S&: F.BaseRegs[i]);
            // The canonical representation of 1*reg is reg, which is already in
            // Base. In that case, do not try to insert the formula, it will be
            // rejected anyway.
            if (F.Scale == 1 && (F.BaseRegs.empty() ||
                                 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
              continue;
            // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
            // non canonical Formula with ScaledReg's loop not being L.
            if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
              F.canonicalize(L: *L);
            (void)InsertFormula(LU, LUIdx, F);
          }
      }
    }
  }
}
4341
4342/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4343/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4344/// perform the extension/truncate and normalize again, as the normalized form
4345/// can result in folds that are not valid in the post-inc use contexts. The
4346/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4347static const SCEV *
4348getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4349 const SCEV *Expr, Type *ToTy,
4350 ScalarEvolution &SE) {
4351 const SCEV *Result = nullptr;
4352 for (auto &L : Loops) {
4353 auto *DenormExpr = denormalizeForPostIncUse(S: Expr, Loops: L, SE);
4354 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(Op: DenormExpr, Ty: ToTy);
4355 const SCEV *New = normalizeForPostIncUse(S: NewDenormExpr, Loops: L, SE);
4356 if (!New || (Result && New != Result))
4357 return nullptr;
4358 Result = New;
4359 }
4360
4361 assert(Result && "failed to create expression");
4362 return Result;
4363}
4364
4365/// Generate reuse formulae from different IV types.
4366void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4367 // Don't bother truncating symbolic values.
4368 if (Base.BaseGV) return;
4369
4370 // Determine the integer type for the base formula.
4371 Type *DstTy = Base.getType();
4372 if (!DstTy) return;
4373 if (DstTy->isPointerTy())
4374 return;
4375
4376 // It is invalid to extend a pointer type so exit early if ScaledReg or
4377 // any of the BaseRegs are pointers.
4378 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4379 return;
4380 if (any_of(Range&: Base.BaseRegs,
4381 P: [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4382 return;
4383
4384 SmallVector<PostIncLoopSet> Loops;
4385 for (auto &LF : LU.Fixups)
4386 Loops.push_back(Elt: LF.PostIncLoops);
4387
4388 for (Type *SrcTy : Types) {
4389 if (SrcTy != DstTy && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DstTy)) {
4390 Formula F = Base;
4391
4392 // Sometimes SCEV is able to prove zero during ext transform. It may
4393 // happen if SCEV did not do all possible transforms while creating the
4394 // initial node (maybe due to depth limitations), but it can do them while
4395 // taking ext.
4396 if (F.ScaledReg) {
4397 const SCEV *NewScaledReg =
4398 getAnyExtendConsideringPostIncUses(Loops, Expr: F.ScaledReg, ToTy: SrcTy, SE);
4399 if (!NewScaledReg || NewScaledReg->isZero())
4400 continue;
4401 F.ScaledReg = NewScaledReg;
4402 }
4403 bool HasZeroBaseReg = false;
4404 for (const SCEV *&BaseReg : F.BaseRegs) {
4405 const SCEV *NewBaseReg =
4406 getAnyExtendConsideringPostIncUses(Loops, Expr: BaseReg, ToTy: SrcTy, SE);
4407 if (!NewBaseReg || NewBaseReg->isZero()) {
4408 HasZeroBaseReg = true;
4409 break;
4410 }
4411 BaseReg = NewBaseReg;
4412 }
4413 if (HasZeroBaseReg)
4414 continue;
4415
4416 // TODO: This assumes we've done basic processing on all uses and
4417 // have an idea what the register usage is.
4418 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4419 continue;
4420
4421 F.canonicalize(L: *L);
4422 (void)InsertFormula(LU, LUIdx, F);
4423 }
4424 }
4425}
4426
4427namespace {
4428
4429/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4430/// modifications so that the search phase doesn't have to worry about the data
4431/// structures moving underneath it.
4432struct WorkItem {
4433 size_t LUIdx;
4434 Immediate Imm;
4435 const SCEV *OrigReg;
4436
4437 WorkItem(size_t LI, Immediate I, const SCEV *R)
4438 : LUIdx(LI), Imm(I), OrigReg(R) {}
4439
4440 void print(raw_ostream &OS) const;
4441 void dump() const;
4442};
4443
4444} // end anonymous namespace
4445
4446#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4447void WorkItem::print(raw_ostream &OS) const {
4448 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4449 << " , add offset " << Imm;
4450}
4451
4452LLVM_DUMP_METHOD void WorkItem::dump() const {
4453 print(errs()); errs() << '\n';
4454}
4455#endif
4456
/// Look for registers which are a constant distance apart and try to form reuse
/// opportunities between them.
///
/// The function runs in two phases:
///  1. Search: registers in RegUses are grouped by the expression that
///     remains once ExtractImmediate has removed an immediate. Within each
///     group, every register is paired with a few selected others (first,
///     last, and one near the average offset) and a WorkItem is recorded for
///     each (use, offset difference) pair not seen before.
///  2. Apply: for each WorkItem, every formula of the use that references the
///     original register gets a variant in which the offset difference is
///     moved out of the register and into an offset field.
/// The phases are kept separate so that formulae and register counts do not
/// change while the search is in progress.
void LSRInstance::GenerateCrossUseConstantOffsets() {
  // Group the registers by their value without any added constant offset.
  using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;

  // Map: stripped register -> (immediate -> original register).
  DenseMap<const SCEV *, ImmMapTy> Map;
  // Union of the using-use index sets of all registers sharing a stripped
  // register.
  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
  // Stripped registers in first-seen order; iterated instead of the DenseMap.
  SmallVector<const SCEV *, 8> Sequence;
  for (const SCEV *Use : RegUses) {
    const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
    Immediate Imm = ExtractImmediate(S&: Reg, SE);
    auto Pair = Map.try_emplace(Key: Reg);
    if (Pair.second)
      Sequence.push_back(Elt: Reg);
    Pair.first->second.insert(x: std::make_pair(x&: Imm, y&: Use));
    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Reg: Use);
  }

  // Now examine each set of registers with the same base value. Build up
  // a list of work to do and do the work in a separate step so that we're
  // not adding formulae and register counts while we're searching.
  SmallVector<WorkItem, 32> WorkItems;
  SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
      UniqueItems;
  for (const SCEV *Reg : Sequence) {
    const ImmMapTy &Imms = Map.find(Val: Reg)->second;

    // It's not worthwhile looking for reuse if there's only one offset.
    if (Imms.size() == 1)
      continue;

    LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
               for (const auto &Entry
                    : Imms) dbgs()
               << ' ' << Entry.first;
               dbgs() << '\n');

    // Examine each offset.
    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
         J != JE; ++J) {
      const SCEV *OrigReg = J->second;

      Immediate JImm = J->first;
      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg: OrigReg);

      // Skip non-constant registers when the whole group is used by a single
      // use.
      if (!isa<SCEVConstant>(Val: OrigReg) &&
          UsedByIndicesMap[Reg].count() == 1) {
        LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
                          << '\n');
        continue;
      }

      // Conservatively examine offsets between this orig reg and a few
      // selected other orig regs.
      Immediate First = Imms.begin()->first;
      Immediate Last = std::prev(x: Imms.end())->first;
      if (!First.isCompatibleImmediate(Imm: Last)) {
        LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
                          << "\n");
        continue;
      }
      // Only scalable if both terms are scalable, or if one is scalable and
      // the other is 0.
      bool Scalable = First.isScalable() || Last.isScalable();
      int64_t FI = First.getKnownMinValue();
      int64_t LI = Last.getKnownMinValue();
      // Compute (First + Last) / 2 without overflow using the fact that
      // First + Last = 2 * (First & Last) + (First ^ Last).
      int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
      // If the result is negative and FI is odd and LI even (or vice versa),
      // we rounded towards -inf. Add 1 in that case, to round towards 0.
      Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
      // Candidates to pair with J: the smallest offset, the largest offset,
      // and the first entry not ordered before the average.
      ImmMapTy::const_iterator OtherImms[] = {
          Imms.begin(), std::prev(x: Imms.end()),
          Imms.lower_bound(x: Immediate::get(MinVal: Avg, Scalable))};
      for (const auto &M : OtherImms) {
        // Skip J itself, and a lower_bound result of end().
        if (M == J || M == JE) continue;
        if (!JImm.isCompatibleImmediate(Imm: M->first))
          continue;

        // Compute the difference between the two.
        Immediate Imm = JImm.subUnsigned(RHS: M->first);
        for (unsigned LUIdx : UsedByIndices.set_bits())
          // Make a memo of this use, offset, and register tuple.
          if (UniqueItems.insert(V: std::make_pair(x&: LUIdx, y&: Imm)).second)
            WorkItems.push_back(Elt: WorkItem(LUIdx, Imm, OrigReg));
      }
    }
  }

  // The search structures are no longer needed; only WorkItems is used below.
  Map.clear();
  Sequence.clear();
  UsedByIndicesMap.clear();
  UniqueItems.clear();

  // Now iterate through the worklist and add new formulae.
  for (const WorkItem &WI : WorkItems) {
    size_t LUIdx = WI.LUIdx;
    LSRUse &LU = Uses[LUIdx];
    Immediate Imm = WI.Imm;
    const SCEV *OrigReg = WI.OrigReg;

    Type *IntTy = SE.getEffectiveSCEVType(Ty: OrigReg->getType());
    const SCEV *NegImmS = Imm.getNegativeSCEV(SE, Ty: IntTy);
    unsigned BitWidth = SE.getTypeSizeInBits(Ty: IntTy);

    // TODO: Use a more targeted data structure.
    // The formula count is captured up front, so formulae inserted inside the
    // loop are not visited. The index variable L shadows the member L, which
    // is why the member is spelled this->L below.
    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
      Formula F = LU.Formulae[L];
      // FIXME: The code for the scaled and unscaled registers looks
      // very similar but slightly different. Investigate if they
      // could be merged. That way, we would not have to unscale the
      // Formula.
      F.unscale();
      // Use the immediate in the scaled register.
      if (F.ScaledReg == OrigReg) {
        if (!F.BaseOffset.isCompatibleImmediate(Imm))
          continue;
        Immediate Offset = F.BaseOffset.addUnsigned(RHS: Imm.mulUnsigned(RHS: F.Scale));
        // Don't create 50 + reg(-50).
        const SCEV *S = Offset.getNegativeSCEV(SE, Ty: IntTy);
        if (F.referencesReg(S))
          continue;
        Formula NewF = F;
        NewF.BaseOffset = Offset;
        if (!isLegalUse(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset, Kind: LU.Kind, AccessTy: LU.AccessTy,
                        F: NewF))
          continue;
        NewF.ScaledReg = SE.getAddExpr(LHS: NegImmS, RHS: NewF.ScaledReg);

        // If the new scale is a constant in a register, and adding the constant
        // value to the immediate would produce a value closer to zero than the
        // immediate itself, then the formula isn't worthwhile.
        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: NewF.ScaledReg)) {
          // FIXME: Do we need to do something for scalable immediates here?
          //        A scalable SCEV won't be constant, but we might still have
          //        something in the offset? Bail out for now to be safe.
          if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
            continue;
          if (C->getValue()->isNegative() !=
                  (NewF.BaseOffset.isLessThanZero()) &&
              (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
                  .ule(RHS: std::abs(i: NewF.BaseOffset.getFixedValue())))
            continue;
        }

        // OK, looks good.
        NewF.canonicalize(L: *this->L);
        (void)InsertFormula(LU, LUIdx, F: NewF);
      } else {
        // Use the immediate in a base register.
        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
          const SCEV *BaseReg = F.BaseRegs[N];
          if (BaseReg != OrigReg)
            continue;
          Formula NewF = F;
          if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
              !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
              !NewF.BaseOffset.isCompatibleImmediate(Imm: NewF.UnfoldedOffset))
            continue;
          NewF.BaseOffset = NewF.BaseOffset.addUnsigned(RHS: Imm);
          // If the offset is not legal as a base offset, fall back to putting
          // it in the unfolded offset, provided it is a legal add immediate.
          if (!isLegalUse(TTI, MinOffset: LU.MinOffset, MaxOffset: LU.MaxOffset,
                          Kind: LU.Kind, AccessTy: LU.AccessTy, F: NewF)) {
            if (AMK == TTI::AMK_PostIndexed &&
                mayUsePostIncMode(TTI, LU, S: OrigReg, L: this->L, SE))
              continue;
            Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(RHS: Imm);
            if (!isLegalAddImmediate(TTI, Offset: NewUnfoldedOffset))
              continue;
            // Start again from F so the BaseOffset change above is undone.
            NewF = F;
            NewF.UnfoldedOffset = NewUnfoldedOffset;
          }
          NewF.BaseRegs[N] = SE.getAddExpr(LHS: NegImmS, RHS: BaseReg);

          // If the new formula has a constant in a register, and adding the
          // constant value to the immediate would produce a value closer to
          // zero than the immediate itself, then the formula isn't worthwhile.
          for (const SCEV *NewReg : NewF.BaseRegs)
            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: NewReg)) {
              if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
                goto skip_formula;
              if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
                      .abs()
                      .slt(RHS: std::abs(i: NewF.BaseOffset.getFixedValue())) &&
                  (C->getAPInt() + NewF.BaseOffset.getFixedValue())
                          .countr_zero() >=
                      (unsigned)llvm::countr_zero<uint64_t>(
                          Val: NewF.BaseOffset.getFixedValue()))
                goto skip_formula;
            }

          // Ok, looks good.
          NewF.canonicalize(L: *this->L);
          (void)InsertFormula(LU, LUIdx, F: NewF);
          // Only the first matching base register is rewritten.
          break;
        // Reached when the candidate is rejected; continue with the next
        // base register.
        skip_formula:;
        }
      }
    }
  }
}
4659
4660/// Generate formulae for each use.
4661void
4662LSRInstance::GenerateAllReuseFormulae() {
4663 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4664 // queries are more precise.
4665 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4666 LSRUse &LU = Uses[LUIdx];
4667 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4668 GenerateReassociations(LU, LUIdx, Base: LU.Formulae[i]);
4669 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4670 GenerateCombinations(LU, LUIdx, Base: LU.Formulae[i]);
4671 }
4672 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4673 LSRUse &LU = Uses[LUIdx];
4674 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4675 GenerateSymbolicOffsets(LU, LUIdx, Base: LU.Formulae[i]);
4676 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4677 GenerateConstantOffsets(LU, LUIdx, Base: LU.Formulae[i]);
4678 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4679 GenerateICmpZeroScales(LU, LUIdx, Base: LU.Formulae[i]);
4680 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4681 GenerateScales(LU, LUIdx, Base: LU.Formulae[i]);
4682 }
4683 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4684 LSRUse &LU = Uses[LUIdx];
4685 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4686 GenerateTruncates(LU, LUIdx, Base: LU.Formulae[i]);
4687 }
4688
4689 GenerateCrossUseConstantOffsets();
4690
4691 LLVM_DEBUG(dbgs() << "\n"
4692 "After generating reuse formulae:\n";
4693 print_uses(dbgs()));
4694}
4695
/// If there are multiple formulae with the same set of registers used
/// by other uses, pick the best one and delete the others.
///
/// For each use, every formula is rated. Formulae rated as losers are deleted
/// outright. The rest are keyed by the sorted list of their registers that
/// are also used by other uses; when two formulae share a key, the one with
/// the higher cost is deleted. Register usage is recomputed for every use
/// that lost a formula.
void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
  DenseSet<const SCEV *> VisitedRegs;
  SmallPtrSet<const SCEV *, 16> Regs;
  SmallPtrSet<const SCEV *, 16> LoserRegs;
#ifndef NDEBUG
  // Only read by the debug output at the end of the function.
  bool ChangedFormulae = false;
#endif

  // Collect the best formula for each unique set of shared registers. This
  // is reset for each use.
  using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;

  BestFormulaeTy BestFormulae;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
               dbgs() << '\n');

    // Set once at least one formula of this use has been deleted.
    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
         FIdx != NumForms; ++FIdx) {
      Formula &F = LU.Formulae[FIdx];

      // Some formulas are instant losers. For example, they may depend on
      // nonexistent AddRecs from other loops. These need to be filtered
      // immediately, otherwise heuristics could choose them over others leading
      // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
      // avoids the need to recompute this information across formulae using the
      // same bad AddRec. Passing LoserRegs is also essential unless we remove
      // the corresponding bad register from the Regs set.
      Cost CostF(L, SE, TTI, AMK);
      Regs.clear();
      CostF.RateFormula(F, Regs, VisitedRegs, LU, LoserRegs: &LoserRegs);
      if (CostF.isLoser()) {
        // During initial formula generation, undesirable formulae are generated
        // by uses within other loops that have some non-trivial address mode or
        // use the postinc form of the IV. LSR needs to provide these formulae
        // as the basis of rediscovering the desired formula that uses an AddRec
        // corresponding to the existing phi. Once all formulae have been
        // generated, these initial losers may be pruned.
        LLVM_DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
                   dbgs() << "\n");
      }
      else {
        // Build the key: the registers of F that other uses also use.
        SmallVector<const SCEV *, 4> Key;
        for (const SCEV *Reg : F.BaseRegs) {
          if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
            Key.push_back(Elt: Reg);
        }
        if (F.ScaledReg &&
            RegUses.isRegUsedByUsesOtherThan(Reg: F.ScaledReg, LUIdx))
          Key.push_back(Elt: F.ScaledReg);
        // Unstable sort by host order ok, because this is only used for
        // uniquifying.
        llvm::sort(C&: Key);

        // The first formula seen for a key is kept for now; move on.
        std::pair<BestFormulaeTy::const_iterator, bool> P =
            BestFormulae.insert(KV: std::make_pair(x&: Key, y&: FIdx));
        if (P.second)
          continue;

        Formula &Best = LU.Formulae[P.first->second];

        Cost CostBest(L, SE, TTI, AMK);
        Regs.clear();
        CostBest.RateFormula(F: Best, Regs, VisitedRegs, LU);
        // If F is cheaper, swap the two so that the recorded slot holds the
        // winner and F now refers to the formula to delete.
        if (CostF.isLess(Other: CostBest))
          std::swap(a&: F, b&: Best);
        LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
                   dbgs() << "\n"
                             "    in favor of formula ";
                   Best.print(dbgs()); dbgs() << '\n');
      }
      // Reached for losers and for formulae that lost against another one
      // with the same key.
#ifndef NDEBUG
      ChangedFormulae = true;
#endif
      LU.DeleteFormula(F);
      // Revisit this index with a shorter list.
      // NOTE(review): presumably DeleteFormula moves another formula into
      // this slot - confirm against LSRUse::DeleteFormula.
      --FIdx;
      --NumForms;
      Any = true;
    }

    // Now that we've filtered out some formulae, recompute the Regs set.
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Reset this to prepare for the next use.
    BestFormulae.clear();
  }

  LLVM_DEBUG(if (ChangedFormulae) {
    dbgs() << "\n"
              "After filtering out undesirable candidates:\n";
    print_uses(dbgs());
  });
}
4795
4796/// Estimate the worst-case number of solutions the solver might have to
4797/// consider. It almost never considers this many solutions because it prune the
4798/// search space, but the pruning isn't always sufficient.
4799size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4800 size_t Power = 1;
4801 for (const LSRUse &LU : Uses) {
4802 size_t FSize = LU.Formulae.size();
4803 if (FSize >= ComplexityLimit) {
4804 Power = ComplexityLimit;
4805 break;
4806 }
4807 Power *= FSize;
4808 if (Power >= ComplexityLimit)
4809 break;
4810 }
4811 return Power;
4812}
4813
/// When one formula uses a superset of the registers of another formula, it
/// won't help reduce register pressure (though it may not necessarily hurt
/// register pressure); remove it to simplify the system.
///
/// Does nothing unless the estimated search-space complexity has reached
/// ComplexityLimit. Otherwise, for each formula that holds a constant or a
/// global value in a base register, it builds the equivalent formula with
/// that value folded into BaseOffset or BaseGV; if the use already has a
/// formula with the same registers as the folded one, the register-using
/// formula is deleted.
void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");

    LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
                         "which use a superset of registers used by other "
                         "formulae.\n");

    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      // Set once at least one formula of this use has been deleted.
      bool Any = false;
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        // Formulae with a scalable base offset are left alone; the folding
        // below uses getFixedValue().
        if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
          continue;
        // Look for a formula with a constant or GV in a register. If the use
        // also has a formula with that same value in an immediate field,
        // delete the one that uses a register.
        for (SmallVectorImpl<const SCEV *>::const_iterator
               I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: *I)) {
            Formula NewF = F;
            // FIXME: Formulas should store bitwidth to do wrapping properly.
            //        See PR41034.
            NewF.BaseOffset =
                Immediate::getFixed(MinVal: NewF.BaseOffset.getFixedValue() +
                                    (uint64_t)C->getValue()->getSExtValue());
            // Drop the register at the same position in the copy.
            NewF.BaseRegs.erase(CI: NewF.BaseRegs.begin() +
                                (I - F.BaseRegs.begin()));
            if (LU.HasFormulaWithSameRegs(F: NewF)) {
              LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
                         dbgs() << '\n');
              LU.DeleteFormula(F);
              // Revisit this index with a shorter list. F and the iterators
              // must not be used after the deletion, hence the break.
              --i;
              --e;
              Any = true;
              break;
            }
          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Val: *I)) {
            if (GlobalValue *GV = dyn_cast<GlobalValue>(Val: U->getValue()))
              // The symbol can only be folded if the BaseGV field is free.
              if (!F.BaseGV) {
                Formula NewF = F;
                NewF.BaseGV = GV;
                NewF.BaseRegs.erase(CI: NewF.BaseRegs.begin() +
                                    (I - F.BaseRegs.begin()));
                if (LU.HasFormulaWithSameRegs(F: NewF)) {
                  LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
                             dbgs() << '\n');
                  LU.DeleteFormula(F);
                  // Same bookkeeping as in the constant case above.
                  --i;
                  --e;
                  Any = true;
                  break;
                }
              }
          }
        }
      }
      // Register usage is stale once formulae have been removed.
      if (Any)
        LU.RecomputeRegs(LUIdx, RegUses);
    }

    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
  }
}
4882
4883/// When there are many registers for expressions like A, A+1, A+2, etc.,
4884/// allocate a single register for them.
4885void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4886 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4887 return;
4888
4889 LLVM_DEBUG(
4890 dbgs() << "The search space is too complex.\n"
4891 "Narrowing the search space by assuming that uses separated "
4892 "by a constant offset will use the same registers.\n");
4893
4894 // This is especially useful for unrolled loops.
4895
4896 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4897 LSRUse &LU = Uses[LUIdx];
4898 for (const Formula &F : LU.Formulae) {
4899 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4900 continue;
4901
4902 LSRUse *LUThatHas = FindUseWithSimilarFormula(OrigF: F, OrigLU: LU);
4903 if (!LUThatHas)
4904 continue;
4905
4906 if (!reconcileNewOffset(LU&: *LUThatHas, NewOffset: F.BaseOffset, /*HasBaseReg=*/ false,
4907 Kind: LU.Kind, AccessTy: LU.AccessTy))
4908 continue;
4909
4910 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4911
4912 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4913
4914 // Transfer the fixups of LU to LUThatHas.
4915 for (LSRFixup &Fixup : LU.Fixups) {
4916 Fixup.Offset += F.BaseOffset;
4917 LUThatHas->pushFixup(f&: Fixup);
4918 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4919 }
4920
4921 // Delete formulae from the new use which are no longer legal.
4922 bool Any = false;
4923 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4924 Formula &F = LUThatHas->Formulae[i];
4925 if (!isLegalUse(TTI, MinOffset: LUThatHas->MinOffset, MaxOffset: LUThatHas->MaxOffset,
4926 Kind: LUThatHas->Kind, AccessTy: LUThatHas->AccessTy, F)) {
4927 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4928 LUThatHas->DeleteFormula(F);
4929 --i;
4930 --e;
4931 Any = true;
4932 }
4933 }
4934
4935 if (Any)
4936 LUThatHas->RecomputeRegs(LUIdx: LUThatHas - &Uses.front(), RegUses);
4937
4938 // Delete the old use.
4939 DeleteUse(LU, LUIdx);
4940 --LUIdx;
4941 --NumUses;
4942 break;
4943 }
4944 }
4945
4946 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4947}
4948
4949/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4950/// we've done more filtering, as it may be able to find more formulae to
4951/// eliminate.
4952void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4953 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4954 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4955
4956 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4957 "undesirable dedicated registers.\n");
4958
4959 FilterOutUndesirableDedicatedRegisters();
4960
4961 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4962 }
4963}
4964
4965/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
4966/// Pick the best one and delete the others.
4967/// This narrowing heuristic is to keep as many formulae with different
4968/// Scale and ScaledReg pair as possible while narrowing the search space.
4969/// The benefit is that it is more likely to find out a better solution
4970/// from a formulae set with more Scale and ScaledReg variations than
4971/// a formulae set with the same Scale and ScaledReg. The picking winner
4972/// reg heuristic will often keep the formulae with the same Scale and
4973/// ScaledReg and filter others, and we want to avoid that if possible.
4974void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
4975 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4976 return;
4977
4978 LLVM_DEBUG(
4979 dbgs() << "The search space is too complex.\n"
4980 "Narrowing the search space by choosing the best Formula "
4981 "from the Formulae with the same Scale and ScaledReg.\n");
4982
4983 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
4984 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
4985
4986 BestFormulaeTy BestFormulae;
4987#ifndef NDEBUG
4988 bool ChangedFormulae = false;
4989#endif
4990 DenseSet<const SCEV *> VisitedRegs;
4991 SmallPtrSet<const SCEV *, 16> Regs;
4992
4993 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4994 LSRUse &LU = Uses[LUIdx];
4995 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4996 dbgs() << '\n');
4997
4998 // Return true if Formula FA is better than Formula FB.
4999 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5000 // First we will try to choose the Formula with fewer new registers.
5001 // For a register used by current Formula, the more the register is
5002 // shared among LSRUses, the less we increase the register number
5003 // counter of the formula.
5004 size_t FARegNum = 0;
5005 for (const SCEV *Reg : FA.BaseRegs) {
5006 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5007 FARegNum += (NumUses - UsedByIndices.count() + 1);
5008 }
5009 size_t FBRegNum = 0;
5010 for (const SCEV *Reg : FB.BaseRegs) {
5011 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5012 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5013 }
5014 if (FARegNum != FBRegNum)
5015 return FARegNum < FBRegNum;
5016
5017 // If the new register numbers are the same, choose the Formula with
5018 // less Cost.
5019 Cost CostFA(L, SE, TTI, AMK);
5020 Cost CostFB(L, SE, TTI, AMK);
5021 Regs.clear();
5022 CostFA.RateFormula(F: FA, Regs, VisitedRegs, LU);
5023 Regs.clear();
5024 CostFB.RateFormula(F: FB, Regs, VisitedRegs, LU);
5025 return CostFA.isLess(Other: CostFB);
5026 };
5027
5028 bool Any = false;
5029 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5030 ++FIdx) {
5031 Formula &F = LU.Formulae[FIdx];
5032 if (!F.ScaledReg)
5033 continue;
5034 auto P = BestFormulae.insert(KV: {{F.ScaledReg, F.Scale}, FIdx});
5035 if (P.second)
5036 continue;
5037
5038 Formula &Best = LU.Formulae[P.first->second];
5039 if (IsBetterThan(F, Best))
5040 std::swap(a&: F, b&: Best);
5041 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5042 dbgs() << "\n"
5043 " in favor of formula ";
5044 Best.print(dbgs()); dbgs() << '\n');
5045#ifndef NDEBUG
5046 ChangedFormulae = true;
5047#endif
5048 LU.DeleteFormula(F);
5049 --FIdx;
5050 --NumForms;
5051 Any = true;
5052 }
5053 if (Any)
5054 LU.RecomputeRegs(LUIdx, RegUses);
5055
5056 // Reset this to prepare for the next use.
5057 BestFormulae.clear();
5058 }
5059
5060 LLVM_DEBUG(if (ChangedFormulae) {
5061 dbgs() << "\n"
5062 "After filtering out undesirable candidates:\n";
5063 print_uses(dbgs());
5064 });
5065}
5066
5067/// If we are over the complexity limit, filter out any post-inc prefering
5068/// variables to only post-inc values.
5069void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5070 if (AMK != TTI::AMK_PostIndexed)
5071 return;
5072 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5073 return;
5074
5075 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5076 "Narrowing the search space by choosing the lowest "
5077 "register Formula for PostInc Uses.\n");
5078
5079 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5080 LSRUse &LU = Uses[LUIdx];
5081
5082 if (LU.Kind != LSRUse::Address)
5083 continue;
5084 if (!TTI.isIndexedLoadLegal(Mode: TTI.MIM_PostInc, Ty: LU.AccessTy.getType()) &&
5085 !TTI.isIndexedStoreLegal(Mode: TTI.MIM_PostInc, Ty: LU.AccessTy.getType()))
5086 continue;
5087
5088 size_t MinRegs = std::numeric_limits<size_t>::max();
5089 for (const Formula &F : LU.Formulae)
5090 MinRegs = std::min(a: F.getNumRegs(), b: MinRegs);
5091
5092 bool Any = false;
5093 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5094 ++FIdx) {
5095 Formula &F = LU.Formulae[FIdx];
5096 if (F.getNumRegs() > MinRegs) {
5097 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5098 dbgs() << "\n");
5099 LU.DeleteFormula(F);
5100 --FIdx;
5101 --NumForms;
5102 Any = true;
5103 }
5104 }
5105 if (Any)
5106 LU.RecomputeRegs(LUIdx, RegUses);
5107
5108 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5109 break;
5110 }
5111
5112 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5113}
5114
5115/// The function delete formulas with high registers number expectation.
5116/// Assuming we don't know the value of each formula (already delete
5117/// all inefficient), generate probability of not selecting for each
5118/// register.
5119/// For example,
5120/// Use1:
5121/// reg(a) + reg({0,+,1})
5122/// reg(a) + reg({-1,+,1}) + 1
5123/// reg({a,+,1})
5124/// Use2:
5125/// reg(b) + reg({0,+,1})
5126/// reg(b) + reg({-1,+,1}) + 1
5127/// reg({b,+,1})
5128/// Use3:
5129/// reg(c) + reg(b) + reg({0,+,1})
5130/// reg(c) + reg({b,+,1})
5131///
5132/// Probability of not selecting
5133/// Use1 Use2 Use3
5134/// reg(a) (1/3) * 1 * 1
5135/// reg(b) 1 * (1/3) * (1/2)
5136/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5137/// reg({-1,+,1}) (2/3) * (2/3) * 1
5138/// reg({a,+,1}) (2/3) * 1 * 1
5139/// reg({b,+,1}) 1 * (2/3) * (2/3)
5140/// reg(c) 1 * 1 * 0
5141///
5142/// Now count registers number mathematical expectation for each formula:
5143/// Note that for each use we exclude probability if not selecting for the use.
5144/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
5145/// probabilty 1/3 of not selecting for Use1).
5146/// Use1:
5147/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5148/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5149/// reg({a,+,1}) 1
5150/// Use2:
5151/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5152/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5153/// reg({b,+,1}) 2/3
5154/// Use3:
5155/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5156/// reg(c) + reg({b,+,1}) 1 + 2/3
5157void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5158 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5159 return;
5160 // Ok, we have too many of formulae on our hands to conveniently handle.
5161 // Use a rough heuristic to thin out the list.
5162
5163 // Set of Regs wich will be 100% used in final solution.
5164 // Used in each formula of a solution (in example above this is reg(c)).
5165 // We can skip them in calculations.
5166 SmallPtrSet<const SCEV *, 4> UniqRegs;
5167 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5168
5169 // Map each register to probability of not selecting
5170 DenseMap <const SCEV *, float> RegNumMap;
5171 for (const SCEV *Reg : RegUses) {
5172 if (UniqRegs.count(Ptr: Reg))
5173 continue;
5174 float PNotSel = 1;
5175 for (const LSRUse &LU : Uses) {
5176 if (!LU.Regs.count(Ptr: Reg))
5177 continue;
5178 float P = LU.getNotSelectedProbability(Reg);
5179 if (P != 0.0)
5180 PNotSel *= P;
5181 else
5182 UniqRegs.insert(Ptr: Reg);
5183 }
5184 RegNumMap.insert(KV: std::make_pair(x&: Reg, y&: PNotSel));
5185 }
5186
5187 LLVM_DEBUG(
5188 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5189
5190 // Delete formulas where registers number expectation is high.
5191 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5192 LSRUse &LU = Uses[LUIdx];
5193 // If nothing to delete - continue.
5194 if (LU.Formulae.size() < 2)
5195 continue;
5196 // This is temporary solution to test performance. Float should be
5197 // replaced with round independent type (based on integers) to avoid
5198 // different results for different target builds.
5199 float FMinRegNum = LU.Formulae[0].getNumRegs();
5200 float FMinARegNum = LU.Formulae[0].getNumRegs();
5201 size_t MinIdx = 0;
5202 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5203 Formula &F = LU.Formulae[i];
5204 float FRegNum = 0;
5205 float FARegNum = 0;
5206 for (const SCEV *BaseReg : F.BaseRegs) {
5207 if (UniqRegs.count(Ptr: BaseReg))
5208 continue;
5209 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(Reg: BaseReg);
5210 if (isa<SCEVAddRecExpr>(Val: BaseReg))
5211 FARegNum +=
5212 RegNumMap[BaseReg] / LU.getNotSelectedProbability(Reg: BaseReg);
5213 }
5214 if (const SCEV *ScaledReg = F.ScaledReg) {
5215 if (!UniqRegs.count(Ptr: ScaledReg)) {
5216 FRegNum +=
5217 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(Reg: ScaledReg);
5218 if (isa<SCEVAddRecExpr>(Val: ScaledReg))
5219 FARegNum +=
5220 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(Reg: ScaledReg);
5221 }
5222 }
5223 if (FMinRegNum > FRegNum ||
5224 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5225 FMinRegNum = FRegNum;
5226 FMinARegNum = FARegNum;
5227 MinIdx = i;
5228 }
5229 }
5230 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5231 dbgs() << " with min reg num " << FMinRegNum << '\n');
5232 if (MinIdx != 0)
5233 std::swap(a&: LU.Formulae[MinIdx], b&: LU.Formulae[0]);
5234 while (LU.Formulae.size() != 1) {
5235 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5236 dbgs() << '\n');
5237 LU.Formulae.pop_back();
5238 }
5239 LU.RecomputeRegs(LUIdx, RegUses);
5240 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5241 Formula &F = LU.Formulae[0];
5242 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5243 // When we choose the formula, the regs become unique.
5244 UniqRegs.insert_range(R&: F.BaseRegs);
5245 if (F.ScaledReg)
5246 UniqRegs.insert(Ptr: F.ScaledReg);
5247 }
5248 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5249}
5250
5251// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5252// would the addressing offset +C would be legal where the negative offset -C is
5253// not.
5254static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5255 ScalarEvolution &SE, const SCEV *Best,
5256 const SCEV *Reg,
5257 MemAccessTy AccessType) {
5258 if (Best->getType() != Reg->getType() ||
5259 (isa<SCEVAddRecExpr>(Val: Best) && isa<SCEVAddRecExpr>(Val: Reg) &&
5260 cast<SCEVAddRecExpr>(Val: Best)->getLoop() !=
5261 cast<SCEVAddRecExpr>(Val: Reg)->getLoop()))
5262 return false;
5263 std::optional<APInt> Diff = SE.computeConstantDifference(LHS: Best, RHS: Reg);
5264 if (!Diff)
5265 return false;
5266
5267 return TTI.isLegalAddressingMode(
5268 Ty: AccessType.MemTy, /*BaseGV=*/nullptr,
5269 /*BaseOffset=*/Diff->getSExtValue(),
5270 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace: AccessType.AddrSpace) &&
5271 !TTI.isLegalAddressingMode(
5272 Ty: AccessType.MemTy, /*BaseGV=*/nullptr,
5273 /*BaseOffset=*/-Diff->getSExtValue(),
5274 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace: AccessType.AddrSpace);
5275}
5276
5277/// Pick a register which seems likely to be profitable, and then in any use
5278/// which has any reference to that register, delete all formulae which do not
5279/// reference that register.
5280void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5281 // With all other options exhausted, loop until the system is simple
5282 // enough to handle.
5283 SmallPtrSet<const SCEV *, 4> Taken;
5284 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5285 // Ok, we have too many of formulae on our hands to conveniently handle.
5286 // Use a rough heuristic to thin out the list.
5287 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5288
5289 // Pick the register which is used by the most LSRUses, which is likely
5290 // to be a good reuse register candidate.
5291 const SCEV *Best = nullptr;
5292 unsigned BestNum = 0;
5293 for (const SCEV *Reg : RegUses) {
5294 if (Taken.count(Ptr: Reg))
5295 continue;
5296 if (!Best) {
5297 Best = Reg;
5298 BestNum = RegUses.getUsedByIndices(Reg).count();
5299 } else {
5300 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5301 if (Count > BestNum) {
5302 Best = Reg;
5303 BestNum = Count;
5304 }
5305
5306 // If the scores are the same, but the Reg is simpler for the target
5307 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5308 // handle +C but not -C), opt for the simpler formula.
5309 if (Count == BestNum) {
5310 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5311 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5312 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5313 AccessType: Uses[LUIdx].AccessTy)) {
5314 Best = Reg;
5315 BestNum = Count;
5316 }
5317 }
5318 }
5319 }
5320 assert(Best && "Failed to find best LSRUse candidate");
5321
5322 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5323 << " will yield profitable reuse.\n");
5324 Taken.insert(Ptr: Best);
5325
5326 // In any use with formulae which references this register, delete formulae
5327 // which don't reference it.
5328 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5329 LSRUse &LU = Uses[LUIdx];
5330 if (!LU.Regs.count(Ptr: Best)) continue;
5331
5332 bool Any = false;
5333 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5334 Formula &F = LU.Formulae[i];
5335 if (!F.referencesReg(S: Best)) {
5336 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5337 LU.DeleteFormula(F);
5338 --e;
5339 --i;
5340 Any = true;
5341 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5342 continue;
5343 }
5344 }
5345
5346 if (Any)
5347 LU.RecomputeRegs(LUIdx, RegUses);
5348 }
5349
5350 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5351 }
5352}
5353
5354/// If there are an extraordinary number of formulae to choose from, use some
5355/// rough heuristics to prune down the number of formulae. This keeps the main
5356/// solver from taking an extraordinary amount of time in some worst-case
5357/// scenarios.
5358void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5359 NarrowSearchSpaceByDetectingSupersets();
5360 NarrowSearchSpaceByCollapsingUnrolledCode();
5361 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5362 if (FilterSameScaledReg)
5363 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5364 NarrowSearchSpaceByFilterPostInc();
5365 if (LSRExpNarrow)
5366 NarrowSearchSpaceByDeletingCostlyFormulas();
5367 else
5368 NarrowSearchSpaceByPickingWinnerRegs();
5369}
5370
5371/// This is the recursive solver.
5372void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5373 Cost &SolutionCost,
5374 SmallVectorImpl<const Formula *> &Workspace,
5375 const Cost &CurCost,
5376 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5377 DenseSet<const SCEV *> &VisitedRegs) const {
5378 // Some ideas:
5379 // - prune more:
5380 // - use more aggressive filtering
5381 // - sort the formula so that the most profitable solutions are found first
5382 // - sort the uses too
5383 // - search faster:
5384 // - don't compute a cost, and then compare. compare while computing a cost
5385 // and bail early.
5386 // - track register sets with SmallBitVector
5387
5388 const LSRUse &LU = Uses[Workspace.size()];
5389
5390 // If this use references any register that's already a part of the
5391 // in-progress solution, consider it a requirement that a formula must
5392 // reference that register in order to be considered. This prunes out
5393 // unprofitable searching.
5394 SmallSetVector<const SCEV *, 4> ReqRegs;
5395 for (const SCEV *S : CurRegs)
5396 if (LU.Regs.count(Ptr: S))
5397 ReqRegs.insert(X: S);
5398
5399 SmallPtrSet<const SCEV *, 16> NewRegs;
5400 Cost NewCost(L, SE, TTI, AMK);
5401 for (const Formula &F : LU.Formulae) {
5402 // Ignore formulae which may not be ideal in terms of register reuse of
5403 // ReqRegs. The formula should use all required registers before
5404 // introducing new ones.
5405 // This can sometimes (notably when trying to favour postinc) lead to
5406 // sub-optimial decisions. There it is best left to the cost modelling to
5407 // get correct.
5408 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5409 int NumReqRegsToFind = std::min(a: F.getNumRegs(), b: ReqRegs.size());
5410 for (const SCEV *Reg : ReqRegs) {
5411 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5412 is_contained(Range: F.BaseRegs, Element: Reg)) {
5413 --NumReqRegsToFind;
5414 if (NumReqRegsToFind == 0)
5415 break;
5416 }
5417 }
5418 if (NumReqRegsToFind != 0) {
5419 // If none of the formulae satisfied the required registers, then we could
5420 // clear ReqRegs and try again. Currently, we simply give up in this case.
5421 continue;
5422 }
5423 }
5424
5425 // Evaluate the cost of the current formula. If it's already worse than
5426 // the current best, prune the search at that point.
5427 NewCost = CurCost;
5428 NewRegs = CurRegs;
5429 NewCost.RateFormula(F, Regs&: NewRegs, VisitedRegs, LU);
5430 if (NewCost.isLess(Other: SolutionCost)) {
5431 Workspace.push_back(Elt: &F);
5432 if (Workspace.size() != Uses.size()) {
5433 SolveRecurse(Solution, SolutionCost, Workspace, CurCost: NewCost,
5434 CurRegs: NewRegs, VisitedRegs);
5435 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5436 VisitedRegs.insert(V: F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5437 } else {
5438 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5439 dbgs() << ".\nRegs:\n";
5440 for (const SCEV *S : NewRegs) dbgs()
5441 << "- " << *S << "\n";
5442 dbgs() << '\n');
5443
5444 SolutionCost = NewCost;
5445 Solution = Workspace;
5446 }
5447 Workspace.pop_back();
5448 }
5449 }
5450}
5451
5452/// Choose one formula from each use. Return the results in the given Solution
5453/// vector.
5454void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5455 SmallVector<const Formula *, 8> Workspace;
5456 Cost SolutionCost(L, SE, TTI, AMK);
5457 SolutionCost.Lose();
5458 Cost CurCost(L, SE, TTI, AMK);
5459 SmallPtrSet<const SCEV *, 16> CurRegs;
5460 DenseSet<const SCEV *> VisitedRegs;
5461 Workspace.reserve(N: Uses.size());
5462
5463 // SolveRecurse does all the work.
5464 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5465 CurRegs, VisitedRegs);
5466 if (Solution.empty()) {
5467 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5468 return;
5469 }
5470
5471 // Ok, we've now made all our decisions.
5472 LLVM_DEBUG(dbgs() << "\n"
5473 "The chosen solution requires ";
5474 SolutionCost.print(dbgs()); dbgs() << ":\n";
5475 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5476 dbgs() << " ";
5477 Uses[i].print(dbgs());
5478 dbgs() << "\n"
5479 " ";
5480 Solution[i]->print(dbgs());
5481 dbgs() << '\n';
5482 });
5483
5484 assert(Solution.size() == Uses.size() && "Malformed solution!");
5485
5486 const bool EnableDropUnprofitableSolution = [&] {
5487 switch (AllowDropSolutionIfLessProfitable) {
5488 case cl::BOU_TRUE:
5489 return true;
5490 case cl::BOU_FALSE:
5491 return false;
5492 case cl::BOU_UNSET:
5493 return TTI.shouldDropLSRSolutionIfLessProfitable();
5494 }
5495 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5496 }();
5497
5498 if (BaselineCost.isLess(Other: SolutionCost)) {
5499 if (!EnableDropUnprofitableSolution)
5500 LLVM_DEBUG(
5501 dbgs() << "Baseline is more profitable than chosen solution, "
5502 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5503 else {
5504 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5505 "solution, dropping LSR solution.\n";);
5506 Solution.clear();
5507 }
5508 }
5509}
5510
5511/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as
5512/// we can go while still being dominated by the input positions. This helps
5513/// canonicalize the insert position, which encourages sharing.
5514BasicBlock::iterator
5515LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5516 const SmallVectorImpl<Instruction *> &Inputs)
5517 const {
5518 Instruction *Tentative = &*IP;
5519 while (true) {
5520 bool AllDominate = true;
5521 Instruction *BetterPos = nullptr;
5522 // Don't bother attempting to insert before a catchswitch, their basic block
5523 // cannot have other non-PHI instructions.
5524 if (isa<CatchSwitchInst>(Val: Tentative))
5525 return IP;
5526
5527 for (Instruction *Inst : Inputs) {
5528 if (Inst == Tentative || !DT.dominates(Def: Inst, User: Tentative)) {
5529 AllDominate = false;
5530 break;
5531 }
5532 // Attempt to find an insert position in the middle of the block,
5533 // instead of at the end, so that it can be used for other expansions.
5534 if (Tentative->getParent() == Inst->getParent() &&
5535 (!BetterPos || !DT.dominates(Def: Inst, User: BetterPos)))
5536 BetterPos = &*std::next(x: BasicBlock::iterator(Inst));
5537 }
5538 if (!AllDominate)
5539 break;
5540 if (BetterPos)
5541 IP = BetterPos->getIterator();
5542 else
5543 IP = Tentative->getIterator();
5544
5545 const Loop *IPLoop = LI.getLoopFor(BB: IP->getParent());
5546 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5547
5548 BasicBlock *IDom;
5549 for (DomTreeNode *Rung = DT.getNode(BB: IP->getParent()); ; ) {
5550 if (!Rung) return IP;
5551 Rung = Rung->getIDom();
5552 if (!Rung) return IP;
5553 IDom = Rung->getBlock();
5554
5555 // Don't climb into a loop though.
5556 const Loop *IDomLoop = LI.getLoopFor(BB: IDom);
5557 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5558 if (IDomDepth <= IPLoopDepth &&
5559 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5560 break;
5561 }
5562
5563 Tentative = IDom->getTerminator();
5564 }
5565
5566 return IP;
5567}
5568
5569/// Determine an input position which will be dominated by the operands and
5570/// which will dominate the result.
5571BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5572 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5573 // Collect some instructions which must be dominated by the
5574 // expanding replacement. These must be dominated by any operands that
5575 // will be required in the expansion.
5576 SmallVector<Instruction *, 4> Inputs;
5577 if (Instruction *I = dyn_cast<Instruction>(Val: LF.OperandValToReplace))
5578 Inputs.push_back(Elt: I);
5579 if (LU.Kind == LSRUse::ICmpZero)
5580 if (Instruction *I =
5581 dyn_cast<Instruction>(Val: cast<ICmpInst>(Val: LF.UserInst)->getOperand(i_nocapture: 1)))
5582 Inputs.push_back(Elt: I);
5583 if (LF.PostIncLoops.count(Ptr: L)) {
5584 if (LF.isUseFullyOutsideLoop(L))
5585 Inputs.push_back(Elt: L->getLoopLatch()->getTerminator());
5586 else
5587 Inputs.push_back(Elt: IVIncInsertPos);
5588 }
5589 // The expansion must also be dominated by the increment positions of any
5590 // loops it for which it is using post-inc mode.
5591 for (const Loop *PIL : LF.PostIncLoops) {
5592 if (PIL == L) continue;
5593
5594 // Be dominated by the loop exit.
5595 SmallVector<BasicBlock *, 4> ExitingBlocks;
5596 PIL->getExitingBlocks(ExitingBlocks);
5597 if (!ExitingBlocks.empty()) {
5598 BasicBlock *BB = ExitingBlocks[0];
5599 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5600 BB = DT.findNearestCommonDominator(A: BB, B: ExitingBlocks[i]);
5601 Inputs.push_back(Elt: BB->getTerminator());
5602 }
5603 }
5604
5605 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5606 "Insertion point must be a normal instruction");
5607
5608 // Then, climb up the immediate dominator tree as far as we can go while
5609 // still being dominated by the input positions.
5610 BasicBlock::iterator IP = HoistInsertPosition(IP: LowestIP, Inputs);
5611
5612 // Don't insert instructions before PHI nodes.
5613 while (isa<PHINode>(Val: IP)) ++IP;
5614
5615 // Ignore landingpad instructions.
5616 while (IP->isEHPad()) ++IP;
5617
5618 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5619 // IP consistent across expansions and allows the previously inserted
5620 // instructions to be reused by subsequent expansion.
5621 while (Rewriter.isInsertedInstruction(I: &*IP) && IP != LowestIP)
5622 ++IP;
5623
5624 return IP;
5625}
5626
5627/// Emit instructions for the leading candidate expression for this LSRUse (this
5628/// is called "expanding").
5629Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5630 const Formula &F, BasicBlock::iterator IP,
5631 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5632 if (LU.RigidFormula)
5633 return LF.OperandValToReplace;
5634
5635 // Determine an input position which will be dominated by the operands and
5636 // which will dominate the result.
5637 IP = AdjustInsertPositionForExpand(LowestIP: IP, LF, LU);
5638 Rewriter.setInsertPoint(&*IP);
5639
5640 // Inform the Rewriter if we have a post-increment use, so that it can
5641 // perform an advantageous expansion.
5642 Rewriter.setPostInc(LF.PostIncLoops);
5643
5644 // This is the type that the user actually needs.
5645 Type *OpTy = LF.OperandValToReplace->getType();
5646 // This will be the type that we'll initially expand to.
5647 Type *Ty = F.getType();
5648 if (!Ty)
5649 // No type known; just expand directly to the ultimate type.
5650 Ty = OpTy;
5651 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(Ty: OpTy))
5652 // Expand directly to the ultimate type if it's the right size.
5653 Ty = OpTy;
5654 // This is the type to do integer arithmetic in.
5655 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5656
5657 // Build up a list of operands to add together to form the full base.
5658 SmallVector<const SCEV *, 8> Ops;
5659
5660 // Expand the BaseRegs portion.
5661 for (const SCEV *Reg : F.BaseRegs) {
5662 assert(!Reg->isZero() && "Zero allocated in a base register!");
5663
5664 // If we're expanding for a post-inc user, make the post-inc adjustment.
5665 Reg = denormalizeForPostIncUse(S: Reg, Loops: LF.PostIncLoops, SE);
5666 Ops.push_back(Elt: SE.getUnknown(V: Rewriter.expandCodeFor(SH: Reg, Ty: nullptr)));
5667 }
5668
5669 // Expand the ScaledReg portion.
5670 Value *ICmpScaledV = nullptr;
5671 if (F.Scale != 0) {
5672 const SCEV *ScaledS = F.ScaledReg;
5673
5674 // If we're expanding for a post-inc user, make the post-inc adjustment.
5675 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5676 ScaledS = denormalizeForPostIncUse(S: ScaledS, Loops, SE);
5677
5678 if (LU.Kind == LSRUse::ICmpZero) {
5679 // Expand ScaleReg as if it was part of the base regs.
5680 if (F.Scale == 1)
5681 Ops.push_back(
5682 Elt: SE.getUnknown(V: Rewriter.expandCodeFor(SH: ScaledS, Ty: nullptr)));
5683 else {
5684 // An interesting way of "folding" with an icmp is to use a negated
5685 // scale, which we'll implement by inserting it into the other operand
5686 // of the icmp.
5687 assert(F.Scale == -1 &&
5688 "The only scale supported by ICmpZero uses is -1!");
5689 ICmpScaledV = Rewriter.expandCodeFor(SH: ScaledS, Ty: nullptr);
5690 }
5691 } else {
5692 // Otherwise just expand the scaled register and an explicit scale,
5693 // which is expected to be matched as part of the address.
5694
5695 // Flush the operand list to suppress SCEVExpander hoisting address modes.
5696 // Unless the addressing mode will not be folded.
5697 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5698 isAMCompletelyFolded(TTI, LU, F)) {
5699 Value *FullV = Rewriter.expandCodeFor(SH: SE.getAddExpr(Ops), Ty: nullptr);
5700 Ops.clear();
5701 Ops.push_back(Elt: SE.getUnknown(V: FullV));
5702 }
5703 ScaledS = SE.getUnknown(V: Rewriter.expandCodeFor(SH: ScaledS, Ty: nullptr));
5704 if (F.Scale != 1)
5705 ScaledS =
5706 SE.getMulExpr(LHS: ScaledS, RHS: SE.getConstant(Ty: ScaledS->getType(), V: F.Scale));
5707 Ops.push_back(Elt: ScaledS);
5708 }
5709 }
5710
5711 // Expand the GV portion.
5712 if (F.BaseGV) {
5713 // Flush the operand list to suppress SCEVExpander hoisting.
5714 if (!Ops.empty()) {
5715 Value *FullV = Rewriter.expandCodeFor(SH: SE.getAddExpr(Ops), Ty: IntTy);
5716 Ops.clear();
5717 Ops.push_back(Elt: SE.getUnknown(V: FullV));
5718 }
5719 Ops.push_back(Elt: SE.getUnknown(V: F.BaseGV));
5720 }
5721
5722 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5723 // unfolded offsets. LSR assumes they both live next to their uses.
5724 if (!Ops.empty()) {
5725 Value *FullV = Rewriter.expandCodeFor(SH: SE.getAddExpr(Ops), Ty);
5726 Ops.clear();
5727 Ops.push_back(Elt: SE.getUnknown(V: FullV));
5728 }
5729
5730 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5731 // out at this point, or should we generate a SCEV adding together mixed
5732 // offsets?
5733 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5734 "Expanding mismatched offsets\n");
5735 // Expand the immediate portion.
5736 Immediate Offset = F.BaseOffset.addUnsigned(RHS: LF.Offset);
5737 if (Offset.isNonZero()) {
5738 if (LU.Kind == LSRUse::ICmpZero) {
5739 // The other interesting way of "folding" with an ICmpZero is to use a
5740 // negated immediate.
5741 if (!ICmpScaledV)
5742 ICmpScaledV =
5743 ConstantInt::get(Ty: IntTy, V: -(uint64_t)Offset.getFixedValue());
5744 else {
5745 Ops.push_back(Elt: SE.getUnknown(V: ICmpScaledV));
5746 ICmpScaledV = ConstantInt::get(Ty: IntTy, V: Offset.getFixedValue());
5747 }
5748 } else {
5749 // Just add the immediate values. These again are expected to be matched
5750 // as part of the address.
5751 Ops.push_back(Elt: Offset.getUnknownSCEV(SE, Ty: IntTy));
5752 }
5753 }
5754
5755 // Expand the unfolded offset portion.
5756 Immediate UnfoldedOffset = F.UnfoldedOffset;
5757 if (UnfoldedOffset.isNonZero()) {
5758 // Just add the immediate values.
5759 Ops.push_back(Elt: UnfoldedOffset.getUnknownSCEV(SE, Ty: IntTy));
5760 }
5761
5762 // Emit instructions summing all the operands.
5763 const SCEV *FullS = Ops.empty() ?
5764 SE.getConstant(Ty: IntTy, V: 0) :
5765 SE.getAddExpr(Ops);
5766 Value *FullV = Rewriter.expandCodeFor(SH: FullS, Ty);
5767
5768 // We're done expanding now, so reset the rewriter.
5769 Rewriter.clearPostInc();
5770
5771 // An ICmpZero Formula represents an ICmp which we're handling as a
5772 // comparison against zero. Now that we've expanded an expression for that
5773 // form, update the ICmp's other operand.
5774 if (LU.Kind == LSRUse::ICmpZero) {
5775 ICmpInst *CI = cast<ICmpInst>(Val: LF.UserInst);
5776 if (auto *OperandIsInstr = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 1)))
5777 DeadInsts.emplace_back(Args&: OperandIsInstr);
5778 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5779 "a scale at the same time!");
5780 if (F.Scale == -1) {
5781 if (ICmpScaledV->getType() != OpTy) {
5782 Instruction *Cast = CastInst::Create(
5783 CastInst::getCastOpcode(Val: ICmpScaledV, SrcIsSigned: false, Ty: OpTy, DstIsSigned: false),
5784 S: ICmpScaledV, Ty: OpTy, Name: "tmp", InsertBefore: CI->getIterator());
5785 ICmpScaledV = Cast;
5786 }
5787 CI->setOperand(i_nocapture: 1, Val_nocapture: ICmpScaledV);
5788 } else {
5789 // A scale of 1 means that the scale has been expanded as part of the
5790 // base regs.
5791 assert((F.Scale == 0 || F.Scale == 1) &&
5792 "ICmp does not support folding a global value and "
5793 "a scale at the same time!");
5794 Constant *C = ConstantInt::getSigned(Ty: SE.getEffectiveSCEVType(Ty: OpTy),
5795 V: -(uint64_t)Offset.getFixedValue());
5796 if (C->getType() != OpTy) {
5797 C = ConstantFoldCastOperand(
5798 Opcode: CastInst::getCastOpcode(Val: C, SrcIsSigned: false, Ty: OpTy, DstIsSigned: false), C, DestTy: OpTy,
5799 DL: CI->getDataLayout());
5800 assert(C && "Cast of ConstantInt should have folded");
5801 }
5802
5803 CI->setOperand(i_nocapture: 1, Val_nocapture: C);
5804 }
5805 }
5806
5807 return FullV;
5808}
5809
5810/// Helper for Rewrite. PHI nodes are special because the use of their operands
5811/// effectively happens in their predecessor blocks, so the expression may need
5812/// to be expanded in multiple places.
5813void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5814 const LSRFixup &LF, const Formula &F,
5815 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5816 DenseMap<BasicBlock *, Value *> Inserted;
5817
5818 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5819 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5820 bool needUpdateFixups = false;
5821 BasicBlock *BB = PN->getIncomingBlock(i);
5822
5823 // If this is a critical edge, split the edge so that we do not insert
5824 // the code on all predecessor/successor paths. We do this unless this
5825 // is the canonical backedge for this loop, which complicates post-inc
5826 // users.
5827 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5828 !isa<IndirectBrInst>(Val: BB->getTerminator()) &&
5829 !isa<CatchSwitchInst>(Val: BB->getTerminator())) {
5830 BasicBlock *Parent = PN->getParent();
5831 Loop *PNLoop = LI.getLoopFor(BB: Parent);
5832 if (!PNLoop || Parent != PNLoop->getHeader()) {
5833 // Split the critical edge.
5834 BasicBlock *NewBB = nullptr;
5835 if (!Parent->isLandingPad()) {
5836 NewBB =
5837 SplitCriticalEdge(Src: BB, Dst: Parent,
5838 Options: CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5839 .setMergeIdenticalEdges()
5840 .setKeepOneInputPHIs());
5841 } else {
5842 SmallVector<BasicBlock*, 2> NewBBs;
5843 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5844 SplitLandingPadPredecessors(OrigBB: Parent, Preds: BB, Suffix: "", Suffix2: "", NewBBs, DTU: &DTU, LI: &LI);
5845 NewBB = NewBBs[0];
5846 }
5847 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5848 // phi predecessors are identical. The simple thing to do is skip
5849 // splitting in this case rather than complicate the API.
5850 if (NewBB) {
5851 // If PN is outside of the loop and BB is in the loop, we want to
5852 // move the block to be immediately before the PHI block, not
5853 // immediately after BB.
5854 if (L->contains(BB) && !L->contains(Inst: PN))
5855 NewBB->moveBefore(MovePos: PN->getParent());
5856
5857 // Splitting the edge can reduce the number of PHI entries we have.
5858 e = PN->getNumIncomingValues();
5859 BB = NewBB;
5860 i = PN->getBasicBlockIndex(BB);
5861
5862 needUpdateFixups = true;
5863 }
5864 }
5865 }
5866
5867 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5868 Inserted.try_emplace(Key: BB);
5869 if (!Pair.second)
5870 PN->setIncomingValue(i, V: Pair.first->second);
5871 else {
5872 Value *FullV =
5873 Expand(LU, LF, F, IP: BB->getTerminator()->getIterator(), DeadInsts);
5874
5875 // If this is reuse-by-noop-cast, insert the noop cast.
5876 Type *OpTy = LF.OperandValToReplace->getType();
5877 if (FullV->getType() != OpTy)
5878 FullV = CastInst::Create(
5879 CastInst::getCastOpcode(Val: FullV, SrcIsSigned: false, Ty: OpTy, DstIsSigned: false), S: FullV,
5880 Ty: LF.OperandValToReplace->getType(), Name: "tmp",
5881 InsertBefore: BB->getTerminator()->getIterator());
5882
5883 // If the incoming block for this value is not in the loop, it means the
5884 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5885 // the inserted value.
5886 if (auto *I = dyn_cast<Instruction>(Val: FullV))
5887 if (L->contains(Inst: I) && !L->contains(BB))
5888 InsertedNonLCSSAInsts.insert(X: I);
5889
5890 PN->setIncomingValue(i, V: FullV);
5891 Pair.first->second = FullV;
5892 }
5893
5894 // If LSR splits critical edge and phi node has other pending
5895 // fixup operands, we need to update those pending fixups. Otherwise
5896 // formulae will not be implemented completely and some instructions
5897 // will not be eliminated.
5898 if (needUpdateFixups) {
5899 for (LSRUse &LU : Uses)
5900 for (LSRFixup &Fixup : LU.Fixups)
5901 // If fixup is supposed to rewrite some operand in the phi
5902 // that was just updated, it may be already moved to
5903 // another phi node. Such fixup requires update.
5904 if (Fixup.UserInst == PN) {
5905 // Check if the operand we try to replace still exists in the
5906 // original phi.
5907 bool foundInOriginalPHI = false;
5908 for (const auto &val : PN->incoming_values())
5909 if (val == Fixup.OperandValToReplace) {
5910 foundInOriginalPHI = true;
5911 break;
5912 }
5913
5914 // If fixup operand found in original PHI - nothing to do.
5915 if (foundInOriginalPHI)
5916 continue;
5917
5918 // Otherwise it might be moved to another PHI and requires update.
5919 // If fixup operand not found in any of the incoming blocks that
5920 // means we have already rewritten it - nothing to do.
5921 for (const auto &Block : PN->blocks())
5922 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(Val: I);
5923 ++I) {
5924 PHINode *NewPN = cast<PHINode>(Val&: I);
5925 for (const auto &val : NewPN->incoming_values())
5926 if (val == Fixup.OperandValToReplace)
5927 Fixup.UserInst = NewPN;
5928 }
5929 }
5930 }
5931 }
5932}
5933
5934/// Emit instructions for the leading candidate expression for this LSRUse (this
5935/// is called "expanding"), and update the UserInst to reference the newly
5936/// expanded value.
5937void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5938 const Formula &F,
5939 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5940 // First, find an insertion point that dominates UserInst. For PHI nodes,
5941 // find the nearest block which dominates all the relevant uses.
5942 if (PHINode *PN = dyn_cast<PHINode>(Val: LF.UserInst)) {
5943 RewriteForPHI(PN, LU, LF, F, DeadInsts);
5944 } else {
5945 Value *FullV = Expand(LU, LF, F, IP: LF.UserInst->getIterator(), DeadInsts);
5946
5947 // If this is reuse-by-noop-cast, insert the noop cast.
5948 Type *OpTy = LF.OperandValToReplace->getType();
5949 if (FullV->getType() != OpTy) {
5950 Instruction *Cast =
5951 CastInst::Create(CastInst::getCastOpcode(Val: FullV, SrcIsSigned: false, Ty: OpTy, DstIsSigned: false),
5952 S: FullV, Ty: OpTy, Name: "tmp", InsertBefore: LF.UserInst->getIterator());
5953 FullV = Cast;
5954 }
5955
5956 // Update the user. ICmpZero is handled specially here (for now) because
5957 // Expand may have updated one of the operands of the icmp already, and
5958 // its new value may happen to be equal to LF.OperandValToReplace, in
5959 // which case doing replaceUsesOfWith leads to replacing both operands
5960 // with the same value. TODO: Reorganize this.
5961 if (LU.Kind == LSRUse::ICmpZero)
5962 LF.UserInst->setOperand(i: 0, Val: FullV);
5963 else
5964 LF.UserInst->replaceUsesOfWith(From: LF.OperandValToReplace, To: FullV);
5965 }
5966
5967 if (auto *OperandIsInstr = dyn_cast<Instruction>(Val: LF.OperandValToReplace))
5968 DeadInsts.emplace_back(Args&: OperandIsInstr);
5969}
5970
5971// Trying to hoist the IVInc to loop header if all IVInc users are in
5972// the loop header. It will help backend to generate post index load/store
5973// when the latch block is different from loop header block.
5974static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
5975 const LSRUse &LU, Instruction *IVIncInsertPos,
5976 Loop *L) {
5977 if (LU.Kind != LSRUse::Address)
5978 return false;
5979
5980 // For now this code do the conservative optimization, only work for
5981 // the header block. Later we can hoist the IVInc to the block post
5982 // dominate all users.
5983 BasicBlock *LHeader = L->getHeader();
5984 if (IVIncInsertPos->getParent() == LHeader)
5985 return false;
5986
5987 if (!Fixup.OperandValToReplace ||
5988 any_of(Range: Fixup.OperandValToReplace->users(), P: [&LHeader](User *U) {
5989 Instruction *UI = cast<Instruction>(Val: U);
5990 return UI->getParent() != LHeader;
5991 }))
5992 return false;
5993
5994 Instruction *I = Fixup.UserInst;
5995 Type *Ty = I->getType();
5996 return (isa<LoadInst>(Val: I) && TTI.isIndexedLoadLegal(Mode: TTI.MIM_PostInc, Ty)) ||
5997 (isa<StoreInst>(Val: I) && TTI.isIndexedStoreLegal(Mode: TTI.MIM_PostInc, Ty));
5998}
5999
6000/// Rewrite all the fixup locations with new values, following the chosen
6001/// solution.
6002void LSRInstance::ImplementSolution(
6003 const SmallVectorImpl<const Formula *> &Solution) {
6004 // Keep track of instructions we may have made dead, so that
6005 // we can remove them after we are done working.
6006 SmallVector<WeakTrackingVH, 16> DeadInsts;
6007
6008 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6009 for (const IVChain &Chain : IVChainVec) {
6010 if (PHINode *PN = dyn_cast<PHINode>(Val: Chain.tailUserInst()))
6011 Rewriter.setChainedPhi(PN);
6012 }
6013
6014 // Expand the new value definitions and update the users.
6015 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6016 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6017 Instruction *InsertPos =
6018 canHoistIVInc(TTI, Fixup, LU: Uses[LUIdx], IVIncInsertPos, L)
6019 ? L->getHeader()->getTerminator()
6020 : IVIncInsertPos;
6021 Rewriter.setIVIncInsertPos(L, Pos: InsertPos);
6022 Rewrite(LU: Uses[LUIdx], LF: Fixup, F: *Solution[LUIdx], DeadInsts);
6023 Changed = true;
6024 }
6025
6026 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6027 formLCSSAForInstructions(Worklist&: InsertedInsts, DT, LI, SE: &SE);
6028
6029 for (const IVChain &Chain : IVChainVec) {
6030 GenerateIVChain(Chain, DeadInsts);
6031 Changed = true;
6032 }
6033
6034 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6035 if (IV && dyn_cast<Instruction>(Val: &*IV)->getParent())
6036 ScalarEvolutionIVs.push_back(Elt: IV);
6037
6038 // Clean up after ourselves. This must be done before deleting any
6039 // instructions.
6040 Rewriter.clear();
6041
6042 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6043 TLI: &TLI, MSSAU);
6044
6045 // In our cost analysis above, we assume that each addrec consumes exactly
6046 // one register, and arrange to have increments inserted just before the
6047 // latch to maximimize the chance this is true. However, if we reused
6048 // existing IVs, we now need to move the increments to match our
6049 // expectations. Otherwise, our cost modeling results in us having a
6050 // chosen a non-optimal result for the actual schedule. (And yes, this
6051 // scheduling decision does impact later codegen.)
6052 for (PHINode &PN : L->getHeader()->phis()) {
6053 BinaryOperator *BO = nullptr;
6054 Value *Start = nullptr, *Step = nullptr;
6055 if (!matchSimpleRecurrence(P: &PN, BO, Start, Step))
6056 continue;
6057
6058 switch (BO->getOpcode()) {
6059 case Instruction::Sub:
6060 if (BO->getOperand(i_nocapture: 0) != &PN)
6061 // sub is non-commutative - match handling elsewhere in LSR
6062 continue;
6063 break;
6064 case Instruction::Add:
6065 break;
6066 default:
6067 continue;
6068 };
6069
6070 if (!isa<Constant>(Val: Step))
6071 // If not a constant step, might increase register pressure
6072 // (We assume constants have been canonicalized to RHS)
6073 continue;
6074
6075 if (BO->getParent() == IVIncInsertPos->getParent())
6076 // Only bother moving across blocks. Isel can handle block local case.
6077 continue;
6078
6079 // Can we legally schedule inc at the desired point?
6080 if (!llvm::all_of(Range: BO->uses(),
6081 P: [&](Use &U) {return DT.dominates(Def: IVIncInsertPos, U);}))
6082 continue;
6083 BO->moveBefore(InsertPos: IVIncInsertPos->getIterator());
6084 Changed = true;
6085 }
6086
6087
6088}
6089
6090LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6091 DominatorTree &DT, LoopInfo &LI,
6092 const TargetTransformInfo &TTI, AssumptionCache &AC,
6093 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6094 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6095 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6096 ? PreferredAddresingMode
6097 : TTI.getPreferredAddressingMode(L, SE: &SE)),
6098 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6099 BaselineCost(L, SE, TTI, AMK) {
6100 // If LoopSimplify form is not available, stay out of trouble.
6101 if (!L->isLoopSimplifyForm())
6102 return;
6103
6104 // If there's no interesting work to be done, bail early.
6105 if (IU.empty()) return;
6106
6107 // If there's too much analysis to be done, bail early. We won't be able to
6108 // model the problem anyway.
6109 unsigned NumUsers = 0;
6110 for (const IVStrideUse &U : IU) {
6111 if (++NumUsers > MaxIVUsers) {
6112 (void)U;
6113 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6114 << "\n");
6115 return;
6116 }
6117 // Bail out if we have a PHI on an EHPad that gets a value from a
6118 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6119 // no good place to stick any instructions.
6120 if (auto *PN = dyn_cast<PHINode>(Val: U.getUser())) {
6121 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6122 if (isa<FuncletPadInst>(Val: FirstNonPHI) ||
6123 isa<CatchSwitchInst>(Val: FirstNonPHI))
6124 for (BasicBlock *PredBB : PN->blocks())
6125 if (isa<CatchSwitchInst>(Val: PredBB->getFirstNonPHIIt()))
6126 return;
6127 }
6128 }
6129
6130 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6131 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6132 dbgs() << ":\n");
6133
6134 // Configure SCEVExpander already now, so the correct mode is used for
6135 // isSafeToExpand() checks.
6136#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6137 Rewriter.setDebugType(DEBUG_TYPE);
6138#endif
6139 Rewriter.disableCanonicalMode();
6140 Rewriter.enableLSRMode();
6141
6142 // First, perform some low-level loop optimizations.
6143 OptimizeShadowIV();
6144 OptimizeLoopTermCond();
6145
6146 // If loop preparation eliminates all interesting IV users, bail.
6147 if (IU.empty()) return;
6148
6149 // Skip nested loops until we can model them better with formulae.
6150 if (!L->isInnermost()) {
6151 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6152 return;
6153 }
6154
6155 // Start collecting data and preparing for the solver.
6156 // If number of registers is not the major cost, we cannot benefit from the
6157 // current profitable chain optimization which is based on number of
6158 // registers.
6159 // FIXME: add profitable chain optimization for other kinds major cost, for
6160 // example number of instructions.
6161 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6162 CollectChains();
6163 CollectInterestingTypesAndFactors();
6164 CollectFixupsAndInitialFormulae();
6165 CollectLoopInvariantFixupsAndFormulae();
6166
6167 if (Uses.empty())
6168 return;
6169
6170 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6171 print_uses(dbgs()));
6172 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6173 BaselineCost.print(dbgs()); dbgs() << "\n");
6174
6175 // Now use the reuse data to generate a bunch of interesting ways
6176 // to formulate the values needed for the uses.
6177 GenerateAllReuseFormulae();
6178
6179 FilterOutUndesirableDedicatedRegisters();
6180 NarrowSearchSpaceUsingHeuristics();
6181
6182 SmallVector<const Formula *, 8> Solution;
6183 Solve(Solution);
6184
6185 // Release memory that is no longer needed.
6186 Factors.clear();
6187 Types.clear();
6188 RegUses.clear();
6189
6190 if (Solution.empty())
6191 return;
6192
6193#ifndef NDEBUG
6194 // Formulae should be legal.
6195 for (const LSRUse &LU : Uses) {
6196 for (const Formula &F : LU.Formulae)
6197 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6198 F) && "Illegal formula generated!");
6199 };
6200#endif
6201
6202 // Now that we've decided what we want, make it so.
6203 ImplementSolution(Solution);
6204}
6205
6206#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6207void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6208 if (Factors.empty() && Types.empty()) return;
6209
6210 OS << "LSR has identified the following interesting factors and types: ";
6211 bool First = true;
6212
6213 for (int64_t Factor : Factors) {
6214 if (!First) OS << ", ";
6215 First = false;
6216 OS << '*' << Factor;
6217 }
6218
6219 for (Type *Ty : Types) {
6220 if (!First) OS << ", ";
6221 First = false;
6222 OS << '(' << *Ty << ')';
6223 }
6224 OS << '\n';
6225}
6226
6227void LSRInstance::print_fixups(raw_ostream &OS) const {
6228 OS << "LSR is examining the following fixup sites:\n";
6229 for (const LSRUse &LU : Uses)
6230 for (const LSRFixup &LF : LU.Fixups) {
6231 dbgs() << " ";
6232 LF.print(OS);
6233 OS << '\n';
6234 }
6235}
6236
6237void LSRInstance::print_uses(raw_ostream &OS) const {
6238 OS << "LSR is examining the following uses:\n";
6239 for (const LSRUse &LU : Uses) {
6240 dbgs() << " ";
6241 LU.print(OS);
6242 OS << '\n';
6243 for (const Formula &F : LU.Formulae) {
6244 OS << " ";
6245 F.print(OS);
6246 OS << '\n';
6247 }
6248 }
6249}
6250
6251void LSRInstance::print(raw_ostream &OS) const {
6252 print_factors_and_types(OS);
6253 print_fixups(OS);
6254 print_uses(OS);
6255}
6256
6257LLVM_DUMP_METHOD void LSRInstance::dump() const {
6258 print(errs()); errs() << '\n';
6259}
6260#endif
6261
6262namespace {
6263
6264class LoopStrengthReduce : public LoopPass {
6265public:
6266 static char ID; // Pass ID, replacement for typeid
6267
6268 LoopStrengthReduce();
6269
6270private:
6271 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6272 void getAnalysisUsage(AnalysisUsage &AU) const override;
6273};
6274
6275} // end anonymous namespace
6276
6277LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6278 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6279}
6280
6281void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6282 // We split critical edges, so we change the CFG. However, we do update
6283 // many analyses if they are around.
6284 AU.addPreservedID(ID&: LoopSimplifyID);
6285
6286 AU.addRequired<LoopInfoWrapperPass>();
6287 AU.addPreserved<LoopInfoWrapperPass>();
6288 AU.addRequiredID(ID&: LoopSimplifyID);
6289 AU.addRequired<DominatorTreeWrapperPass>();
6290 AU.addPreserved<DominatorTreeWrapperPass>();
6291 AU.addRequired<ScalarEvolutionWrapperPass>();
6292 AU.addPreserved<ScalarEvolutionWrapperPass>();
6293 AU.addRequired<AssumptionCacheTracker>();
6294 AU.addRequired<TargetLibraryInfoWrapperPass>();
6295 // Requiring LoopSimplify a second time here prevents IVUsers from running
6296 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6297 AU.addRequiredID(ID&: LoopSimplifyID);
6298 AU.addRequired<IVUsersWrapperPass>();
6299 AU.addPreserved<IVUsersWrapperPass>();
6300 AU.addRequired<TargetTransformInfoWrapperPass>();
6301 AU.addPreserved<MemorySSAWrapperPass>();
6302}
6303
6304namespace {
6305
6306/// Enables more convenient iteration over a DWARF expression vector.
6307static iterator_range<llvm::DIExpression::expr_op_iterator>
6308ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6309 llvm::DIExpression::expr_op_iterator Begin =
6310 llvm::DIExpression::expr_op_iterator(Expr.begin());
6311 llvm::DIExpression::expr_op_iterator End =
6312 llvm::DIExpression::expr_op_iterator(Expr.end());
6313 return {Begin, End};
6314}
6315
6316struct SCEVDbgValueBuilder {
6317 SCEVDbgValueBuilder() = default;
6318 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6319
6320 void clone(const SCEVDbgValueBuilder &Base) {
6321 LocationOps = Base.LocationOps;
6322 Expr = Base.Expr;
6323 }
6324
6325 void clear() {
6326 LocationOps.clear();
6327 Expr.clear();
6328 }
6329
6330 /// The DIExpression as we translate the SCEV.
6331 SmallVector<uint64_t, 6> Expr;
6332 /// The location ops of the DIExpression.
6333 SmallVector<Value *, 2> LocationOps;
6334
6335 void pushOperator(uint64_t Op) { Expr.push_back(Elt: Op); }
6336 void pushUInt(uint64_t Operand) { Expr.push_back(Elt: Operand); }
6337
6338 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6339 /// in the set of values referenced by the expression.
6340 void pushLocation(llvm::Value *V) {
6341 Expr.push_back(Elt: llvm::dwarf::DW_OP_LLVM_arg);
6342 auto *It = llvm::find(Range&: LocationOps, Val: V);
6343 unsigned ArgIndex = 0;
6344 if (It != LocationOps.end()) {
6345 ArgIndex = std::distance(first: LocationOps.begin(), last: It);
6346 } else {
6347 ArgIndex = LocationOps.size();
6348 LocationOps.push_back(Elt: V);
6349 }
6350 Expr.push_back(Elt: ArgIndex);
6351 }
6352
6353 void pushValue(const SCEVUnknown *U) {
6354 llvm::Value *V = cast<SCEVUnknown>(Val: U)->getValue();
6355 pushLocation(V);
6356 }
6357
6358 bool pushConst(const SCEVConstant *C) {
6359 if (C->getAPInt().getSignificantBits() > 64)
6360 return false;
6361 Expr.push_back(Elt: llvm::dwarf::DW_OP_consts);
6362 Expr.push_back(Elt: C->getAPInt().getSExtValue());
6363 return true;
6364 }
6365
6366 // Iterating the expression as DWARF ops is convenient when updating
6367 // DWARF_OP_LLVM_args.
6368 iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6369 return ToDwarfOpIter(Expr);
6370 }
6371
6372 /// Several SCEV types are sequences of the same arithmetic operator applied
6373 /// to constants and values that may be extended or truncated.
6374 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6375 uint64_t DwarfOp) {
6376 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6377 "Expected arithmetic SCEV type");
6378 bool Success = true;
6379 unsigned EmitOperator = 0;
6380 for (const auto &Op : CommExpr->operands()) {
6381 Success &= pushSCEV(S: Op);
6382
6383 if (EmitOperator >= 1)
6384 pushOperator(Op: DwarfOp);
6385 ++EmitOperator;
6386 }
6387 return Success;
6388 }
6389
6390 // TODO: Identify and omit noop casts.
6391 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6392 const llvm::SCEV *Inner = C->getOperand(i: 0);
6393 const llvm::Type *Type = C->getType();
6394 uint64_t ToWidth = Type->getIntegerBitWidth();
6395 bool Success = pushSCEV(S: Inner);
6396 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6397 IsSigned ? llvm::dwarf::DW_ATE_signed
6398 : llvm::dwarf::DW_ATE_unsigned};
6399 for (const auto &Op : CastOps)
6400 pushOperator(Op);
6401 return Success;
6402 }
6403
6404 // TODO: MinMax - although these haven't been encountered in the test suite.
6405 bool pushSCEV(const llvm::SCEV *S) {
6406 bool Success = true;
6407 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(Val: S)) {
6408 Success &= pushConst(C: StartInt);
6409
6410 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Val: S)) {
6411 if (!U->getValue())
6412 return false;
6413 pushLocation(V: U->getValue());
6414
6415 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(Val: S)) {
6416 Success &= pushArithmeticExpr(CommExpr: MulRec, DwarfOp: llvm::dwarf::DW_OP_mul);
6417
6418 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(Val: S)) {
6419 Success &= pushSCEV(S: UDiv->getLHS());
6420 Success &= pushSCEV(S: UDiv->getRHS());
6421 pushOperator(Op: llvm::dwarf::DW_OP_div);
6422
6423 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(Val: S)) {
6424 // Assert if a new and unknown SCEVCastEXpr type is encountered.
6425 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6426 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6427 "Unexpected cast type in SCEV.");
6428 Success &= pushCast(C: Cast, IsSigned: (isa<SCEVSignExtendExpr>(Val: Cast)));
6429
6430 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(Val: S)) {
6431 Success &= pushArithmeticExpr(CommExpr: AddExpr, DwarfOp: llvm::dwarf::DW_OP_plus);
6432
6433 } else if (isa<SCEVAddRecExpr>(Val: S)) {
6434 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6435 // unsupported.
6436 return false;
6437
6438 } else {
6439 return false;
6440 }
6441 return Success;
6442 }
6443
6444 /// Return true if the combination of arithmetic operator and underlying
6445 /// SCEV constant value is an identity function.
6446 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6447 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: S)) {
6448 if (C->getAPInt().getSignificantBits() > 64)
6449 return false;
6450 int64_t I = C->getAPInt().getSExtValue();
6451 switch (Op) {
6452 case llvm::dwarf::DW_OP_plus:
6453 case llvm::dwarf::DW_OP_minus:
6454 return I == 0;
6455 case llvm::dwarf::DW_OP_mul:
6456 case llvm::dwarf::DW_OP_div:
6457 return I == 1;
6458 }
6459 }
6460 return false;
6461 }
6462
6463 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6464 /// builder's expression stack. The stack should already contain an
6465 /// expression for the iteration count, so that it can be multiplied by
6466 /// the stride and added to the start.
6467 /// Components of the expression are omitted if they are an identity function.
6468 /// Chain (non-affine) SCEVs are not supported.
6469 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6470 assert(SAR.isAffine() && "Expected affine SCEV");
6471 const SCEV *Start = SAR.getStart();
6472 const SCEV *Stride = SAR.getStepRecurrence(SE);
6473
6474 // Skip pushing arithmetic noops.
6475 if (!isIdentityFunction(Op: llvm::dwarf::DW_OP_mul, S: Stride)) {
6476 if (!pushSCEV(S: Stride))
6477 return false;
6478 pushOperator(Op: llvm::dwarf::DW_OP_mul);
6479 }
6480 if (!isIdentityFunction(Op: llvm::dwarf::DW_OP_plus, S: Start)) {
6481 if (!pushSCEV(S: Start))
6482 return false;
6483 pushOperator(Op: llvm::dwarf::DW_OP_plus);
6484 }
6485 return true;
6486 }
6487
6488 /// Create an expression that is an offset from a value (usually the IV).
6489 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6490 pushLocation(V: OffsetValue);
6491 DIExpression::appendOffset(Ops&: Expr, Offset);
6492 LLVM_DEBUG(
6493 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6494 << std::to_string(Offset) << "\n");
6495 }
6496
6497 /// Combine a translation of the SCEV and the IV to create an expression that
6498 /// recovers a location's value.
6499 /// returns true if an expression was created.
6500 bool createIterCountExpr(const SCEV *S,
6501 const SCEVDbgValueBuilder &IterationCount,
6502 ScalarEvolution &SE) {
6503 // SCEVs for SSA values are most frquently of the form
6504 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6505 // This is because %a is a PHI node that is not the IV. However, these
6506 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6507 // so its not expected this point will be reached.
6508 if (!isa<SCEVAddRecExpr>(Val: S))
6509 return false;
6510
6511 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6512 << '\n');
6513
6514 const auto *Rec = cast<SCEVAddRecExpr>(Val: S);
6515 if (!Rec->isAffine())
6516 return false;
6517
6518 if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6519 return false;
6520
6521 // Initialise a new builder with the iteration count expression. In
6522 // combination with the value's SCEV this enables recovery.
6523 clone(Base: IterationCount);
6524 if (!SCEVToValueExpr(SAR: *Rec, SE))
6525 return false;
6526
6527 return true;
6528 }
6529
6530 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6531 /// builder's expression stack. The stack should already contain an
6532 /// expression for the iteration count, so that it can be multiplied by
6533 /// the stride and added to the start.
6534 /// Components of the expression are omitted if they are an identity function.
6535 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6536 ScalarEvolution &SE) {
6537 assert(SAR.isAffine() && "Expected affine SCEV");
6538 const SCEV *Start = SAR.getStart();
6539 const SCEV *Stride = SAR.getStepRecurrence(SE);
6540
6541 // Skip pushing arithmetic noops.
6542 if (!isIdentityFunction(Op: llvm::dwarf::DW_OP_minus, S: Start)) {
6543 if (!pushSCEV(S: Start))
6544 return false;
6545 pushOperator(Op: llvm::dwarf::DW_OP_minus);
6546 }
6547 if (!isIdentityFunction(Op: llvm::dwarf::DW_OP_div, S: Stride)) {
6548 if (!pushSCEV(S: Stride))
6549 return false;
6550 pushOperator(Op: llvm::dwarf::DW_OP_div);
6551 }
6552 return true;
6553 }
6554
6555 // Append the current expression and locations to a location list and an
6556 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6557 // the locations already present in the destination list.
6558 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6559 SmallVectorImpl<Value *> &DestLocations) {
6560 assert(!DestLocations.empty() &&
6561 "Expected the locations vector to contain the IV");
6562 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6563 // modified to account for the locations already in the destination vector.
6564 // All builders contain the IV as the first location op.
6565 assert(!LocationOps.empty() &&
6566 "Expected the location ops to contain the IV.");
6567 // DestIndexMap[n] contains the index in DestLocations for the nth
6568 // location in this SCEVDbgValueBuilder.
6569 SmallVector<uint64_t, 2> DestIndexMap;
6570 for (const auto &Op : LocationOps) {
6571 auto It = find(Range&: DestLocations, Val: Op);
6572 if (It != DestLocations.end()) {
6573 // Location already exists in DestLocations, reuse existing ArgIndex.
6574 DestIndexMap.push_back(Elt: std::distance(first: DestLocations.begin(), last: It));
6575 continue;
6576 }
6577 // Location is not in DestLocations, add it.
6578 DestIndexMap.push_back(Elt: DestLocations.size());
6579 DestLocations.push_back(Elt: Op);
6580 }
6581
6582 for (const auto &Op : expr_ops()) {
6583 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6584 Op.appendToVector(V&: DestExpr);
6585 continue;
6586 }
6587
6588 DestExpr.push_back(Elt: dwarf::DW_OP_LLVM_arg);
6589 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6590 // DestIndexMap[n] contains its new index in DestLocations.
6591 uint64_t NewIndex = DestIndexMap[Op.getArg(I: 0)];
6592 DestExpr.push_back(Elt: NewIndex);
6593 }
6594 }
6595};
6596
6597/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6598/// and DIExpression.
6599struct DVIRecoveryRec {
6600 DVIRecoveryRec(DbgValueInst *DbgValue)
6601 : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6602 HadLocationArgList(false) {}
6603 DVIRecoveryRec(DbgVariableRecord *DVR)
6604 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6605
6606 PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
6607 DIExpression *Expr;
6608 bool HadLocationArgList;
6609 SmallVector<WeakVH, 2> LocationOps;
6610 SmallVector<const llvm::SCEV *, 2> SCEVs;
6611 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6612
6613 void clear() {
6614 for (auto &RE : RecoveryExprs)
6615 RE.reset();
6616 RecoveryExprs.clear();
6617 }
6618
6619 ~DVIRecoveryRec() { clear(); }
6620};
6621} // namespace
6622
6623/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6624/// This helps in determining if a DIArglist is necessary or can be omitted from
6625/// the dbg.value.
6626static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6627 auto expr_ops = ToDwarfOpIter(Expr);
6628 unsigned Count = 0;
6629 for (auto Op : expr_ops)
6630 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6631 Count++;
6632 return Count;
6633}
6634
6635/// Overwrites DVI with the location and Ops as the DIExpression. This will
6636/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6637/// because a DIArglist is not created for the first argument of the dbg.value.
6638template <typename T>
6639static void updateDVIWithLocation(T &DbgVal, Value *Location,
6640 SmallVectorImpl<uint64_t> &Ops) {
6641 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6642 "contain any DW_OP_llvm_arg operands.");
6643 DbgVal.setRawLocation(ValueAsMetadata::get(V: Location));
6644 DbgVal.setExpression(DIExpression::get(Context&: DbgVal.getContext(), Elements: Ops));
6645 DbgVal.setExpression(DIExpression::get(Context&: DbgVal.getContext(), Elements: Ops));
6646}
6647
6648/// Overwrite DVI with locations placed into a DIArglist.
6649template <typename T>
6650static void updateDVIWithLocations(T &DbgVal,
6651 SmallVectorImpl<Value *> &Locations,
6652 SmallVectorImpl<uint64_t> &Ops) {
6653 assert(numLLVMArgOps(Ops) != 0 &&
6654 "Expected expression that references DIArglist locations using "
6655 "DW_OP_llvm_arg operands.");
6656 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6657 for (Value *V : Locations)
6658 MetadataLocs.push_back(Elt: ValueAsMetadata::get(V));
6659 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6660 DbgVal.setRawLocation(llvm::DIArgList::get(Context&: DbgVal.getContext(), Args: ValArrayRef));
6661 DbgVal.setExpression(DIExpression::get(Context&: DbgVal.getContext(), Elements: Ops));
6662}
6663
6664/// Write the new expression and new location ops for the dbg.value. If possible
6665/// reduce the szie of the dbg.value intrinsic by omitting DIArglist. This
6666/// can be omitted if:
6667/// 1. There is only a single location, refenced by a single DW_OP_llvm_arg.
6668/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6669static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6670 SmallVectorImpl<Value *> &NewLocationOps,
6671 SmallVectorImpl<uint64_t> &NewExpr) {
6672 auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6673 unsigned NumLLVMArgs = numLLVMArgOps(Expr&: NewExpr);
6674 if (NumLLVMArgs == 0) {
6675 // Location assumed to be on the stack.
6676 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6677 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6678 // There is only a single DW_OP_llvm_arg at the start of the expression,
6679 // so it can be omitted along with DIArglist.
6680 assert(NewExpr[1] == 0 &&
6681 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6682 llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(RangeOrContainer&: NewExpr, N: 2));
6683 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6684 } else {
6685 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6686 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6687 }
6688
6689 // If the DIExpression was previously empty then add the stack terminator.
6690 // Non-empty expressions have only had elements inserted into them and so
6691 // the terminator should already be present e.g. stack_value or fragment.
6692 DIExpression *SalvageExpr = DbgVal->getExpression();
6693 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6694 SalvageExpr =
6695 DIExpression::append(Expr: SalvageExpr, Ops: {dwarf::DW_OP_stack_value});
6696 DbgVal->setExpression(SalvageExpr);
6697 }
6698 };
6699 if (isa<DbgValueInst *>(Val: DVIRec.DbgRef))
6700 UpdateDbgValueInstImpl(cast<DbgValueInst *>(Val&: DVIRec.DbgRef));
6701 else
6702 UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(Val&: DVIRec.DbgRef));
6703}
6704
6705/// Cached location ops may be erased during LSR, in which case a poison is
6706/// required when restoring from the cache. The type of that location is no
6707/// longer available, so just use int8. The poison will be replaced by one or
6708/// more locations later when a SCEVDbgValueBuilder selects alternative
6709/// locations to use for the salvage.
6710static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6711 return (VH) ? VH : PoisonValue::get(T: llvm::Type::getInt8Ty(C));
6712}
6713
6714/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6715static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6716 auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6717 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6718 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6719 assert(DVIRec.Expr && "Expected an expression");
6720 DbgVal->setExpression(DVIRec.Expr);
6721
6722 // Even a single location-op may be inside a DIArgList and referenced with
6723 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6724 if (!DVIRec.HadLocationArgList) {
6725 assert(DVIRec.LocationOps.size() == 1 &&
6726 "Unexpected number of location ops.");
6727 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6728 // this case was not present before, so force the location back to a
6729 // single uncontained Value.
6730 Value *CachedValue =
6731 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6732 DbgVal->setRawLocation(ValueAsMetadata::get(V: CachedValue));
6733 } else {
6734 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6735 for (WeakVH VH : DVIRec.LocationOps) {
6736 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6737 MetadataLocs.push_back(Elt: ValueAsMetadata::get(V: CachedValue));
6738 }
6739 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6740 DbgVal->setRawLocation(
6741 llvm::DIArgList::get(Context&: DbgVal->getContext(), Args: ValArrayRef));
6742 }
6743 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6744 };
6745 if (isa<DbgValueInst *>(Val: DVIRec.DbgRef))
6746 RestorePreTransformStateImpl(cast<DbgValueInst *>(Val&: DVIRec.DbgRef));
6747 else
6748 RestorePreTransformStateImpl(cast<DbgVariableRecord *>(Val&: DVIRec.DbgRef));
6749}
6750
6751static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6752 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6753 const SCEV *SCEVInductionVar,
6754 SCEVDbgValueBuilder IterCountExpr) {
6755
6756 if (isa<DbgValueInst *>(Val: DVIRec.DbgRef)
6757 ? !cast<DbgValueInst *>(Val&: DVIRec.DbgRef)->isKillLocation()
6758 : !cast<DbgVariableRecord *>(Val&: DVIRec.DbgRef)->isKillLocation())
6759 return false;
6760
6761 // LSR may have caused several changes to the dbg.value in the failed salvage
6762 // attempt. So restore the DIExpression, the location ops and also the
6763 // location ops format, which is always DIArglist for multiple ops, but only
6764 // sometimes for a single op.
6765 restorePreTransformState(DVIRec);
6766
6767 // LocationOpIndexMap[i] will store the post-LSR location index of
6768 // the non-optimised out location at pre-LSR index i.
6769 SmallVector<int64_t, 2> LocationOpIndexMap;
6770 LocationOpIndexMap.assign(NumElts: DVIRec.LocationOps.size(), Elt: -1);
6771 SmallVector<Value *, 2> NewLocationOps;
6772 NewLocationOps.push_back(Elt: LSRInductionVar);
6773
6774 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6775 WeakVH VH = DVIRec.LocationOps[i];
6776 // Place the locations not optimised out in the list first, avoiding
6777 // inserts later. The map is used to update the DIExpression's
6778 // DW_OP_LLVM_arg arguments as the expression is updated.
6779 if (VH && !isa<UndefValue>(Val: VH)) {
6780 NewLocationOps.push_back(Elt: VH);
6781 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6782 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6783 << " now at index " << LocationOpIndexMap[i] << "\n");
6784 continue;
6785 }
6786
6787 // It's possible that a value referred to in the SCEV may have been
6788 // optimised out by LSR.
6789 if (SE.containsErasedValue(S: DVIRec.SCEVs[i]) ||
6790 SE.containsUndefs(S: DVIRec.SCEVs[i])) {
6791 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6792 << " refers to a location that is now undef or erased. "
6793 "Salvage abandoned.\n");
6794 return false;
6795 }
6796
6797 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6798 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6799
6800 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6801 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6802
6803 // Create an offset-based salvage expression if possible, as it requires
6804 // less DWARF ops than an iteration count-based expression.
6805 if (std::optional<APInt> Offset =
6806 SE.computeConstantDifference(LHS: DVIRec.SCEVs[i], RHS: SCEVInductionVar)) {
6807 if (Offset->getSignificantBits() <= 64)
6808 SalvageExpr->createOffsetExpr(Offset: Offset->getSExtValue(), OffsetValue: LSRInductionVar);
6809 else
6810 return false;
6811 } else if (!SalvageExpr->createIterCountExpr(S: DVIRec.SCEVs[i], IterationCount: IterCountExpr,
6812 SE))
6813 return false;
6814 }
6815
6816 // Merge the DbgValueBuilder generated expressions and the original
6817 // DIExpression, place the result into an new vector.
6818 SmallVector<uint64_t, 3> NewExpr;
6819 if (DVIRec.Expr->getNumElements() == 0) {
6820 assert(DVIRec.RecoveryExprs.size() == 1 &&
6821 "Expected only a single recovery expression for an empty "
6822 "DIExpression.");
6823 assert(DVIRec.RecoveryExprs[0] &&
6824 "Expected a SCEVDbgSalvageBuilder for location 0");
6825 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6826 B->appendToVectors(DestExpr&: NewExpr, DestLocations&: NewLocationOps);
6827 }
6828 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6829 // Most Ops needn't be updated.
6830 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6831 Op.appendToVector(V&: NewExpr);
6832 continue;
6833 }
6834
6835 uint64_t LocationArgIndex = Op.getArg(I: 0);
6836 SCEVDbgValueBuilder *DbgBuilder =
6837 DVIRec.RecoveryExprs[LocationArgIndex].get();
6838 // The location doesn't have s SCEVDbgValueBuilder, so LSR did not
6839 // optimise it away. So just translate the argument to the updated
6840 // location index.
6841 if (!DbgBuilder) {
6842 NewExpr.push_back(Elt: dwarf::DW_OP_LLVM_arg);
6843 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6844 "Expected a positive index for the location-op position.");
6845 NewExpr.push_back(Elt: LocationOpIndexMap[Op.getArg(I: 0)]);
6846 continue;
6847 }
6848 // The location has a recovery expression.
6849 DbgBuilder->appendToVectors(DestExpr&: NewExpr, DestLocations&: NewLocationOps);
6850 }
6851
6852 UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6853 if (isa<DbgValueInst *>(Val: DVIRec.DbgRef))
6854 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6855 << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6856 else
6857 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6858 << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6859 return true;
6860}
6861
6862/// Obtain an expression for the iteration count, then attempt to salvage the
6863/// dbg.value intrinsics.
6864static void DbgRewriteSalvageableDVIs(
6865 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6866 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6867 if (DVIToUpdate.empty())
6868 return;
6869
6870 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(V: LSRInductionVar);
6871 assert(SCEVInductionVar &&
6872 "Anticipated a SCEV for the post-LSR induction variable");
6873
6874 if (const SCEVAddRecExpr *IVAddRec =
6875 dyn_cast<SCEVAddRecExpr>(Val: SCEVInductionVar)) {
6876 if (!IVAddRec->isAffine())
6877 return;
6878
6879 // Prevent translation using excessive resources.
6880 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6881 return;
6882
6883 // The iteration count is required to recover location values.
6884 SCEVDbgValueBuilder IterCountExpr;
6885 IterCountExpr.pushLocation(V: LSRInductionVar);
6886 if (!IterCountExpr.SCEVToIterCountExpr(SAR: *IVAddRec, SE))
6887 return;
6888
6889 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6890 << '\n');
6891
6892 for (auto &DVIRec : DVIToUpdate) {
6893 SalvageDVI(L, SE, LSRInductionVar, DVIRec&: *DVIRec, SCEVInductionVar,
6894 IterCountExpr);
6895 }
6896 }
6897}
6898
6899/// Identify and cache salvageable DVI locations and expressions along with the
6900/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6901/// cacheing and salvaging.
6902static void DbgGatherSalvagableDVI(
6903 Loop *L, ScalarEvolution &SE,
6904 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6905 SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6906 for (const auto &B : L->getBlocks()) {
6907 for (auto &I : *B) {
6908 auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6909 // Ensure that if any location op is undef that the dbg.vlue is not
6910 // cached.
6911 if (DbgVal->isKillLocation())
6912 return false;
6913
6914 // Check that the location op SCEVs are suitable for translation to
6915 // DIExpression.
6916 const auto &HasTranslatableLocationOps =
6917 [&](const auto *DbgValToTranslate) -> bool {
6918 for (const auto LocOp : DbgValToTranslate->location_ops()) {
6919 if (!LocOp)
6920 return false;
6921
6922 if (!SE.isSCEVable(Ty: LocOp->getType()))
6923 return false;
6924
6925 const SCEV *S = SE.getSCEV(V: LocOp);
6926 if (SE.containsUndefs(S))
6927 return false;
6928 }
6929 return true;
6930 };
6931
6932 if (!HasTranslatableLocationOps(DbgVal))
6933 return false;
6934
6935 std::unique_ptr<DVIRecoveryRec> NewRec =
6936 std::make_unique<DVIRecoveryRec>(DbgVal);
6937 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6938 // it. Pre-allocating a vector will enable quick lookups of the builder
6939 // later during the salvage.
6940 NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
6941 for (const auto LocOp : DbgVal->location_ops()) {
6942 NewRec->SCEVs.push_back(Elt: SE.getSCEV(V: LocOp));
6943 NewRec->LocationOps.push_back(LocOp);
6944 NewRec->HadLocationArgList = DbgVal->hasArgList();
6945 }
6946 SalvageableDVISCEVs.push_back(Elt: std::move(NewRec));
6947 return true;
6948 };
6949 for (DbgVariableRecord &DVR : filterDbgVars(R: I.getDbgRecordRange())) {
6950 if (DVR.isDbgValue() || DVR.isDbgAssign())
6951 ProcessDbgValue(&DVR);
6952 }
6953 auto DVI = dyn_cast<DbgValueInst>(Val: &I);
6954 if (!DVI)
6955 continue;
6956 if (ProcessDbgValue(DVI))
6957 DVIHandles.insert(V: DVI);
6958 }
6959 }
6960}
6961
6962/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6963/// any PHi from the loop header is usable, but may have less chance of
6964/// surviving subsequent transforms.
6965static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
6966 const LSRInstance &LSR) {
6967
6968 auto IsSuitableIV = [&](PHINode *P) {
6969 if (!SE.isSCEVable(Ty: P->getType()))
6970 return false;
6971 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(Val: SE.getSCEV(V: P)))
6972 return Rec->isAffine() && !SE.containsUndefs(S: SE.getSCEV(V: P));
6973 return false;
6974 };
6975
6976 // For now, just pick the first IV that was generated and inserted by
6977 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
6978 // by subsequent transforms.
6979 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
6980 if (!IV)
6981 continue;
6982
6983 // There should only be PHI node IVs.
6984 PHINode *P = cast<PHINode>(Val: &*IV);
6985
6986 if (IsSuitableIV(P))
6987 return P;
6988 }
6989
6990 for (PHINode &P : L.getHeader()->phis()) {
6991 if (IsSuitableIV(&P))
6992 return &P;
6993 }
6994 return nullptr;
6995}
6996
6997static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6998 DominatorTree &DT, LoopInfo &LI,
6999 const TargetTransformInfo &TTI,
7000 AssumptionCache &AC, TargetLibraryInfo &TLI,
7001 MemorySSA *MSSA) {
7002
7003 // Debug preservation - before we start removing anything identify which DVI
7004 // meet the salvageable criteria and store their DIExpression and SCEVs.
7005 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7006 SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
7007 DbgGatherSalvagableDVI(L, SE, SalvageableDVISCEVs&: SalvageableDVIRecords, DVIHandles);
7008
7009 bool Changed = false;
7010 std::unique_ptr<MemorySSAUpdater> MSSAU;
7011 if (MSSA)
7012 MSSAU = std::make_unique<MemorySSAUpdater>(args&: MSSA);
7013
7014 // Run the main LSR transformation.
7015 const LSRInstance &Reducer =
7016 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7017 Changed |= Reducer.getChanged();
7018
7019 // Remove any extra phis created by processing inner loops.
7020 Changed |= DeleteDeadPHIs(BB: L->getHeader(), TLI: &TLI, MSSAU: MSSAU.get());
7021 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7022 SmallVector<WeakTrackingVH, 16> DeadInsts;
7023 const DataLayout &DL = L->getHeader()->getDataLayout();
7024 SCEVExpander Rewriter(SE, DL, "lsr", false);
7025#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7026 Rewriter.setDebugType(DEBUG_TYPE);
7027#endif
7028 unsigned numFolded = Rewriter.replaceCongruentIVs(L, DT: &DT, DeadInsts, TTI: &TTI);
7029 Rewriter.clear();
7030 if (numFolded) {
7031 Changed = true;
7032 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI: &TLI,
7033 MSSAU: MSSAU.get());
7034 DeleteDeadPHIs(BB: L->getHeader(), TLI: &TLI, MSSAU: MSSAU.get());
7035 }
7036 }
7037 // LSR may at times remove all uses of an induction variable from a loop.
7038 // The only remaining use is the PHI in the exit block.
7039 // When this is the case, if the exit value of the IV can be calculated using
7040 // SCEV, we can replace the exit block PHI with the final value of the IV and
7041 // skip the updates in each loop iteration.
7042 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7043 SmallVector<WeakTrackingVH, 16> DeadInsts;
7044 const DataLayout &DL = L->getHeader()->getDataLayout();
7045 SCEVExpander Rewriter(SE, DL, "lsr", true);
7046 int Rewrites = rewriteLoopExitValues(L, LI: &LI, TLI: &TLI, SE: &SE, TTI: &TTI, Rewriter, DT: &DT,
7047 ReplaceExitValue: UnusedIndVarInLoop, DeadInsts);
7048 Rewriter.clear();
7049 if (Rewrites) {
7050 Changed = true;
7051 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI: &TLI,
7052 MSSAU: MSSAU.get());
7053 DeleteDeadPHIs(BB: L->getHeader(), TLI: &TLI, MSSAU: MSSAU.get());
7054 }
7055 }
7056
7057 if (SalvageableDVIRecords.empty())
7058 return Changed;
7059
7060 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7061 // expressions composed using the derived iteration count.
7062 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7063 for (const auto &L : LI) {
7064 if (llvm::PHINode *IV = GetInductionVariable(L: *L, SE, LSR: Reducer))
7065 DbgRewriteSalvageableDVIs(L, SE, LSRInductionVar: IV, DVIToUpdate&: SalvageableDVIRecords);
7066 else {
7067 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7068 "could not be identified.\n");
7069 }
7070 }
7071
7072 for (auto &Rec : SalvageableDVIRecords)
7073 Rec->clear();
7074 SalvageableDVIRecords.clear();
7075 DVIHandles.clear();
7076 return Changed;
7077}
7078
7079bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7080 if (skipLoop(L))
7081 return false;
7082
7083 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7084 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7085 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7086 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7087 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7088 F: *L->getHeader()->getParent());
7089 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7090 F&: *L->getHeader()->getParent());
7091 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7092 F: *L->getHeader()->getParent());
7093 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7094 MemorySSA *MSSA = nullptr;
7095 if (MSSAAnalysis)
7096 MSSA = &MSSAAnalysis->getMSSA();
7097 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7098}
7099
7100PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7101 LoopStandardAnalysisResults &AR,
7102 LPMUpdater &) {
7103 if (!ReduceLoopStrength(L: &L, IU&: AM.getResult<IVUsersAnalysis>(IR&: L, ExtraArgs&: AR), SE&: AR.SE,
7104 DT&: AR.DT, LI&: AR.LI, TTI: AR.TTI, AC&: AR.AC, TLI&: AR.TLI, MSSA: AR.MSSA))
7105 return PreservedAnalyses::all();
7106
7107 auto PA = getLoopPassPreservedAnalyses();
7108 if (AR.MSSA)
7109 PA.preserve<MemorySSAAnalysis>();
7110 return PA;
7111}
7112
// Static pass identifier of the legacy LoopStrengthReduce pass.
char LoopStrengthReduce::ID = 0;

// Register the legacy pass under the name "loop-reduce" with the description
// "Loop Strength Reduction", and declare the passes it depends on.
INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
                      "Loop Strength Reduction", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
                    "Loop Strength Reduction", false, false)
7125
7126Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
7127