1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
59
60#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
65#include "llvm/ADT/PostOrderIterator.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
68#include "llvm/ADT/SmallPtrSet.h"
69#include "llvm/ADT/SmallVector.h"
70#include "llvm/ADT/Statistic.h"
71#include "llvm/ADT/iterator_range.h"
72#include "llvm/Analysis/AliasAnalysis.h"
73#include "llvm/Analysis/AssumptionCache.h"
74#include "llvm/Analysis/MemoryLocation.h"
75#include "llvm/Analysis/ScalarEvolution.h"
76#include "llvm/Analysis/TargetTransformInfo.h"
77#include "llvm/Analysis/ValueTracking.h"
78#include "llvm/Analysis/VectorUtils.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
81#include "llvm/IR/ConstantRange.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
84#include "llvm/IR/DerivedTypes.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
87#include "llvm/IR/GetElementPtrTypeIterator.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
91#include "llvm/IR/Instructions.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
96#include "llvm/InitializePasses.h"
97#include "llvm/Pass.h"
98#include "llvm/Support/Alignment.h"
99#include "llvm/Support/Casting.h"
100#include "llvm/Support/Debug.h"
101#include "llvm/Support/KnownBits.h"
102#include "llvm/Support/MathExtras.h"
103#include "llvm/Support/ModRef.h"
104#include "llvm/Support/raw_ostream.h"
105#include "llvm/Transforms/Utils/Local.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from the this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, Comp: [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, Comp: [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
184 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
189Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190 SmallVector<Value *, 8> Values;
191 for (const ChainElem &E : C)
192 Values.emplace_back(Args: E.Inst);
193 return propagateMetadata(I, VL: Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(Val: I);
198 return LI != nullptr && LI->hasMetadata(KindID: LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205 SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.emplace_back(Args&: I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(Val: IW->getOperand(i: Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(Other: I)) {
224 InstructionsToMove.insert(Ptr: IM);
225 Worklist.emplace_back(Args&: IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(Ptr: IM))
234 continue;
235 IM->moveBefore(InsertPos: I->getIterator());
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 // We could erase instrs right after vectorizing them, but that can mess up
250 // our BB iterators, and also can make the equivalence class keys point to
251 // freed memory. This is fixable, but it's simpler just to wait until we're
252 // done with the BB and erase all at once.
253 SmallVector<Instruction *, 128> ToErase;
254
255public:
256 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
257 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
258 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
259 DL(F.getDataLayout()), Builder(SE.getContext()) {}
260
261 bool run();
262
263private:
264 static const unsigned MaxDepth = 3;
265
266 /// Runs the vectorizer on a "pseudo basic block", which is a range of
267 /// instructions [Begin, End) within one BB all of which have
268 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
269 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
270
271 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
272 /// in the same BB with the same value for getUnderlyingObject() etc.
273 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
274 ArrayRef<Instruction *> EqClass);
275
276 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
277 /// where all instructions access a known, constant offset from the first
278 /// instruction.
279 bool runOnChain(Chain &C);
280
281 /// Splits the chain into subchains of instructions which read/write a
282 /// contiguous block of memory. Discards any length-1 subchains (because
283 /// there's nothing to vectorize in there).
284 std::vector<Chain> splitChainByContiguity(Chain &C);
285
286 /// Splits the chain into subchains where it's safe to hoist loads up to the
287 /// beginning of the sub-chain and it's safe to sink loads up to the end of
288 /// the sub-chain. Discards any length-1 subchains.
289 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
290
291 /// Splits the chain into subchains that make legal, aligned accesses.
292 /// Discards any length-1 subchains.
293 std::vector<Chain> splitChainByAlignment(Chain &C);
294
295 /// Converts the instrs in the chain into a single vectorized load or store.
296 /// Adds the old scalar loads/stores to ToErase.
297 bool vectorizeChain(Chain &C);
298
299 /// Tries to compute the offset in bytes PtrB - PtrA.
300 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
301 Instruction *ContextInst,
302 unsigned Depth = 0);
303 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
304 Instruction *ContextInst,
305 unsigned Depth);
306 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth);
309
310 /// Gets the element type of the vector that the chain will load or store.
311 /// This is nontrivial because the chain may contain elements of different
312 /// types; e.g. it's legal to have a chain that contains both i32 and float.
313 Type *getChainElemTy(const Chain &C);
314
315 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
316 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
317 /// instructions.
318 ///
319 /// The map ChainElemOffsets must contain all of the elements in
320 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
321 /// address. It's ok if it contains additional entries.
322 template <bool IsLoadChain>
323 bool isSafeToMove(
324 Instruction *ChainElem, Instruction *ChainBegin,
325 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
326
327 /// Merges the equivalence classes if they have underlying objects that differ
328 /// by one level of indirection (i.e., one is a getelementptr and the other is
329 /// the base pointer in that getelementptr).
330 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
331
332 /// Collects loads and stores grouped by "equivalence class", where:
333 /// - all elements in an eq class are a load or all are a store,
334 /// - they all load/store the same element size (it's OK to have e.g. i8 and
335 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
336 /// - they all have the same value for getUnderlyingObject().
337 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
338 BasicBlock::iterator End);
339
340 /// Partitions Instrs into "chains" where every instruction has a known
341 /// constant offset from the first instr in the chain.
342 ///
343 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
344 /// in the chain is the leader, and an instr touches distance 0 from itself.
345 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
346};
347
348class LoadStoreVectorizerLegacyPass : public FunctionPass {
349public:
350 static char ID;
351
352 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
353 initializeLoadStoreVectorizerLegacyPassPass(
354 *PassRegistry::getPassRegistry());
355 }
356
357 bool runOnFunction(Function &F) override;
358
359 StringRef getPassName() const override {
360 return "GPU Load and Store Vectorizer";
361 }
362
363 void getAnalysisUsage(AnalysisUsage &AU) const override {
364 AU.addRequired<AAResultsWrapperPass>();
365 AU.addRequired<AssumptionCacheTracker>();
366 AU.addRequired<ScalarEvolutionWrapperPass>();
367 AU.addRequired<DominatorTreeWrapperPass>();
368 AU.addRequired<TargetTransformInfoWrapperPass>();
369 AU.setPreservesCFG();
370 }
371};
372
373} // end anonymous namespace
374
375char LoadStoreVectorizerLegacyPass::ID = 0;
376
377INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
378 "Vectorize load and Store instructions", false, false)
379INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
380INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
381INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
382INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
383INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
384INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
385INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
386 "Vectorize load and store instructions", false, false)
387
388Pass *llvm::createLoadStoreVectorizerPass() {
389 return new LoadStoreVectorizerLegacyPass();
390}
391
392bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
393 // Don't vectorize when the attribute NoImplicitFloat is used.
394 if (skipFunction(F) || F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
395 return false;
396
397 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
398 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
399 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
400 TargetTransformInfo &TTI =
401 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
402
403 AssumptionCache &AC =
404 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
405
406 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
407}
408
409PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
410 FunctionAnalysisManager &AM) {
411 // Don't vectorize when the attribute NoImplicitFloat is used.
412 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
413 return PreservedAnalyses::all();
414
415 AliasAnalysis &AA = AM.getResult<AAManager>(IR&: F);
416 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
417 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
418 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
419 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
420
421 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
422 PreservedAnalyses PA;
423 PA.preserveSet<CFGAnalyses>();
424 return Changed ? PA : PreservedAnalyses::all();
425}
426
427bool Vectorizer::run() {
428 bool Changed = false;
429 // Break up the BB if there are any instrs which aren't guaranteed to transfer
430 // execution to their successor.
431 //
432 // Consider, for example:
433 //
434 // def assert_arr_len(int n) { if (n < 2) exit(); }
435 //
436 // load arr[0]
437 // call assert_array_len(arr.length)
438 // load arr[1]
439 //
440 // Even though assert_arr_len does not read or write any memory, we can't
441 // speculate the second load before the call. More info at
442 // https://github.com/llvm/llvm-project/issues/52950.
443 for (BasicBlock *BB : post_order(G: &F)) {
444 // BB must at least have a terminator.
445 assert(!BB->empty());
446
447 SmallVector<BasicBlock::iterator, 8> Barriers;
448 Barriers.emplace_back(Args: BB->begin());
449 for (Instruction &I : *BB)
450 if (!isGuaranteedToTransferExecutionToSuccessor(I: &I))
451 Barriers.emplace_back(Args: I.getIterator());
452 Barriers.emplace_back(Args: BB->end());
453
454 for (auto It = Barriers.begin(), End = std::prev(x: Barriers.end()); It != End;
455 ++It)
456 Changed |= runOnPseudoBB(Begin: *It, End: *std::next(x: It));
457
458 for (Instruction *I : ToErase) {
459 auto *PtrOperand = getLoadStorePointerOperand(V: I);
460 if (I->use_empty())
461 I->eraseFromParent();
462 RecursivelyDeleteTriviallyDeadInstructions(V: PtrOperand);
463 }
464 ToErase.clear();
465 }
466
467 return Changed;
468}
469
470bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
471 BasicBlock::iterator End) {
472 LLVM_DEBUG({
473 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
474 if (End != Begin->getParent()->end())
475 dbgs() << *End;
476 else
477 dbgs() << "<BB end>";
478 dbgs() << ")\n";
479 });
480
481 bool Changed = false;
482 for (const auto &[EqClassKey, EqClass] :
483 collectEquivalenceClasses(Begin, End))
484 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
485
486 return Changed;
487}
488
489bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
490 ArrayRef<Instruction *> EqClass) {
491 bool Changed = false;
492
493 LLVM_DEBUG({
494 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
495 << " keyed on " << EqClassKey << ":\n";
496 for (Instruction *I : EqClass)
497 dbgs() << " " << *I << "\n";
498 });
499
500 std::vector<Chain> Chains = gatherChains(Instrs: EqClass);
501 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
502 << " nontrivial chains.\n";);
503 for (Chain &C : Chains)
504 Changed |= runOnChain(C);
505 return Changed;
506}
507
508bool Vectorizer::runOnChain(Chain &C) {
509 LLVM_DEBUG({
510 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
511 dumpChain(C);
512 });
513
514 // Split up the chain into increasingly smaller chains, until we can finally
515 // vectorize the chains.
516 //
517 // (Don't be scared by the depth of the loop nest here. These operations are
518 // all at worst O(n lg n) in the number of instructions, and splitting chains
519 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
520 bool Changed = false;
521 for (auto &C : splitChainByMayAliasInstrs(C))
522 for (auto &C : splitChainByContiguity(C))
523 for (auto &C : splitChainByAlignment(C))
524 Changed |= vectorizeChain(C);
525 return Changed;
526}
527
528std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
529 if (C.empty())
530 return {};
531
532 sortChainInBBOrder(C);
533
534 LLVM_DEBUG({
535 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
536 dumpChain(C);
537 });
538
539 // We know that elements in the chain with nonverlapping offsets can't
540 // alias, but AA may not be smart enough to figure this out. Use a
541 // hashtable.
542 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
543 for (const auto &E : C)
544 ChainOffsets.insert(KV: {&*E.Inst, E.OffsetFromLeader});
545
546 // Loads get hoisted up to the first load in the chain. Stores get sunk
547 // down to the last store in the chain. Our algorithm for loads is:
548 //
549 // - Take the first element of the chain. This is the start of a new chain.
550 // - Take the next element of `Chain` and check for may-alias instructions
551 // up to the start of NewChain. If no may-alias instrs, add it to
552 // NewChain. Otherwise, start a new NewChain.
553 //
554 // For stores it's the same except in the reverse direction.
555 //
556 // We expect IsLoad to be an std::bool_constant.
557 auto Impl = [&](auto IsLoad) {
558 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
559 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
560 if constexpr (IsLoad())
561 return std::make_pair(x: C.begin(), y: C.end());
562 else
563 return std::make_pair(x: C.rbegin(), y: C.rend());
564 }(IsLoad);
565 assert(ChainBegin != ChainEnd);
566
567 std::vector<Chain> Chains;
568 SmallVector<ChainElem, 1> NewChain;
569 NewChain.emplace_back(*ChainBegin);
570 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
571 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
572 ChainOffsets)) {
573 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
574 << *ChainIt->Inst << " into " << *ChainBegin->Inst
575 << "\n");
576 NewChain.emplace_back(*ChainIt);
577 } else {
578 LLVM_DEBUG(
579 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
580 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
581 if (NewChain.size() > 1) {
582 LLVM_DEBUG({
583 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
584 dumpChain(NewChain);
585 });
586 Chains.emplace_back(args: std::move(NewChain));
587 }
588
589 // Start a new chain.
590 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
591 }
592 }
593 if (NewChain.size() > 1) {
594 LLVM_DEBUG({
595 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
596 dumpChain(NewChain);
597 });
598 Chains.emplace_back(args: std::move(NewChain));
599 }
600 return Chains;
601 };
602
603 if (isa<LoadInst>(Val: C[0].Inst))
604 return Impl(/*IsLoad=*/std::bool_constant<true>());
605
606 assert(isa<StoreInst>(C[0].Inst));
607 return Impl(/*IsLoad=*/std::bool_constant<false>());
608}
609
610std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
611 if (C.empty())
612 return {};
613
614 sortChainInOffsetOrder(C);
615
616 LLVM_DEBUG({
617 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
618 dumpChain(C);
619 });
620
621 std::vector<Chain> Ret;
622 Ret.push_back(x: {C.front()});
623
624 for (auto It = std::next(x: C.begin()), End = C.end(); It != End; ++It) {
625 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
626 auto &CurChain = Ret.back();
627 const ChainElem &Prev = CurChain.back();
628 unsigned SzBits = DL.getTypeSizeInBits(Ty: getLoadStoreType(I: &*Prev.Inst));
629 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
630 "collectEquivalenceClass");
631 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
632
633 // Add this instruction to the end of the current chain, or start a new one.
634 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
635 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
636 << (AreContiguous ? "" : "not ") << "contiguous: "
637 << *Prev.Inst << " (ends at offset " << PrevReadEnd
638 << ") -> " << *It->Inst << " (starts at offset "
639 << It->OffsetFromLeader << ")\n");
640 if (AreContiguous)
641 CurChain.push_back(Elt: *It);
642 else
643 Ret.push_back(x: {*It});
644 }
645
646 // Filter out length-1 chains, these are uninteresting.
647 llvm::erase_if(C&: Ret, P: [](const auto &Chain) { return Chain.size() <= 1; });
648 return Ret;
649}
650
651Type *Vectorizer::getChainElemTy(const Chain &C) {
652 assert(!C.empty());
653 // The rules are:
654 // - If there are any pointer types in the chain, use an integer type.
655 // - Prefer an integer type if it appears in the chain.
656 // - Otherwise, use the first type in the chain.
657 //
658 // The rule about pointer types is a simplification when we merge e.g. a load
659 // of a ptr and a double. There's no direct conversion from a ptr to a
660 // double; it requires a ptrtoint followed by a bitcast.
661 //
662 // It's unclear to me if the other rules have any practical effect, but we do
663 // it to match this pass's previous behavior.
664 if (any_of(Range: C, P: [](const ChainElem &E) {
665 return getLoadStoreType(I: E.Inst)->getScalarType()->isPointerTy();
666 })) {
667 return Type::getIntNTy(
668 C&: F.getContext(),
669 N: DL.getTypeSizeInBits(Ty: getLoadStoreType(I: C[0].Inst)->getScalarType()));
670 }
671
672 for (const ChainElem &E : C)
673 if (Type *T = getLoadStoreType(I: E.Inst)->getScalarType(); T->isIntegerTy())
674 return T;
675 return getLoadStoreType(I: C[0].Inst)->getScalarType();
676}
677
678std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
679 // We use a simple greedy algorithm.
680 // - Given a chain of length N, find all prefixes that
681 // (a) are not longer than the max register length, and
682 // (b) are a power of 2.
683 // - Starting from the longest prefix, try to create a vector of that length.
684 // - If one of them works, great. Repeat the algorithm on any remaining
685 // elements in the chain.
686 // - If none of them work, discard the first element and repeat on a chain
687 // of length N-1.
688 if (C.empty())
689 return {};
690
691 sortChainInOffsetOrder(C);
692
693 LLVM_DEBUG({
694 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
695 dumpChain(C);
696 });
697
698 bool IsLoadChain = isa<LoadInst>(Val: C[0].Inst);
699 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
700 unsigned ChainSizeBytes, VectorType *VecTy) {
701 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadSize: LoadStoreSize,
702 ChainSizeInBytes: ChainSizeBytes, VecTy)
703 : TTI.getStoreVectorFactor(VF, StoreSize: LoadStoreSize,
704 ChainSizeInBytes: ChainSizeBytes, VecTy);
705 };
706
707#ifndef NDEBUG
708 for (const auto &E : C) {
709 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
710 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
711 "Should have filtered out non-power-of-two elements in "
712 "collectEquivalenceClasses.");
713 }
714#endif
715
716 unsigned AS = getLoadStoreAddressSpace(I: C[0].Inst);
717 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AddrSpace: AS) / 8;
718
719 std::vector<Chain> Ret;
720 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
721 // Find candidate chains of size not greater than the largest vector reg.
722 // These chains are over the closed interval [CBegin, CEnd].
723 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
724 CandidateChains;
725 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
726 APInt Sz = C[CEnd].OffsetFromLeader +
727 DL.getTypeStoreSize(Ty: getLoadStoreType(I: C[CEnd].Inst)) -
728 C[CBegin].OffsetFromLeader;
729 if (Sz.sgt(RHS: VecRegBytes))
730 break;
731 CandidateChains.emplace_back(Args&: CEnd,
732 Args: static_cast<unsigned>(Sz.getLimitedValue()));
733 }
734
735 // Consider the longest chain first.
736 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
737 It != End; ++It) {
738 auto [CEnd, SizeBytes] = *It;
739 LLVM_DEBUG(
740 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
741 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
742
743 Type *VecElemTy = getChainElemTy(C);
744 // Note, VecElemTy is a power of 2, but might be less than one byte. For
745 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
746 // VecElemTy would be i4.
747 unsigned VecElemBits = DL.getTypeSizeInBits(Ty: VecElemTy);
748
749 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
750 assert((8 * SizeBytes) % VecElemBits == 0);
751 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
752 FixedVectorType *VecTy = FixedVectorType::get(ElementType: VecElemTy, NumElts: NumVecElems);
753 unsigned VF = 8 * VecRegBytes / VecElemBits;
754
755 // Check that TTI is happy with this vectorization factor.
756 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
757 VecElemBits * NumVecElems / 8, VecTy);
758 if (TargetVF != VF && TargetVF < NumVecElems) {
759 LLVM_DEBUG(
760 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
761 "because TargetVF="
762 << TargetVF << " != VF=" << VF
763 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
764 continue;
765 }
766
767 // Is a load/store with this alignment allowed by TTI and at least as fast
768 // as an unvectorized load/store?
769 //
770 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
771 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
772 &F = F](Align Alignment) {
773 if (Alignment.value() % SizeBytes == 0)
774 return true;
775 unsigned VectorizedSpeed = 0;
776 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
777 Context&: F.getContext(), BitWidth: SizeBytes * 8, AddressSpace: AS, Alignment, Fast: &VectorizedSpeed);
778 if (!AllowsMisaligned) {
779 LLVM_DEBUG(dbgs()
780 << "LSV: Access of " << SizeBytes << "B in addrspace "
781 << AS << " with alignment " << Alignment.value()
782 << " is misaligned, and therefore can't be vectorized.\n");
783 return false;
784 }
785
786 unsigned ElementwiseSpeed = 0;
787 (TTI).allowsMisalignedMemoryAccesses(Context&: (F).getContext(), BitWidth: VecElemBits, AddressSpace: AS,
788 Alignment, Fast: &ElementwiseSpeed);
789 if (VectorizedSpeed < ElementwiseSpeed) {
790 LLVM_DEBUG(dbgs()
791 << "LSV: Access of " << SizeBytes << "B in addrspace "
792 << AS << " with alignment " << Alignment.value()
793 << " has relative speed " << VectorizedSpeed
794 << ", which is lower than the elementwise speed of "
795 << ElementwiseSpeed
796 << ". Therefore this access won't be vectorized.\n");
797 return false;
798 }
799 return true;
800 };
801
802 // If we're loading/storing from an alloca, align it if possible.
803 //
804 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
805 // tells us this is beneficial. This feels a bit odd, but it matches
806 // existing tests. This isn't *so* bad, because at most we align to 4
807 // bytes (current value of StackAdjustedAlignment).
808 //
809 // FIXME: We will upgrade the alignment of the alloca even if it turns out
810 // we can't vectorize for some other reason.
811 Value *PtrOperand = getLoadStorePointerOperand(V: C[CBegin].Inst);
812 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
813 isa<AllocaInst>(Val: PtrOperand->stripPointerCasts());
814 Align Alignment = getLoadStoreAlignment(I: C[CBegin].Inst);
815 Align PrefAlign = Align(StackAdjustedAlignment);
816 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
817 IsAllowedAndFast(PrefAlign)) {
818 Align NewAlign = getOrEnforceKnownAlignment(
819 V: PtrOperand, PrefAlign, DL, CxtI: C[CBegin].Inst, AC: nullptr, DT: &DT);
820 if (NewAlign >= Alignment) {
821 LLVM_DEBUG(dbgs()
822 << "LSV: splitByChain upgrading alloca alignment from "
823 << Alignment.value() << " to " << NewAlign.value()
824 << "\n");
825 Alignment = NewAlign;
826 }
827 }
828
829 if (!IsAllowedAndFast(Alignment)) {
830 LLVM_DEBUG(
831 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
832 "because its alignment is not AllowedAndFast: "
833 << Alignment.value() << "\n");
834 continue;
835 }
836
837 if ((IsLoadChain &&
838 !TTI.isLegalToVectorizeLoadChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS)) ||
839 (!IsLoadChain &&
840 !TTI.isLegalToVectorizeStoreChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS))) {
841 LLVM_DEBUG(
842 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
843 "because !isLegalToVectorizeLoad/StoreChain.");
844 continue;
845 }
846
847 // Hooray, we can vectorize this chain!
848 Chain &NewChain = Ret.emplace_back();
849 for (unsigned I = CBegin; I <= CEnd; ++I)
850 NewChain.emplace_back(Args&: C[I]);
851 CBegin = CEnd; // Skip over the instructions we've added to the chain.
852 break;
853 }
854 }
855 return Ret;
856}
857
858bool Vectorizer::vectorizeChain(Chain &C) {
859 if (C.size() < 2)
860 return false;
861
862 sortChainInOffsetOrder(C);
863
864 LLVM_DEBUG({
865 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
866 dumpChain(C);
867 });
868
869 Type *VecElemTy = getChainElemTy(C);
870 bool IsLoadChain = isa<LoadInst>(Val: C[0].Inst);
871 unsigned AS = getLoadStoreAddressSpace(I: C[0].Inst);
872 unsigned ChainBytes = std::accumulate(
873 first: C.begin(), last: C.end(), init: 0u, binary_op: [&](unsigned Bytes, const ChainElem &E) {
874 return Bytes + DL.getTypeStoreSize(Ty: getLoadStoreType(I: E.Inst));
875 });
876 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
877 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
878 // than 1 byte (e.g. VecTy == <32 x i1>).
879 Type *VecTy = FixedVectorType::get(
880 ElementType: VecElemTy, NumElts: 8 * ChainBytes / DL.getTypeSizeInBits(Ty: VecElemTy));
881
882 Align Alignment = getLoadStoreAlignment(I: C[0].Inst);
883 // If this is a load/store of an alloca, we might have upgraded the alloca's
884 // alignment earlier. Get the new alignment.
885 if (AS == DL.getAllocaAddrSpace()) {
886 Alignment = std::max(
887 a: Alignment,
888 b: getOrEnforceKnownAlignment(V: getLoadStorePointerOperand(V: C[0].Inst),
889 PrefAlign: MaybeAlign(), DL, CxtI: C[0].Inst, AC: nullptr, DT: &DT));
890 }
891
892 // All elements of the chain must have the same scalar-type size.
893#ifndef NDEBUG
894 for (const ChainElem &E : C)
895 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
896 DL.getTypeStoreSize(VecElemTy));
897#endif
898
899 Instruction *VecInst;
900 if (IsLoadChain) {
901 // Loads get hoisted to the location of the first load in the chain. We may
902 // also need to hoist the (transitive) operands of the loads.
903 Builder.SetInsertPoint(
904 llvm::min_element(Range&: C, C: [](const auto &A, const auto &B) {
905 return A.Inst->comesBefore(B.Inst);
906 })->Inst);
907
908 // Chain is in offset order, so C[0] is the instr with the lowest offset,
909 // i.e. the root of the vector.
910 VecInst = Builder.CreateAlignedLoad(Ty: VecTy,
911 Ptr: getLoadStorePointerOperand(V: C[0].Inst),
912 Align: Alignment);
913
914 unsigned VecIdx = 0;
915 for (const ChainElem &E : C) {
916 Instruction *I = E.Inst;
917 Value *V;
918 Type *T = getLoadStoreType(I);
919 if (auto *VT = dyn_cast<FixedVectorType>(Val: T)) {
920 auto Mask = llvm::to_vector<8>(
921 Range: llvm::seq<int>(Begin: VecIdx, End: VecIdx + VT->getNumElements()));
922 V = Builder.CreateShuffleVector(V: VecInst, Mask, Name: I->getName());
923 VecIdx += VT->getNumElements();
924 } else {
925 V = Builder.CreateExtractElement(Vec: VecInst, Idx: Builder.getInt32(C: VecIdx),
926 Name: I->getName());
927 ++VecIdx;
928 }
929 if (V->getType() != I->getType())
930 V = Builder.CreateBitOrPointerCast(V, DestTy: I->getType());
931 I->replaceAllUsesWith(V);
932 }
933
934 // Finally, we need to reorder the instrs in the BB so that the (transitive)
935 // operands of VecInst appear before it. To see why, suppose we have
936 // vectorized the following code:
937 //
938 // ptr1 = gep a, 1
939 // load1 = load i32 ptr1
940 // ptr0 = gep a, 0
941 // load0 = load i32 ptr0
942 //
943 // We will put the vectorized load at the location of the earliest load in
944 // the BB, i.e. load1. We get:
945 //
946 // ptr1 = gep a, 1
947 // loadv = load <2 x i32> ptr0
948 // load0 = extractelement loadv, 0
949 // load1 = extractelement loadv, 1
950 // ptr0 = gep a, 0
951 //
952 // Notice that loadv uses ptr0, which is defined *after* it!
953 reorder(I: VecInst);
954 } else {
955 // Stores get sunk to the location of the last store in the chain.
956 Builder.SetInsertPoint(llvm::max_element(Range&: C, C: [](auto &A, auto &B) {
957 return A.Inst->comesBefore(B.Inst);
958 })->Inst);
959
960 // Build the vector to store.
961 Value *Vec = PoisonValue::get(T: VecTy);
962 unsigned VecIdx = 0;
963 auto InsertElem = [&](Value *V) {
964 if (V->getType() != VecElemTy)
965 V = Builder.CreateBitOrPointerCast(V, DestTy: VecElemTy);
966 Vec = Builder.CreateInsertElement(Vec, NewElt: V, Idx: Builder.getInt32(C: VecIdx++));
967 };
968 for (const ChainElem &E : C) {
969 auto *I = cast<StoreInst>(Val: E.Inst);
970 if (FixedVectorType *VT =
971 dyn_cast<FixedVectorType>(Val: getLoadStoreType(I))) {
972 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
973 InsertElem(Builder.CreateExtractElement(Vec: I->getValueOperand(),
974 Idx: Builder.getInt32(C: J)));
975 }
976 } else {
977 InsertElem(I->getValueOperand());
978 }
979 }
980
981 // Chain is in offset order, so C[0] is the instr with the lowest offset,
982 // i.e. the root of the vector.
983 VecInst = Builder.CreateAlignedStore(
984 Val: Vec,
985 Ptr: getLoadStorePointerOperand(V: C[0].Inst),
986 Align: Alignment);
987 }
988
989 propagateMetadata(I: VecInst, C);
990
991 for (const ChainElem &E : C)
992 ToErase.emplace_back(Args: E.Inst);
993
994 ++NumVectorInstructions;
995 NumScalarsVectorized += C.size();
996 return true;
997}
998
999template <bool IsLoadChain>
1000bool Vectorizer::isSafeToMove(
1001 Instruction *ChainElem, Instruction *ChainBegin,
1002 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
1003 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
1004 << *ChainBegin << ")\n");
1005
1006 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1007 if (ChainElem == ChainBegin)
1008 return true;
1009
1010 // Invariant loads can always be reordered; by definition they are not
1011 // clobbered by stores.
1012 if (isInvariantLoad(I: ChainElem))
1013 return true;
1014
1015 auto BBIt = std::next([&] {
1016 if constexpr (IsLoadChain)
1017 return BasicBlock::reverse_iterator(ChainElem);
1018 else
1019 return BasicBlock::iterator(ChainElem);
1020 }());
1021 auto BBItEnd = std::next([&] {
1022 if constexpr (IsLoadChain)
1023 return BasicBlock::reverse_iterator(ChainBegin);
1024 else
1025 return BasicBlock::iterator(ChainBegin);
1026 }());
1027
1028 const APInt &ChainElemOffset = ChainOffsets.at(Val: ChainElem);
1029 const unsigned ChainElemSize =
1030 DL.getTypeStoreSize(Ty: getLoadStoreType(I: ChainElem));
1031
1032 for (; BBIt != BBItEnd; ++BBIt) {
1033 Instruction *I = &*BBIt;
1034
1035 if (!I->mayReadOrWriteMemory())
1036 continue;
1037
1038 // Loads can be reordered with other loads.
1039 if (IsLoadChain && isa<LoadInst>(Val: I))
1040 continue;
1041
1042 // Stores can be sunk below invariant loads.
1043 if (!IsLoadChain && isInvariantLoad(I))
1044 continue;
1045
1046 // If I is in the chain, we can tell whether it aliases ChainIt by checking
1047 // what offset ChainIt accesses. This may be better than AA is able to do.
1048 //
1049 // We should really only have duplicate offsets for stores (the duplicate
1050 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1051 // split the chain so we don't have to handle this case specially.
1052 if (auto OffsetIt = ChainOffsets.find(Val: I); OffsetIt != ChainOffsets.end()) {
1053 // I and ChainElem overlap if:
1054 // - I and ChainElem have the same offset, OR
1055 // - I's offset is less than ChainElem's, but I touches past the
1056 // beginning of ChainElem, OR
1057 // - ChainElem's offset is less than I's, but ChainElem touches past the
1058 // beginning of I.
1059 const APInt &IOffset = OffsetIt->second;
1060 unsigned IElemSize = DL.getTypeStoreSize(Ty: getLoadStoreType(I));
1061 if (IOffset == ChainElemOffset ||
1062 (IOffset.sle(RHS: ChainElemOffset) &&
1063 (IOffset + IElemSize).sgt(RHS: ChainElemOffset)) ||
1064 (ChainElemOffset.sle(RHS: IOffset) &&
1065 (ChainElemOffset + ChainElemSize).sgt(RHS: OffsetIt->second))) {
1066 LLVM_DEBUG({
1067 // Double check that AA also sees this alias. If not, we probably
1068 // have a bug.
1069 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1070 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1071 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1072 });
1073 return false; // We found an aliasing instruction; bail.
1074 }
1075
1076 continue; // We're confident there's no alias.
1077 }
1078
1079 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1080 ModRefInfo MR = AA.getModRefInfo(I, OptLoc: MemoryLocation::get(Inst: ChainElem));
1081 if (IsLoadChain ? isModSet(MRI: MR) : isModOrRefSet(MRI: MR)) {
1082 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1083 << " Aliasing instruction:\n"
1084 << " " << *I << '\n'
1085 << " Aliased instruction and pointer:\n"
1086 << " " << *ChainElem << '\n'
1087 << " " << *getLoadStorePointerOperand(ChainElem)
1088 << '\n');
1089
1090 return false;
1091 }
1092 }
1093 return true;
1094}
1095
1096static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1097 BinaryOperator *BinOpI = cast<BinaryOperator>(Val: I);
1098 return (Signed && BinOpI->hasNoSignedWrap()) ||
1099 (!Signed && BinOpI->hasNoUnsignedWrap());
1100}
1101
1102static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1103 unsigned MatchingOpIdxA, Instruction *AddOpB,
1104 unsigned MatchingOpIdxB, bool Signed) {
1105 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1106 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1107 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1108 << ", MatchingOpIdxB=" << MatchingOpIdxB
1109 << ", Signed=" << Signed << "\n");
1110 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1111 // being the same, we can guarantee that the transformation is safe if we can
1112 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1113 // For example:
1114 // %tmp7 = add nsw i32 %tmp2, %v0
1115 // %tmp8 = sext i32 %tmp7 to i64
1116 // ...
1117 // %tmp11 = add nsw i32 %v0, 1
1118 // %tmp12 = add nsw i32 %tmp2, %tmp11
1119 // %tmp13 = sext i32 %tmp12 to i64
1120 //
1121 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1122 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1123 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1124 assert(AddOpA->getOpcode() == Instruction::Add &&
1125 AddOpB->getOpcode() == Instruction::Add &&
1126 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1127 if (AddOpA->getOperand(i: MatchingOpIdxA) ==
1128 AddOpB->getOperand(i: MatchingOpIdxB)) {
1129 Value *OtherOperandA = AddOpA->getOperand(i: MatchingOpIdxA == 1 ? 0 : 1);
1130 Value *OtherOperandB = AddOpB->getOperand(i: MatchingOpIdxB == 1 ? 0 : 1);
1131 Instruction *OtherInstrA = dyn_cast<Instruction>(Val: OtherOperandA);
1132 Instruction *OtherInstrB = dyn_cast<Instruction>(Val: OtherOperandB);
1133 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1134 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1135 checkNoWrapFlags(I: OtherInstrB, Signed) &&
1136 isa<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))) {
1137 int64_t CstVal =
1138 cast<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))->getSExtValue();
1139 if (OtherInstrB->getOperand(i: 0) == OtherOperandA &&
1140 IdxDiff.getSExtValue() == CstVal)
1141 return true;
1142 }
1143 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1144 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1145 checkNoWrapFlags(I: OtherInstrA, Signed) &&
1146 isa<ConstantInt>(Val: OtherInstrA->getOperand(i: 1))) {
1147 int64_t CstVal =
1148 cast<ConstantInt>(Val: OtherInstrA->getOperand(i: 1))->getSExtValue();
1149 if (OtherInstrA->getOperand(i: 0) == OtherOperandB &&
1150 IdxDiff.getSExtValue() == -CstVal)
1151 return true;
1152 }
1153 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1154 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1155 if (OtherInstrA && OtherInstrB &&
1156 OtherInstrA->getOpcode() == Instruction::Add &&
1157 OtherInstrB->getOpcode() == Instruction::Add &&
1158 checkNoWrapFlags(I: OtherInstrA, Signed) &&
1159 checkNoWrapFlags(I: OtherInstrB, Signed) &&
1160 isa<ConstantInt>(Val: OtherInstrA->getOperand(i: 1)) &&
1161 isa<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))) {
1162 int64_t CstValA =
1163 cast<ConstantInt>(Val: OtherInstrA->getOperand(i: 1))->getSExtValue();
1164 int64_t CstValB =
1165 cast<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))->getSExtValue();
1166 if (OtherInstrA->getOperand(i: 0) == OtherInstrB->getOperand(i: 0) &&
1167 IdxDiff.getSExtValue() == (CstValB - CstValA))
1168 return true;
1169 }
1170 }
1171 return false;
1172}
1173
1174std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1175 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1176 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1177 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1178 << " Depth=" << Depth << "\n");
1179 auto *GEPA = dyn_cast<GetElementPtrInst>(Val: PtrA);
1180 auto *GEPB = dyn_cast<GetElementPtrInst>(Val: PtrB);
1181 if (!GEPA || !GEPB)
1182 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1183
1184 // Look through GEPs after checking they're the same except for the last
1185 // index.
1186 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1187 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1188 return std::nullopt;
1189 gep_type_iterator GTIA = gep_type_begin(GEP: GEPA);
1190 gep_type_iterator GTIB = gep_type_begin(GEP: GEPB);
1191 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1192 if (GTIA.getOperand() != GTIB.getOperand())
1193 return std::nullopt;
1194 ++GTIA;
1195 ++GTIB;
1196 }
1197
1198 Instruction *OpA = dyn_cast<Instruction>(Val: GTIA.getOperand());
1199 Instruction *OpB = dyn_cast<Instruction>(Val: GTIB.getOperand());
1200 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1201 OpA->getType() != OpB->getType())
1202 return std::nullopt;
1203
1204 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1205
1206 // Only look through a ZExt/SExt.
1207 if (!isa<SExtInst>(Val: OpA) && !isa<ZExtInst>(Val: OpA))
1208 return std::nullopt;
1209
1210 bool Signed = isa<SExtInst>(Val: OpA);
1211
1212 // At this point A could be a function parameter, i.e. not an instruction
1213 Value *ValA = OpA->getOperand(i: 0);
1214 OpB = dyn_cast<Instruction>(Val: OpB->getOperand(i: 0));
1215 if (!OpB || ValA->getType() != OpB->getType())
1216 return std::nullopt;
1217
1218 const SCEV *OffsetSCEVA = SE.getSCEV(V: ValA);
1219 const SCEV *OffsetSCEVB = SE.getSCEV(V: OpB);
1220 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(LHS: OffsetSCEVB, RHS: OffsetSCEVA);
1221 if (IdxDiffSCEV == SE.getCouldNotCompute())
1222 return std::nullopt;
1223
1224 ConstantRange IdxDiffRange = SE.getSignedRange(S: IdxDiffSCEV);
1225 if (!IdxDiffRange.isSingleElement())
1226 return std::nullopt;
1227 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1228
1229 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1230 << "\n");
1231
1232 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1233 bool Safe = false;
1234
1235 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1236 // ValA, we're okay.
1237 if (OpB->getOpcode() == Instruction::Add &&
1238 isa<ConstantInt>(Val: OpB->getOperand(i: 1)) &&
1239 IdxDiff.sle(RHS: cast<ConstantInt>(Val: OpB->getOperand(i: 1))->getSExtValue()) &&
1240 checkNoWrapFlags(I: OpB, Signed))
1241 Safe = true;
1242
1243 // Second attempt: check if we have eligible add NSW/NUW instruction
1244 // sequences.
1245 OpA = dyn_cast<Instruction>(Val: ValA);
1246 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1247 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(I: OpA, Signed) &&
1248 checkNoWrapFlags(I: OpB, Signed)) {
1249 // In the checks below a matching operand in OpA and OpB is an operand which
1250 // is the same in those two instructions. Below we account for possible
1251 // orders of the operands of these add instructions.
1252 for (unsigned MatchingOpIdxA : {0, 1})
1253 for (unsigned MatchingOpIdxB : {0, 1})
1254 if (!Safe)
1255 Safe = checkIfSafeAddSequence(IdxDiff, AddOpA: OpA, MatchingOpIdxA, AddOpB: OpB,
1256 MatchingOpIdxB, Signed);
1257 }
1258
1259 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1260
1261 // Third attempt:
1262 //
1263 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1264 // order bit other than the sign bit are known to be zero in ValA, we can add
1265 // Diff to it while guaranteeing no overflow of any sort.
1266 //
1267 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1268 if (!Safe) {
1269 // When computing known bits, use the GEPs as context instructions, since
1270 // they likely are in the same BB as the load/store.
1271 KnownBits Known(BitWidth);
1272 computeKnownBits(V: (IdxDiff.sge(RHS: 0) ? ValA : OpB), Known, DL, AC: &AC, CxtI: ContextInst,
1273 DT: &DT);
1274 APInt BitsAllowedToBeSet = Known.Zero.zext(width: IdxDiff.getBitWidth());
1275 if (Signed)
1276 BitsAllowedToBeSet.clearBit(BitPosition: BitWidth - 1);
1277 Safe = BitsAllowedToBeSet.uge(RHS: IdxDiff.abs());
1278 }
1279
1280 if (Safe)
1281 return IdxDiff * Stride;
1282 return std::nullopt;
1283}
1284
1285std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1286 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1287 if (Depth++ == MaxDepth)
1288 return std::nullopt;
1289
1290 if (auto *SelectA = dyn_cast<SelectInst>(Val: PtrA)) {
1291 if (auto *SelectB = dyn_cast<SelectInst>(Val: PtrB)) {
1292 if (SelectA->getCondition() != SelectB->getCondition())
1293 return std::nullopt;
1294 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1295 << ", PtrB=" << *PtrB << ", ContextInst="
1296 << *ContextInst << ", Depth=" << Depth << "\n");
1297 std::optional<APInt> TrueDiff = getConstantOffset(
1298 PtrA: SelectA->getTrueValue(), PtrB: SelectB->getTrueValue(), ContextInst, Depth);
1299 if (!TrueDiff)
1300 return std::nullopt;
1301 std::optional<APInt> FalseDiff =
1302 getConstantOffset(PtrA: SelectA->getFalseValue(), PtrB: SelectB->getFalseValue(),
1303 ContextInst, Depth);
1304 if (TrueDiff == FalseDiff)
1305 return TrueDiff;
1306 }
1307 }
1308 return std::nullopt;
1309}
1310
1311void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
1312 if (EQClasses.size() < 2) // There is nothing to merge.
1313 return;
1314
1315 // The reduced key has all elements of the ECClassKey except the underlying
1316 // object. Check that EqClassKey has 4 elements and define the reduced key.
1317 static_assert(std::tuple_size_v<EqClassKey> == 4,
1318 "EqClassKey has changed - EqClassReducedKey needs changes too");
1319 using EqClassReducedKey =
1320 std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
1321 std::tuple_element_t<2, EqClassKey> /* Element size */,
1322 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
1323 using ECReducedKeyToUnderlyingObjectMap =
1324 MapVector<EqClassReducedKey,
1325 SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
1326
1327 // Form a map from the reduced key (without the underlying object) to the
1328 // underlying objects: 1 reduced key to many underlying objects, to form
1329 // groups of potentially merge-able equivalence classes.
1330 ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
1331 bool FoundPotentiallyOptimizableEC = false;
1332 for (const auto &EC : EQClasses) {
1333 const auto &Key = EC.first;
1334 EqClassReducedKey RedKey{std::get<1>(t: Key), std::get<2>(t: Key),
1335 std::get<3>(t: Key)};
1336 auto &UOMap = RedKeyToUOMap[RedKey];
1337 UOMap.insert(Ptr: std::get<0>(t: Key));
1338 if (UOMap.size() > 1)
1339 FoundPotentiallyOptimizableEC = true;
1340 }
1341 if (!FoundPotentiallyOptimizableEC)
1342 return;
1343
1344 LLVM_DEBUG({
1345 dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
1346 for (const auto &EC : EQClasses) {
1347 dbgs() << " Key: {" << EC.first << "}\n";
1348 for (const auto &Inst : EC.second)
1349 dbgs() << " Inst: " << *Inst << '\n';
1350 }
1351 });
1352 LLVM_DEBUG({
1353 dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
1354 for (const auto &RedKeyToUO : RedKeyToUOMap) {
1355 dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
1356 << std::get<1>(RedKeyToUO.first) << ", "
1357 << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
1358 << RedKeyToUO.second.size() << " underlying objects:\n";
1359 for (auto UObject : RedKeyToUO.second)
1360 dbgs() << " " << *UObject << '\n';
1361 }
1362 });
1363
1364 using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
1365
1366 // Compute the ultimate targets for a set of underlying objects.
1367 auto GetUltimateTargets =
1368 [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
1369 UObjectToUObjectMap IndirectionMap;
1370 for (const auto *UObject : UObjects) {
1371 const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
1372 const auto *UltimateTarget = getUnderlyingObject(V: UObject, MaxLookup: MaxLookupDepth);
1373 if (UltimateTarget != UObject)
1374 IndirectionMap[UObject] = UltimateTarget;
1375 }
1376 UObjectToUObjectMap UltimateTargetsMap;
1377 for (const auto *UObject : UObjects) {
1378 auto Target = UObject;
1379 auto It = IndirectionMap.find(Val: Target);
1380 for (; It != IndirectionMap.end(); It = IndirectionMap.find(Val: Target))
1381 Target = It->second;
1382 UltimateTargetsMap[UObject] = Target;
1383 }
1384 return UltimateTargetsMap;
1385 };
1386
1387 // For each item in RedKeyToUOMap, if it has more than one underlying object,
1388 // try to merge the equivalence classes.
1389 for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
1390 if (UObjects.size() < 2)
1391 continue;
1392 auto UTMap = GetUltimateTargets(UObjects);
1393 for (const auto &[UObject, UltimateTarget] : UTMap) {
1394 if (UObject == UltimateTarget)
1395 continue;
1396
1397 EqClassKey KeyFrom{UObject, std::get<0>(t&: RedKey), std::get<1>(t&: RedKey),
1398 std::get<2>(t&: RedKey)};
1399 EqClassKey KeyTo{UltimateTarget, std::get<0>(t&: RedKey), std::get<1>(t&: RedKey),
1400 std::get<2>(t&: RedKey)};
1401 // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus,
1402 // request the reference to the instructions vector for KeyTo first.
1403 const auto &VecTo = EQClasses[KeyTo];
1404 const auto &VecFrom = EQClasses[KeyFrom];
1405 SmallVector<Instruction *, 8> MergedVec;
1406 std::merge(first1: VecFrom.begin(), last1: VecFrom.end(), first2: VecTo.begin(), last2: VecTo.end(),
1407 result: std::back_inserter(x&: MergedVec),
1408 comp: [](Instruction *A, Instruction *B) {
1409 return A && B && A->comesBefore(Other: B);
1410 });
1411 EQClasses[KeyTo] = std::move(MergedVec);
1412 EQClasses.erase(Key: KeyFrom);
1413 }
1414 }
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << "  Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << "    Inst: " << *Inst << '\n';
    }
  });
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for the true
      // and false values of the condition. Grouping by the selects themselves
      // would therefore put consecutive accesses into different lists, so they
      // would never even be checked for being consecutive and would not be
      // vectorized. Group by the shared condition instead.
      return Sel->getCondition();
    }
    return ObjPtr;
  };
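  // Hypothetical IR illustrating the above (names invented for exposition):
  //   %pa = select i1 %c, ptr %x, ptr %y
  //   %pb = select i1 %c, ptr %x4, ptr %y4   ; %x4/%y4 are %x/%y plus 4 bytes
  // Loads through %pa and %pb are consecutive for either value of %c, so both
  // are keyed on %c and get a chance to form a chain later.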

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes. They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    // functions currently use an integer type for the vectorized load/store,
    // and do not support casting between the integer type and a vector of
    // pointers (e.g. i64 to <2 x i16*>).
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;
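    // Illustrative numbers (not taken from any particular TTI): with
    // VecRegSize = 128 and a 32-bit scalar element, TySize = 32 <= 64 and
    // VF = 4, so the access is kept; a 128-bit element is dropped here
    // because it could at best fill the whole vector register by itself.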

    Ret[{GetUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .emplace_back(&I);
  }

  mergeEquivalenceClasses(Ret);
  return Ret;
}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
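  // Note on the data structure: the DenseSet hashes and compares on the
  // embedded Instruction* key, while the intrusive list gives O(1)
  // move-to-front, so together they act as the MRU-ordered map described
  // above without MapVector's O(n) reordering cost.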

  // Compare each instruction in `Instrs` to the leaders of the N most
  // recently-used chains. This limits the O(n^2) behavior of this pass while
  // also allowing us to build arbitrarily long chains.
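  // Worked example (illustrative only): for loads from %p, %p+4, %p+100 and
  // %p+8 (byte offsets), the first load starts a chain with offset 0 and the
  // rest join it with offsets 4, 100 and 8. Contiguity is not checked here;
  // the gap before offset 100 is handled when chains are split later.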
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      if (std::optional<APInt> Offset = getConstantOffset(
              getLoadStorePointerOperand(ChainIter->first),
              getLoadStorePointerOperand(I),
              /*ContextInst=*/
              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
        // `Offset` might not have the expected number of bits, if e.g. AS has a
        // different number of bits than opaque pointers.
        ChainIter->second.emplace_back(I, Offset.value());
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.emplace_back(I, ZeroOffset);
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.emplace_back(std::move(E.second));
  return Ret;
}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst=" << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
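  // Fast-path illustration (assuming inbounds GEPs with constant indices): if
  // PtrA is `gep i8, ptr %p, 8` and PtrB is `gep i8, ptr %p, 24`, both strip
  // to %p with OffsetA = 8 and OffsetB = 24, so the check below returns a
  // distance of 16 without consulting SCEV at all.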
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
  if (std::optional<APInt> Diff =
          getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}
