1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
59
60#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
65#include "llvm/ADT/PostOrderIterator.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
68#include "llvm/ADT/SmallPtrSet.h"
69#include "llvm/ADT/SmallVector.h"
70#include "llvm/ADT/Statistic.h"
71#include "llvm/ADT/iterator_range.h"
72#include "llvm/Analysis/AliasAnalysis.h"
73#include "llvm/Analysis/AssumptionCache.h"
74#include "llvm/Analysis/MemoryLocation.h"
75#include "llvm/Analysis/ScalarEvolution.h"
76#include "llvm/Analysis/TargetTransformInfo.h"
77#include "llvm/Analysis/ValueTracking.h"
78#include "llvm/Analysis/VectorUtils.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
81#include "llvm/IR/ConstantRange.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
84#include "llvm/IR/DerivedTypes.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
87#include "llvm/IR/GetElementPtrTypeIterator.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
91#include "llvm/IR/Instructions.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
96#include "llvm/InitializePasses.h"
97#include "llvm/Pass.h"
98#include "llvm/Support/Alignment.h"
99#include "llvm/Support/Casting.h"
100#include "llvm/Support/Debug.h"
101#include "llvm/Support/KnownBits.h"
102#include "llvm/Support/MathExtras.h"
103#include "llvm/Support/ModRef.h"
104#include "llvm/Support/raw_ostream.h"
105#include "llvm/Transforms/Utils/Local.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from the this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160};
161using Chain = SmallVector<ChainElem, 1>;
162
163void sortChainInBBOrder(Chain &C) {
164 sort(C, Comp: [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
165}
166
167void sortChainInOffsetOrder(Chain &C) {
168 sort(C, Comp: [](const auto &A, const auto &B) {
169 if (A.OffsetFromLeader != B.OffsetFromLeader)
170 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
171 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
172 });
173}
174
175[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
176 for (const auto &E : C) {
177 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
178 }
179}
180
181using EquivalenceClassMap =
182 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
183
184// FIXME: Assuming stack alignment of 4 is always good enough
185constexpr unsigned StackAdjustedAlignment = 4;
186
187Instruction *propagateMetadata(Instruction *I, const Chain &C) {
188 SmallVector<Value *, 8> Values;
189 for (const ChainElem &E : C)
190 Values.push_back(Elt: E.Inst);
191 return propagateMetadata(I, VL: Values);
192}
193
194bool isInvariantLoad(const Instruction *I) {
195 const LoadInst *LI = dyn_cast<LoadInst>(Val: I);
196 return LI != nullptr && LI->hasMetadata(KindID: LLVMContext::MD_invariant_load);
197}
198
199/// Reorders the instructions that I depends on (the instructions defining its
200/// operands), to ensure they dominate I.
201void reorder(Instruction *I) {
202 SmallPtrSet<Instruction *, 16> InstructionsToMove;
203 SmallVector<Instruction *, 16> Worklist;
204
205 Worklist.push_back(Elt: I);
206 while (!Worklist.empty()) {
207 Instruction *IW = Worklist.pop_back_val();
208 int NumOperands = IW->getNumOperands();
209 for (int i = 0; i < NumOperands; i++) {
210 Instruction *IM = dyn_cast<Instruction>(Val: IW->getOperand(i));
211 if (!IM || IM->getOpcode() == Instruction::PHI)
212 continue;
213
214 // If IM is in another BB, no need to move it, because this pass only
215 // vectorizes instructions within one BB.
216 if (IM->getParent() != I->getParent())
217 continue;
218
219 if (!IM->comesBefore(Other: I)) {
220 InstructionsToMove.insert(Ptr: IM);
221 Worklist.push_back(Elt: IM);
222 }
223 }
224 }
225
226 // All instructions to move should follow I. Start from I, not from begin().
227 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
228 Instruction *IM = &*(BBI++);
229 if (!InstructionsToMove.count(Ptr: IM))
230 continue;
231 IM->moveBefore(MovePos: I);
232 }
233}
234
235class Vectorizer {
236 Function &F;
237 AliasAnalysis &AA;
238 AssumptionCache &AC;
239 DominatorTree &DT;
240 ScalarEvolution &SE;
241 TargetTransformInfo &TTI;
242 const DataLayout &DL;
243 IRBuilder<> Builder;
244
245 // We could erase instrs right after vectorizing them, but that can mess up
246 // our BB iterators, and also can make the equivalence class keys point to
247 // freed memory. This is fixable, but it's simpler just to wait until we're
248 // done with the BB and erase all at once.
249 SmallVector<Instruction *, 128> ToErase;
250
251public:
252 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
253 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
254 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
255 DL(F.getDataLayout()), Builder(SE.getContext()) {}
256
257 bool run();
258
259private:
260 static const unsigned MaxDepth = 3;
261
262 /// Runs the vectorizer on a "pseudo basic block", which is a range of
263 /// instructions [Begin, End) within one BB all of which have
264 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
265 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
266
267 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
268 /// in the same BB with the same value for getUnderlyingObject() etc.
269 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
270 ArrayRef<Instruction *> EqClass);
271
272 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
273 /// where all instructions access a known, constant offset from the first
274 /// instruction.
275 bool runOnChain(Chain &C);
276
277 /// Splits the chain into subchains of instructions which read/write a
278 /// contiguous block of memory. Discards any length-1 subchains (because
279 /// there's nothing to vectorize in there).
280 std::vector<Chain> splitChainByContiguity(Chain &C);
281
282 /// Splits the chain into subchains where it's safe to hoist loads up to the
283 /// beginning of the sub-chain and it's safe to sink loads up to the end of
284 /// the sub-chain. Discards any length-1 subchains.
285 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
286
287 /// Splits the chain into subchains that make legal, aligned accesses.
288 /// Discards any length-1 subchains.
289 std::vector<Chain> splitChainByAlignment(Chain &C);
290
291 /// Converts the instrs in the chain into a single vectorized load or store.
292 /// Adds the old scalar loads/stores to ToErase.
293 bool vectorizeChain(Chain &C);
294
295 /// Tries to compute the offset in bytes PtrB - PtrA.
296 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
297 Instruction *ContextInst,
298 unsigned Depth = 0);
299 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
300 Instruction *ContextInst,
301 unsigned Depth);
302 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
303 Instruction *ContextInst,
304 unsigned Depth);
305
306 /// Gets the element type of the vector that the chain will load or store.
307 /// This is nontrivial because the chain may contain elements of different
308 /// types; e.g. it's legal to have a chain that contains both i32 and float.
309 Type *getChainElemTy(const Chain &C);
310
311 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
312 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
313 /// instructions.
314 ///
315 /// The map ChainElemOffsets must contain all of the elements in
316 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
317 /// address. It's ok if it contains additional entries.
318 template <bool IsLoadChain>
319 bool isSafeToMove(
320 Instruction *ChainElem, Instruction *ChainBegin,
321 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
322
323 /// Collects loads and stores grouped by "equivalence class", where:
324 /// - all elements in an eq class are a load or all are a store,
325 /// - they all load/store the same element size (it's OK to have e.g. i8 and
326 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
327 /// - they all have the same value for getUnderlyingObject().
328 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
329 BasicBlock::iterator End);
330
331 /// Partitions Instrs into "chains" where every instruction has a known
332 /// constant offset from the first instr in the chain.
333 ///
334 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
335 /// in the chain is the leader, and an instr touches distance 0 from itself.
336 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
337};
338
339class LoadStoreVectorizerLegacyPass : public FunctionPass {
340public:
341 static char ID;
342
343 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
344 initializeLoadStoreVectorizerLegacyPassPass(
345 *PassRegistry::getPassRegistry());
346 }
347
348 bool runOnFunction(Function &F) override;
349
350 StringRef getPassName() const override {
351 return "GPU Load and Store Vectorizer";
352 }
353
354 void getAnalysisUsage(AnalysisUsage &AU) const override {
355 AU.addRequired<AAResultsWrapperPass>();
356 AU.addRequired<AssumptionCacheTracker>();
357 AU.addRequired<ScalarEvolutionWrapperPass>();
358 AU.addRequired<DominatorTreeWrapperPass>();
359 AU.addRequired<TargetTransformInfoWrapperPass>();
360 AU.setPreservesCFG();
361 }
362};
363
364} // end anonymous namespace
365
366char LoadStoreVectorizerLegacyPass::ID = 0;
367
368INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
369 "Vectorize load and Store instructions", false, false)
370INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
371INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
372INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
373INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
374INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
375INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
376INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
377 "Vectorize load and store instructions", false, false)
378
379Pass *llvm::createLoadStoreVectorizerPass() {
380 return new LoadStoreVectorizerLegacyPass();
381}
382
383bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
384 // Don't vectorize when the attribute NoImplicitFloat is used.
385 if (skipFunction(F) || F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
386 return false;
387
388 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
389 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
390 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
391 TargetTransformInfo &TTI =
392 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
393
394 AssumptionCache &AC =
395 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
396
397 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
398}
399
400PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
401 FunctionAnalysisManager &AM) {
402 // Don't vectorize when the attribute NoImplicitFloat is used.
403 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
404 return PreservedAnalyses::all();
405
406 AliasAnalysis &AA = AM.getResult<AAManager>(IR&: F);
407 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
408 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
409 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
410 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
411
412 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
413 PreservedAnalyses PA;
414 PA.preserveSet<CFGAnalyses>();
415 return Changed ? PA : PreservedAnalyses::all();
416}
417
418bool Vectorizer::run() {
419 bool Changed = false;
420 // Break up the BB if there are any instrs which aren't guaranteed to transfer
421 // execution to their successor.
422 //
423 // Consider, for example:
424 //
425 // def assert_arr_len(int n) { if (n < 2) exit(); }
426 //
427 // load arr[0]
428 // call assert_array_len(arr.length)
429 // load arr[1]
430 //
431 // Even though assert_arr_len does not read or write any memory, we can't
432 // speculate the second load before the call. More info at
433 // https://github.com/llvm/llvm-project/issues/52950.
434 for (BasicBlock *BB : post_order(G: &F)) {
435 // BB must at least have a terminator.
436 assert(!BB->empty());
437
438 SmallVector<BasicBlock::iterator, 8> Barriers;
439 Barriers.push_back(Elt: BB->begin());
440 for (Instruction &I : *BB)
441 if (!isGuaranteedToTransferExecutionToSuccessor(I: &I))
442 Barriers.push_back(Elt: I.getIterator());
443 Barriers.push_back(Elt: BB->end());
444
445 for (auto It = Barriers.begin(), End = std::prev(x: Barriers.end()); It != End;
446 ++It)
447 Changed |= runOnPseudoBB(Begin: *It, End: *std::next(x: It));
448
449 for (Instruction *I : ToErase) {
450 auto *PtrOperand = getLoadStorePointerOperand(V: I);
451 if (I->use_empty())
452 I->eraseFromParent();
453 RecursivelyDeleteTriviallyDeadInstructions(V: PtrOperand);
454 }
455 ToErase.clear();
456 }
457
458 return Changed;
459}
460
461bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
462 BasicBlock::iterator End) {
463 LLVM_DEBUG({
464 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
465 if (End != Begin->getParent()->end())
466 dbgs() << *End;
467 else
468 dbgs() << "<BB end>";
469 dbgs() << ")\n";
470 });
471
472 bool Changed = false;
473 for (const auto &[EqClassKey, EqClass] :
474 collectEquivalenceClasses(Begin, End))
475 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
476
477 return Changed;
478}
479
480bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
481 ArrayRef<Instruction *> EqClass) {
482 bool Changed = false;
483
484 LLVM_DEBUG({
485 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
486 << " keyed on " << EqClassKey << ":\n";
487 for (Instruction *I : EqClass)
488 dbgs() << " " << *I << "\n";
489 });
490
491 std::vector<Chain> Chains = gatherChains(Instrs: EqClass);
492 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
493 << " nontrivial chains.\n";);
494 for (Chain &C : Chains)
495 Changed |= runOnChain(C);
496 return Changed;
497}
498
499bool Vectorizer::runOnChain(Chain &C) {
500 LLVM_DEBUG({
501 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
502 dumpChain(C);
503 });
504
505 // Split up the chain into increasingly smaller chains, until we can finally
506 // vectorize the chains.
507 //
508 // (Don't be scared by the depth of the loop nest here. These operations are
509 // all at worst O(n lg n) in the number of instructions, and splitting chains
510 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
511 bool Changed = false;
512 for (auto &C : splitChainByMayAliasInstrs(C))
513 for (auto &C : splitChainByContiguity(C))
514 for (auto &C : splitChainByAlignment(C))
515 Changed |= vectorizeChain(C);
516 return Changed;
517}
518
519std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
520 if (C.empty())
521 return {};
522
523 sortChainInBBOrder(C);
524
525 LLVM_DEBUG({
526 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
527 dumpChain(C);
528 });
529
530 // We know that elements in the chain with nonverlapping offsets can't
531 // alias, but AA may not be smart enough to figure this out. Use a
532 // hashtable.
533 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
534 for (const auto &E : C)
535 ChainOffsets.insert(KV: {&*E.Inst, E.OffsetFromLeader});
536
537 // Loads get hoisted up to the first load in the chain. Stores get sunk
538 // down to the last store in the chain. Our algorithm for loads is:
539 //
540 // - Take the first element of the chain. This is the start of a new chain.
541 // - Take the next element of `Chain` and check for may-alias instructions
542 // up to the start of NewChain. If no may-alias instrs, add it to
543 // NewChain. Otherwise, start a new NewChain.
544 //
545 // For stores it's the same except in the reverse direction.
546 //
547 // We expect IsLoad to be an std::bool_constant.
548 auto Impl = [&](auto IsLoad) {
549 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
550 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
551 if constexpr (IsLoad())
552 return std::make_pair(x: C.begin(), y: C.end());
553 else
554 return std::make_pair(x: C.rbegin(), y: C.rend());
555 }(IsLoad);
556 assert(ChainBegin != ChainEnd);
557
558 std::vector<Chain> Chains;
559 SmallVector<ChainElem, 1> NewChain;
560 NewChain.push_back(*ChainBegin);
561 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
562 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
563 ChainOffsets)) {
564 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
565 << *ChainIt->Inst << " into " << *ChainBegin->Inst
566 << "\n");
567 NewChain.push_back(*ChainIt);
568 } else {
569 LLVM_DEBUG(
570 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
571 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
572 if (NewChain.size() > 1) {
573 LLVM_DEBUG({
574 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
575 dumpChain(NewChain);
576 });
577 Chains.push_back(x: std::move(NewChain));
578 }
579
580 // Start a new chain.
581 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
582 }
583 }
584 if (NewChain.size() > 1) {
585 LLVM_DEBUG({
586 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
587 dumpChain(NewChain);
588 });
589 Chains.push_back(x: std::move(NewChain));
590 }
591 return Chains;
592 };
593
594 if (isa<LoadInst>(Val: C[0].Inst))
595 return Impl(/*IsLoad=*/std::bool_constant<true>());
596
597 assert(isa<StoreInst>(C[0].Inst));
598 return Impl(/*IsLoad=*/std::bool_constant<false>());
599}
600
601std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
602 if (C.empty())
603 return {};
604
605 sortChainInOffsetOrder(C);
606
607 LLVM_DEBUG({
608 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
609 dumpChain(C);
610 });
611
612 std::vector<Chain> Ret;
613 Ret.push_back(x: {C.front()});
614
615 for (auto It = std::next(x: C.begin()), End = C.end(); It != End; ++It) {
616 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
617 auto &CurChain = Ret.back();
618 const ChainElem &Prev = CurChain.back();
619 unsigned SzBits = DL.getTypeSizeInBits(Ty: getLoadStoreType(I: &*Prev.Inst));
620 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
621 "collectEquivalenceClass");
622 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
623
624 // Add this instruction to the end of the current chain, or start a new one.
625 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
626 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
627 << (AreContiguous ? "" : "not ") << "contiguous: "
628 << *Prev.Inst << " (ends at offset " << PrevReadEnd
629 << ") -> " << *It->Inst << " (starts at offset "
630 << It->OffsetFromLeader << ")\n");
631 if (AreContiguous)
632 CurChain.push_back(Elt: *It);
633 else
634 Ret.push_back(x: {*It});
635 }
636
637 // Filter out length-1 chains, these are uninteresting.
638 llvm::erase_if(C&: Ret, P: [](const auto &Chain) { return Chain.size() <= 1; });
639 return Ret;
640}
641
642Type *Vectorizer::getChainElemTy(const Chain &C) {
643 assert(!C.empty());
644 // The rules are:
645 // - If there are any pointer types in the chain, use an integer type.
646 // - Prefer an integer type if it appears in the chain.
647 // - Otherwise, use the first type in the chain.
648 //
649 // The rule about pointer types is a simplification when we merge e.g. a load
650 // of a ptr and a double. There's no direct conversion from a ptr to a
651 // double; it requires a ptrtoint followed by a bitcast.
652 //
653 // It's unclear to me if the other rules have any practical effect, but we do
654 // it to match this pass's previous behavior.
655 if (any_of(Range: C, P: [](const ChainElem &E) {
656 return getLoadStoreType(I: E.Inst)->getScalarType()->isPointerTy();
657 })) {
658 return Type::getIntNTy(
659 C&: F.getContext(),
660 N: DL.getTypeSizeInBits(Ty: getLoadStoreType(I: C[0].Inst)->getScalarType()));
661 }
662
663 for (const ChainElem &E : C)
664 if (Type *T = getLoadStoreType(I: E.Inst)->getScalarType(); T->isIntegerTy())
665 return T;
666 return getLoadStoreType(I: C[0].Inst)->getScalarType();
667}
668
669std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
670 // We use a simple greedy algorithm.
671 // - Given a chain of length N, find all prefixes that
672 // (a) are not longer than the max register length, and
673 // (b) are a power of 2.
674 // - Starting from the longest prefix, try to create a vector of that length.
675 // - If one of them works, great. Repeat the algorithm on any remaining
676 // elements in the chain.
677 // - If none of them work, discard the first element and repeat on a chain
678 // of length N-1.
679 if (C.empty())
680 return {};
681
682 sortChainInOffsetOrder(C);
683
684 LLVM_DEBUG({
685 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
686 dumpChain(C);
687 });
688
689 bool IsLoadChain = isa<LoadInst>(Val: C[0].Inst);
690 auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
691 unsigned ChainSizeBytes, VectorType *VecTy) {
692 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadSize: LoadStoreSize,
693 ChainSizeInBytes: ChainSizeBytes, VecTy)
694 : TTI.getStoreVectorFactor(VF, StoreSize: LoadStoreSize,
695 ChainSizeInBytes: ChainSizeBytes, VecTy);
696 };
697
698#ifndef NDEBUG
699 for (const auto &E : C) {
700 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
701 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
702 "Should have filtered out non-power-of-two elements in "
703 "collectEquivalenceClasses.");
704 }
705#endif
706
707 unsigned AS = getLoadStoreAddressSpace(I: C[0].Inst);
708 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AddrSpace: AS) / 8;
709
710 std::vector<Chain> Ret;
711 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
712 // Find candidate chains of size not greater than the largest vector reg.
713 // These chains are over the closed interval [CBegin, CEnd].
714 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
715 CandidateChains;
716 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
717 APInt Sz = C[CEnd].OffsetFromLeader +
718 DL.getTypeStoreSize(Ty: getLoadStoreType(I: C[CEnd].Inst)) -
719 C[CBegin].OffsetFromLeader;
720 if (Sz.sgt(RHS: VecRegBytes))
721 break;
722 CandidateChains.push_back(
723 Elt: {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
724 }
725
726 // Consider the longest chain first.
727 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
728 It != End; ++It) {
729 auto [CEnd, SizeBytes] = *It;
730 LLVM_DEBUG(
731 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
732 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
733
734 Type *VecElemTy = getChainElemTy(C);
735 // Note, VecElemTy is a power of 2, but might be less than one byte. For
736 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
737 // VecElemTy would be i4.
738 unsigned VecElemBits = DL.getTypeSizeInBits(Ty: VecElemTy);
739
740 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
741 assert((8 * SizeBytes) % VecElemBits == 0);
742 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
743 FixedVectorType *VecTy = FixedVectorType::get(ElementType: VecElemTy, NumElts: NumVecElems);
744 unsigned VF = 8 * VecRegBytes / VecElemBits;
745
746 // Check that TTI is happy with this vectorization factor.
747 unsigned TargetVF = getVectorFactor(VF, VecElemBits,
748 VecElemBits * NumVecElems / 8, VecTy);
749 if (TargetVF != VF && TargetVF < NumVecElems) {
750 LLVM_DEBUG(
751 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
752 "because TargetVF="
753 << TargetVF << " != VF=" << VF
754 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
755 continue;
756 }
757
758 // Is a load/store with this alignment allowed by TTI and at least as fast
759 // as an unvectorized load/store?
760 //
761 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
762 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
763 &F = F](Align Alignment) {
764 if (Alignment.value() % SizeBytes == 0)
765 return true;
766 unsigned VectorizedSpeed = 0;
767 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
768 Context&: F.getContext(), BitWidth: SizeBytes * 8, AddressSpace: AS, Alignment, Fast: &VectorizedSpeed);
769 if (!AllowsMisaligned) {
770 LLVM_DEBUG(dbgs()
771 << "LSV: Access of " << SizeBytes << "B in addrspace "
772 << AS << " with alignment " << Alignment.value()
773 << " is misaligned, and therefore can't be vectorized.\n");
774 return false;
775 }
776
777 unsigned ElementwiseSpeed = 0;
778 (TTI).allowsMisalignedMemoryAccesses(Context&: (F).getContext(), BitWidth: VecElemBits, AddressSpace: AS,
779 Alignment, Fast: &ElementwiseSpeed);
780 if (VectorizedSpeed < ElementwiseSpeed) {
781 LLVM_DEBUG(dbgs()
782 << "LSV: Access of " << SizeBytes << "B in addrspace "
783 << AS << " with alignment " << Alignment.value()
784 << " has relative speed " << VectorizedSpeed
785 << ", which is lower than the elementwise speed of "
786 << ElementwiseSpeed
787 << ". Therefore this access won't be vectorized.\n");
788 return false;
789 }
790 return true;
791 };
792
793 // If we're loading/storing from an alloca, align it if possible.
794 //
795 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
796 // tells us this is beneficial. This feels a bit odd, but it matches
797 // existing tests. This isn't *so* bad, because at most we align to 4
798 // bytes (current value of StackAdjustedAlignment).
799 //
800 // FIXME: We will upgrade the alignment of the alloca even if it turns out
801 // we can't vectorize for some other reason.
802 Value *PtrOperand = getLoadStorePointerOperand(V: C[CBegin].Inst);
803 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
804 isa<AllocaInst>(Val: PtrOperand->stripPointerCasts());
805 Align Alignment = getLoadStoreAlignment(I: C[CBegin].Inst);
806 Align PrefAlign = Align(StackAdjustedAlignment);
807 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
808 IsAllowedAndFast(PrefAlign)) {
809 Align NewAlign = getOrEnforceKnownAlignment(
810 V: PtrOperand, PrefAlign, DL, CxtI: C[CBegin].Inst, AC: nullptr, DT: &DT);
811 if (NewAlign >= Alignment) {
812 LLVM_DEBUG(dbgs()
813 << "LSV: splitByChain upgrading alloca alignment from "
814 << Alignment.value() << " to " << NewAlign.value()
815 << "\n");
816 Alignment = NewAlign;
817 }
818 }
819
820 if (!IsAllowedAndFast(Alignment)) {
821 LLVM_DEBUG(
822 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
823 "because its alignment is not AllowedAndFast: "
824 << Alignment.value() << "\n");
825 continue;
826 }
827
828 if ((IsLoadChain &&
829 !TTI.isLegalToVectorizeLoadChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS)) ||
830 (!IsLoadChain &&
831 !TTI.isLegalToVectorizeStoreChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS))) {
832 LLVM_DEBUG(
833 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
834 "because !isLegalToVectorizeLoad/StoreChain.");
835 continue;
836 }
837
838 // Hooray, we can vectorize this chain!
839 Chain &NewChain = Ret.emplace_back();
840 for (unsigned I = CBegin; I <= CEnd; ++I)
841 NewChain.push_back(Elt: C[I]);
842 CBegin = CEnd; // Skip over the instructions we've added to the chain.
843 break;
844 }
845 }
846 return Ret;
847}
848
849bool Vectorizer::vectorizeChain(Chain &C) {
850 if (C.size() < 2)
851 return false;
852
853 sortChainInOffsetOrder(C);
854
855 LLVM_DEBUG({
856 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
857 dumpChain(C);
858 });
859
860 Type *VecElemTy = getChainElemTy(C);
861 bool IsLoadChain = isa<LoadInst>(Val: C[0].Inst);
862 unsigned AS = getLoadStoreAddressSpace(I: C[0].Inst);
863 unsigned ChainBytes = std::accumulate(
864 first: C.begin(), last: C.end(), init: 0u, binary_op: [&](unsigned Bytes, const ChainElem &E) {
865 return Bytes + DL.getTypeStoreSize(Ty: getLoadStoreType(I: E.Inst));
866 });
867 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
868 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
869 // than 1 byte (e.g. VecTy == <32 x i1>).
870 Type *VecTy = FixedVectorType::get(
871 ElementType: VecElemTy, NumElts: 8 * ChainBytes / DL.getTypeSizeInBits(Ty: VecElemTy));
872
873 Align Alignment = getLoadStoreAlignment(I: C[0].Inst);
874 // If this is a load/store of an alloca, we might have upgraded the alloca's
875 // alignment earlier. Get the new alignment.
876 if (AS == DL.getAllocaAddrSpace()) {
877 Alignment = std::max(
878 a: Alignment,
879 b: getOrEnforceKnownAlignment(V: getLoadStorePointerOperand(V: C[0].Inst),
880 PrefAlign: MaybeAlign(), DL, CxtI: C[0].Inst, AC: nullptr, DT: &DT));
881 }
882
883 // All elements of the chain must have the same scalar-type size.
884#ifndef NDEBUG
885 for (const ChainElem &E : C)
886 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
887 DL.getTypeStoreSize(VecElemTy));
888#endif
889
890 Instruction *VecInst;
891 if (IsLoadChain) {
892 // Loads get hoisted to the location of the first load in the chain. We may
893 // also need to hoist the (transitive) operands of the loads.
894 Builder.SetInsertPoint(
895 llvm::min_element(Range&: C, C: [](const auto &A, const auto &B) {
896 return A.Inst->comesBefore(B.Inst);
897 })->Inst);
898
899 // Chain is in offset order, so C[0] is the instr with the lowest offset,
900 // i.e. the root of the vector.
901 VecInst = Builder.CreateAlignedLoad(Ty: VecTy,
902 Ptr: getLoadStorePointerOperand(V: C[0].Inst),
903 Align: Alignment);
904
905 unsigned VecIdx = 0;
906 for (const ChainElem &E : C) {
907 Instruction *I = E.Inst;
908 Value *V;
909 Type *T = getLoadStoreType(I);
910 if (auto *VT = dyn_cast<FixedVectorType>(Val: T)) {
911 auto Mask = llvm::to_vector<8>(
912 Range: llvm::seq<int>(Begin: VecIdx, End: VecIdx + VT->getNumElements()));
913 V = Builder.CreateShuffleVector(V: VecInst, Mask, Name: I->getName());
914 VecIdx += VT->getNumElements();
915 } else {
916 V = Builder.CreateExtractElement(Vec: VecInst, Idx: Builder.getInt32(C: VecIdx),
917 Name: I->getName());
918 ++VecIdx;
919 }
920 if (V->getType() != I->getType())
921 V = Builder.CreateBitOrPointerCast(V, DestTy: I->getType());
922 I->replaceAllUsesWith(V);
923 }
924
925 // Finally, we need to reorder the instrs in the BB so that the (transitive)
926 // operands of VecInst appear before it. To see why, suppose we have
927 // vectorized the following code:
928 //
929 // ptr1 = gep a, 1
930 // load1 = load i32 ptr1
931 // ptr0 = gep a, 0
932 // load0 = load i32 ptr0
933 //
934 // We will put the vectorized load at the location of the earliest load in
935 // the BB, i.e. load1. We get:
936 //
937 // ptr1 = gep a, 1
938 // loadv = load <2 x i32> ptr0
939 // load0 = extractelement loadv, 0
940 // load1 = extractelement loadv, 1
941 // ptr0 = gep a, 0
942 //
943 // Notice that loadv uses ptr0, which is defined *after* it!
944 reorder(I: VecInst);
945 } else {
946 // Stores get sunk to the location of the last store in the chain.
947 Builder.SetInsertPoint(llvm::max_element(Range&: C, C: [](auto &A, auto &B) {
948 return A.Inst->comesBefore(B.Inst);
949 })->Inst);
950
951 // Build the vector to store.
952 Value *Vec = PoisonValue::get(T: VecTy);
953 unsigned VecIdx = 0;
954 auto InsertElem = [&](Value *V) {
955 if (V->getType() != VecElemTy)
956 V = Builder.CreateBitOrPointerCast(V, DestTy: VecElemTy);
957 Vec = Builder.CreateInsertElement(Vec, NewElt: V, Idx: Builder.getInt32(C: VecIdx++));
958 };
959 for (const ChainElem &E : C) {
960 auto I = cast<StoreInst>(Val: E.Inst);
961 if (FixedVectorType *VT =
962 dyn_cast<FixedVectorType>(Val: getLoadStoreType(I))) {
963 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
964 InsertElem(Builder.CreateExtractElement(Vec: I->getValueOperand(),
965 Idx: Builder.getInt32(C: J)));
966 }
967 } else {
968 InsertElem(I->getValueOperand());
969 }
970 }
971
972 // Chain is in offset order, so C[0] is the instr with the lowest offset,
973 // i.e. the root of the vector.
974 VecInst = Builder.CreateAlignedStore(
975 Val: Vec,
976 Ptr: getLoadStorePointerOperand(V: C[0].Inst),
977 Align: Alignment);
978 }
979
980 propagateMetadata(I: VecInst, C);
981
982 for (const ChainElem &E : C)
983 ToErase.push_back(Elt: E.Inst);
984
985 ++NumVectorInstructions;
986 NumScalarsVectorized += C.size();
987 return true;
988}
989
990template <bool IsLoadChain>
991bool Vectorizer::isSafeToMove(
992 Instruction *ChainElem, Instruction *ChainBegin,
993 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
994 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
995 << *ChainBegin << ")\n");
996
997 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
998 if (ChainElem == ChainBegin)
999 return true;
1000
1001 // Invariant loads can always be reordered; by definition they are not
1002 // clobbered by stores.
1003 if (isInvariantLoad(I: ChainElem))
1004 return true;
1005
1006 auto BBIt = std::next([&] {
1007 if constexpr (IsLoadChain)
1008 return BasicBlock::reverse_iterator(ChainElem);
1009 else
1010 return BasicBlock::iterator(ChainElem);
1011 }());
1012 auto BBItEnd = std::next([&] {
1013 if constexpr (IsLoadChain)
1014 return BasicBlock::reverse_iterator(ChainBegin);
1015 else
1016 return BasicBlock::iterator(ChainBegin);
1017 }());
1018
1019 const APInt &ChainElemOffset = ChainOffsets.at(Val: ChainElem);
1020 const unsigned ChainElemSize =
1021 DL.getTypeStoreSize(Ty: getLoadStoreType(I: ChainElem));
1022
1023 for (; BBIt != BBItEnd; ++BBIt) {
1024 Instruction *I = &*BBIt;
1025
1026 if (!I->mayReadOrWriteMemory())
1027 continue;
1028
1029 // Loads can be reordered with other loads.
1030 if (IsLoadChain && isa<LoadInst>(Val: I))
1031 continue;
1032
1033 // Stores can be sunk below invariant loads.
1034 if (!IsLoadChain && isInvariantLoad(I))
1035 continue;
1036
1037 // If I is in the chain, we can tell whether it aliases ChainIt by checking
1038 // what offset ChainIt accesses. This may be better than AA is able to do.
1039 //
1040 // We should really only have duplicate offsets for stores (the duplicate
1041 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1042 // split the chain so we don't have to handle this case specially.
1043 if (auto OffsetIt = ChainOffsets.find(Val: I); OffsetIt != ChainOffsets.end()) {
1044 // I and ChainElem overlap if:
1045 // - I and ChainElem have the same offset, OR
1046 // - I's offset is less than ChainElem's, but I touches past the
1047 // beginning of ChainElem, OR
1048 // - ChainElem's offset is less than I's, but ChainElem touches past the
1049 // beginning of I.
1050 const APInt &IOffset = OffsetIt->second;
1051 unsigned IElemSize = DL.getTypeStoreSize(Ty: getLoadStoreType(I));
1052 if (IOffset == ChainElemOffset ||
1053 (IOffset.sle(RHS: ChainElemOffset) &&
1054 (IOffset + IElemSize).sgt(RHS: ChainElemOffset)) ||
1055 (ChainElemOffset.sle(RHS: IOffset) &&
1056 (ChainElemOffset + ChainElemSize).sgt(RHS: OffsetIt->second))) {
1057 LLVM_DEBUG({
1058 // Double check that AA also sees this alias. If not, we probably
1059 // have a bug.
1060 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1061 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1062 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1063 });
1064 return false; // We found an aliasing instruction; bail.
1065 }
1066
1067 continue; // We're confident there's no alias.
1068 }
1069
1070 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1071 ModRefInfo MR = AA.getModRefInfo(I, OptLoc: MemoryLocation::get(Inst: ChainElem));
1072 if (IsLoadChain ? isModSet(MRI: MR) : isModOrRefSet(MRI: MR)) {
1073 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1074 << " Aliasing instruction:\n"
1075 << " " << *I << '\n'
1076 << " Aliased instruction and pointer:\n"
1077 << " " << *ChainElem << '\n'
1078 << " " << *getLoadStorePointerOperand(ChainElem)
1079 << '\n');
1080
1081 return false;
1082 }
1083 }
1084 return true;
1085}
1086
1087static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1088 BinaryOperator *BinOpI = cast<BinaryOperator>(Val: I);
1089 return (Signed && BinOpI->hasNoSignedWrap()) ||
1090 (!Signed && BinOpI->hasNoUnsignedWrap());
1091}
1092
1093static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1094 unsigned MatchingOpIdxA, Instruction *AddOpB,
1095 unsigned MatchingOpIdxB, bool Signed) {
1096 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1097 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1098 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1099 << ", MatchingOpIdxB=" << MatchingOpIdxB
1100 << ", Signed=" << Signed << "\n");
1101 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1102 // being the same, we can guarantee that the transformation is safe if we can
1103 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1104 // For example:
1105 // %tmp7 = add nsw i32 %tmp2, %v0
1106 // %tmp8 = sext i32 %tmp7 to i64
1107 // ...
1108 // %tmp11 = add nsw i32 %v0, 1
1109 // %tmp12 = add nsw i32 %tmp2, %tmp11
1110 // %tmp13 = sext i32 %tmp12 to i64
1111 //
1112 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1113 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1114 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1115 assert(AddOpA->getOpcode() == Instruction::Add &&
1116 AddOpB->getOpcode() == Instruction::Add &&
1117 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1118 if (AddOpA->getOperand(i: MatchingOpIdxA) ==
1119 AddOpB->getOperand(i: MatchingOpIdxB)) {
1120 Value *OtherOperandA = AddOpA->getOperand(i: MatchingOpIdxA == 1 ? 0 : 1);
1121 Value *OtherOperandB = AddOpB->getOperand(i: MatchingOpIdxB == 1 ? 0 : 1);
1122 Instruction *OtherInstrA = dyn_cast<Instruction>(Val: OtherOperandA);
1123 Instruction *OtherInstrB = dyn_cast<Instruction>(Val: OtherOperandB);
1124 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1125 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1126 checkNoWrapFlags(I: OtherInstrB, Signed) &&
1127 isa<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))) {
1128 int64_t CstVal =
1129 cast<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))->getSExtValue();
1130 if (OtherInstrB->getOperand(i: 0) == OtherOperandA &&
1131 IdxDiff.getSExtValue() == CstVal)
1132 return true;
1133 }
1134 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1135 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1136 checkNoWrapFlags(I: OtherInstrA, Signed) &&
1137 isa<ConstantInt>(Val: OtherInstrA->getOperand(i: 1))) {
1138 int64_t CstVal =
1139 cast<ConstantInt>(Val: OtherInstrA->getOperand(i: 1))->getSExtValue();
1140 if (OtherInstrA->getOperand(i: 0) == OtherOperandB &&
1141 IdxDiff.getSExtValue() == -CstVal)
1142 return true;
1143 }
1144 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1145 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1146 if (OtherInstrA && OtherInstrB &&
1147 OtherInstrA->getOpcode() == Instruction::Add &&
1148 OtherInstrB->getOpcode() == Instruction::Add &&
1149 checkNoWrapFlags(I: OtherInstrA, Signed) &&
1150 checkNoWrapFlags(I: OtherInstrB, Signed) &&
1151 isa<ConstantInt>(Val: OtherInstrA->getOperand(i: 1)) &&
1152 isa<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))) {
1153 int64_t CstValA =
1154 cast<ConstantInt>(Val: OtherInstrA->getOperand(i: 1))->getSExtValue();
1155 int64_t CstValB =
1156 cast<ConstantInt>(Val: OtherInstrB->getOperand(i: 1))->getSExtValue();
1157 if (OtherInstrA->getOperand(i: 0) == OtherInstrB->getOperand(i: 0) &&
1158 IdxDiff.getSExtValue() == (CstValB - CstValA))
1159 return true;
1160 }
1161 }
1162 return false;
1163}
1164
1165std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1166 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1167 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1168 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1169 << " Depth=" << Depth << "\n");
1170 auto *GEPA = dyn_cast<GetElementPtrInst>(Val: PtrA);
1171 auto *GEPB = dyn_cast<GetElementPtrInst>(Val: PtrB);
1172 if (!GEPA || !GEPB)
1173 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1174
1175 // Look through GEPs after checking they're the same except for the last
1176 // index.
1177 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1178 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1179 return std::nullopt;
1180 gep_type_iterator GTIA = gep_type_begin(GEP: GEPA);
1181 gep_type_iterator GTIB = gep_type_begin(GEP: GEPB);
1182 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1183 if (GTIA.getOperand() != GTIB.getOperand())
1184 return std::nullopt;
1185 ++GTIA;
1186 ++GTIB;
1187 }
1188
1189 Instruction *OpA = dyn_cast<Instruction>(Val: GTIA.getOperand());
1190 Instruction *OpB = dyn_cast<Instruction>(Val: GTIB.getOperand());
1191 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1192 OpA->getType() != OpB->getType())
1193 return std::nullopt;
1194
1195 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1196
1197 // Only look through a ZExt/SExt.
1198 if (!isa<SExtInst>(Val: OpA) && !isa<ZExtInst>(Val: OpA))
1199 return std::nullopt;
1200
1201 bool Signed = isa<SExtInst>(Val: OpA);
1202
1203 // At this point A could be a function parameter, i.e. not an instruction
1204 Value *ValA = OpA->getOperand(i: 0);
1205 OpB = dyn_cast<Instruction>(Val: OpB->getOperand(i: 0));
1206 if (!OpB || ValA->getType() != OpB->getType())
1207 return std::nullopt;
1208
1209 const SCEV *OffsetSCEVA = SE.getSCEV(V: ValA);
1210 const SCEV *OffsetSCEVB = SE.getSCEV(V: OpB);
1211 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(LHS: OffsetSCEVB, RHS: OffsetSCEVA);
1212 if (IdxDiffSCEV == SE.getCouldNotCompute())
1213 return std::nullopt;
1214
1215 ConstantRange IdxDiffRange = SE.getSignedRange(S: IdxDiffSCEV);
1216 if (!IdxDiffRange.isSingleElement())
1217 return std::nullopt;
1218 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1219
1220 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1221 << "\n");
1222
1223 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1224 bool Safe = false;
1225
1226 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1227 // ValA, we're okay.
1228 if (OpB->getOpcode() == Instruction::Add &&
1229 isa<ConstantInt>(Val: OpB->getOperand(i: 1)) &&
1230 IdxDiff.sle(RHS: cast<ConstantInt>(Val: OpB->getOperand(i: 1))->getSExtValue()) &&
1231 checkNoWrapFlags(I: OpB, Signed))
1232 Safe = true;
1233
1234 // Second attempt: check if we have eligible add NSW/NUW instruction
1235 // sequences.
1236 OpA = dyn_cast<Instruction>(Val: ValA);
1237 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1238 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(I: OpA, Signed) &&
1239 checkNoWrapFlags(I: OpB, Signed)) {
1240 // In the checks below a matching operand in OpA and OpB is an operand which
1241 // is the same in those two instructions. Below we account for possible
1242 // orders of the operands of these add instructions.
1243 for (unsigned MatchingOpIdxA : {0, 1})
1244 for (unsigned MatchingOpIdxB : {0, 1})
1245 if (!Safe)
1246 Safe = checkIfSafeAddSequence(IdxDiff, AddOpA: OpA, MatchingOpIdxA, AddOpB: OpB,
1247 MatchingOpIdxB, Signed);
1248 }
1249
1250 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1251
1252 // Third attempt:
1253 //
1254 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1255 // order bit other than the sign bit are known to be zero in ValA, we can add
1256 // Diff to it while guaranteeing no overflow of any sort.
1257 //
1258 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1259 if (!Safe) {
1260 // When computing known bits, use the GEPs as context instructions, since
1261 // they likely are in the same BB as the load/store.
1262 KnownBits Known(BitWidth);
1263 computeKnownBits(V: (IdxDiff.sge(RHS: 0) ? ValA : OpB), Known, DL, Depth: 0, AC: &AC,
1264 CxtI: ContextInst, DT: &DT);
1265 APInt BitsAllowedToBeSet = Known.Zero.zext(width: IdxDiff.getBitWidth());
1266 if (Signed)
1267 BitsAllowedToBeSet.clearBit(BitPosition: BitWidth - 1);
1268 if (BitsAllowedToBeSet.ult(RHS: IdxDiff.abs()))
1269 return std::nullopt;
1270 Safe = true;
1271 }
1272
1273 if (Safe)
1274 return IdxDiff * Stride;
1275 return std::nullopt;
1276}
1277
1278std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1279 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1280 if (Depth++ == MaxDepth)
1281 return std::nullopt;
1282
1283 if (auto *SelectA = dyn_cast<SelectInst>(Val: PtrA)) {
1284 if (auto *SelectB = dyn_cast<SelectInst>(Val: PtrB)) {
1285 if (SelectA->getCondition() != SelectB->getCondition())
1286 return std::nullopt;
1287 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1288 << ", PtrB=" << *PtrB << ", ContextInst="
1289 << *ContextInst << ", Depth=" << Depth << "\n");
1290 std::optional<APInt> TrueDiff = getConstantOffset(
1291 PtrA: SelectA->getTrueValue(), PtrB: SelectB->getTrueValue(), ContextInst, Depth);
1292 if (!TrueDiff.has_value())
1293 return std::nullopt;
1294 std::optional<APInt> FalseDiff =
1295 getConstantOffset(PtrA: SelectA->getFalseValue(), PtrB: SelectB->getFalseValue(),
1296 ContextInst, Depth);
1297 if (TrueDiff == FalseDiff)
1298 return TrueDiff;
1299 }
1300 }
1301 return std::nullopt;
1302}
1303
1304EquivalenceClassMap
1305Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1306 BasicBlock::iterator End) {
1307 EquivalenceClassMap Ret;
1308
1309 auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
1310 const Value *ObjPtr = llvm::getUnderlyingObject(V: Ptr);
1311 if (const auto *Sel = dyn_cast<SelectInst>(Val: ObjPtr)) {
1312 // The select's themselves are distinct instructions even if they share
1313 // the same condition and evaluate to consecutive pointers for true and
1314 // false values of the condition. Therefore using the select's themselves
1315 // for grouping instructions would put consecutive accesses into different
1316 // lists and they won't be even checked for being consecutive, and won't
1317 // be vectorized.
1318 return Sel->getCondition();
1319 }
1320 return ObjPtr;
1321 };
1322
1323 for (Instruction &I : make_range(x: Begin, y: End)) {
1324 auto *LI = dyn_cast<LoadInst>(Val: &I);
1325 auto *SI = dyn_cast<StoreInst>(Val: &I);
1326 if (!LI && !SI)
1327 continue;
1328
1329 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1330 continue;
1331
1332 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1333 (SI && !TTI.isLegalToVectorizeStore(SI)))
1334 continue;
1335
1336 Type *Ty = getLoadStoreType(I: &I);
1337 if (!VectorType::isValidElementType(ElemTy: Ty->getScalarType()))
1338 continue;
1339
1340 // Skip weird non-byte sizes. They probably aren't worth the effort of
1341 // handling correctly.
1342 unsigned TySize = DL.getTypeSizeInBits(Ty);
1343 if ((TySize % 8) != 0)
1344 continue;
1345
1346 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1347 // functions are currently using an integer type for the vectorized
1348 // load/store, and does not support casting between the integer type and a
1349 // vector of pointers (e.g. i64 to <2 x i16*>)
1350 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1351 continue;
1352
1353 Value *Ptr = getLoadStorePointerOperand(V: &I);
1354 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1355 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AddrSpace: AS);
1356
1357 unsigned VF = VecRegSize / TySize;
1358 VectorType *VecTy = dyn_cast<VectorType>(Val: Ty);
1359
1360 // Only handle power-of-two sized elements.
1361 if ((!VecTy && !isPowerOf2_32(Value: DL.getTypeSizeInBits(Ty))) ||
1362 (VecTy && !isPowerOf2_32(Value: DL.getTypeSizeInBits(Ty: VecTy->getScalarType()))))
1363 continue;
1364
1365 // No point in looking at these if they're too big to vectorize.
1366 if (TySize > VecRegSize / 2 ||
1367 (VecTy && TTI.getLoadVectorFactor(VF, LoadSize: TySize, ChainSizeInBytes: TySize / 8, VecTy) == 0))
1368 continue;
1369
1370 Ret[{getUnderlyingObject(Ptr), AS,
1371 DL.getTypeSizeInBits(Ty: getLoadStoreType(I: &I)->getScalarType()),
1372 /*IsLoad=*/LI != nullptr}]
1373 .push_back(Elt: &I);
1374 }
1375
1376 return Ret;
1377}
1378
1379std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1380 if (Instrs.empty())
1381 return {};
1382
1383 unsigned AS = getLoadStoreAddressSpace(I: Instrs[0]);
1384 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1385
1386#ifndef NDEBUG
1387 // Check that Instrs is in BB order and all have the same addr space.
1388 for (size_t I = 1; I < Instrs.size(); ++I) {
1389 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1390 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1391 }
1392#endif
1393
1394 // Machinery to build an MRU-hashtable of Chains.
1395 //
1396 // (Ideally this could be done with MapVector, but as currently implemented,
1397 // moving an element to the front of a MapVector is O(n).)
1398 struct InstrListElem : ilist_node<InstrListElem>,
1399 std::pair<Instruction *, Chain> {
1400 explicit InstrListElem(Instruction *I)
1401 : std::pair<Instruction *, Chain>(I, {}) {}
1402 };
1403 struct InstrListElemDenseMapInfo {
1404 using PtrInfo = DenseMapInfo<InstrListElem *>;
1405 using IInfo = DenseMapInfo<Instruction *>;
1406 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1407 static InstrListElem *getTombstoneKey() {
1408 return PtrInfo::getTombstoneKey();
1409 }
1410 static unsigned getHashValue(const InstrListElem *E) {
1411 return IInfo::getHashValue(PtrVal: E->first);
1412 }
1413 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1414 if (A == getEmptyKey() || B == getEmptyKey())
1415 return A == getEmptyKey() && B == getEmptyKey();
1416 if (A == getTombstoneKey() || B == getTombstoneKey())
1417 return A == getTombstoneKey() && B == getTombstoneKey();
1418 return IInfo::isEqual(LHS: A->first, RHS: B->first);
1419 }
1420 };
1421 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1422 simple_ilist<InstrListElem> MRU;
1423 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1424
1425 // Compare each instruction in `instrs` to leader of the N most recently-used
1426 // chains. This limits the O(n^2) behavior of this pass while also allowing
1427 // us to build arbitrarily long chains.
1428 for (Instruction *I : Instrs) {
1429 constexpr int MaxChainsToTry = 64;
1430
1431 bool MatchFound = false;
1432 auto ChainIter = MRU.begin();
1433 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1434 ++J, ++ChainIter) {
1435 std::optional<APInt> Offset = getConstantOffset(
1436 PtrA: getLoadStorePointerOperand(V: ChainIter->first),
1437 PtrB: getLoadStorePointerOperand(V: I),
1438 /*ContextInst=*/
1439 (ChainIter->first->comesBefore(Other: I) ? I : ChainIter->first));
1440 if (Offset.has_value()) {
1441 // `Offset` might not have the expected number of bits, if e.g. AS has a
1442 // different number of bits than opaque pointers.
1443 ChainIter->second.push_back(Elt: ChainElem{.Inst: I, .OffsetFromLeader: Offset.value()});
1444 // Move ChainIter to the front of the MRU list.
1445 MRU.remove(N&: *ChainIter);
1446 MRU.push_front(Node&: *ChainIter);
1447 MatchFound = true;
1448 break;
1449 }
1450 }
1451
1452 if (!MatchFound) {
1453 APInt ZeroOffset(ASPtrBits, 0);
1454 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1455 E->second.push_back(Elt: ChainElem{.Inst: I, .OffsetFromLeader: ZeroOffset});
1456 MRU.push_front(Node&: *E);
1457 Chains.insert(V: E);
1458 }
1459 }
1460
1461 std::vector<Chain> Ret;
1462 Ret.reserve(n: Chains.size());
1463 // Iterate over MRU rather than Chains so the order is deterministic.
1464 for (auto &E : MRU)
1465 if (E.second.size() > 1)
1466 Ret.push_back(x: std::move(E.second));
1467 return Ret;
1468}
1469
1470std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1471 Instruction *ContextInst,
1472 unsigned Depth) {
1473 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1474 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1475 << ", Depth=" << Depth << "\n");
1476 // We'll ultimately return a value of this bit width, even if computations
1477 // happen in a different width.
1478 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(Ty: PtrA->getType());
1479 APInt OffsetA(OrigBitWidth, 0);
1480 APInt OffsetB(OrigBitWidth, 0);
1481 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA);
1482 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetB);
1483 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(Ty: PtrA->getType());
1484 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(Ty: PtrB->getType()))
1485 return std::nullopt;
1486
1487 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1488 // should properly handle a possible overflow and the value should fit into
1489 // the smallest data type used in the cast/gep chain.
1490 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1491 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1492
1493 OffsetA = OffsetA.sextOrTrunc(width: NewPtrBitWidth);
1494 OffsetB = OffsetB.sextOrTrunc(width: NewPtrBitWidth);
1495 if (PtrA == PtrB)
1496 return (OffsetB - OffsetA).sextOrTrunc(width: OrigBitWidth);
1497
1498 // Try to compute B - A.
1499 const SCEV *DistScev = SE.getMinusSCEV(LHS: SE.getSCEV(V: PtrB), RHS: SE.getSCEV(V: PtrA));
1500 if (DistScev != SE.getCouldNotCompute()) {
1501 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1502 ConstantRange DistRange = SE.getSignedRange(S: DistScev);
1503 if (DistRange.isSingleElement()) {
1504 // Handle index width (the width of Dist) != pointer width (the width of
1505 // the Offset*s at this point).
1506 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(width: NewPtrBitWidth);
1507 return (OffsetB - OffsetA + Dist).sextOrTrunc(width: OrigBitWidth);
1508 }
1509 }
1510 std::optional<APInt> Diff =
1511 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
1512 if (Diff.has_value())
1513 return (OffsetB - OffsetA + Diff->sext(width: OffsetB.getBitWidth()))
1514 .sextOrTrunc(width: OrigBitWidth);
1515 return std::nullopt;
1516}
1517