1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
59
60#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
65#include "llvm/ADT/PostOrderIterator.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
68#include "llvm/ADT/SmallPtrSet.h"
69#include "llvm/ADT/SmallVector.h"
70#include "llvm/ADT/Statistic.h"
71#include "llvm/ADT/iterator_range.h"
72#include "llvm/Analysis/AliasAnalysis.h"
73#include "llvm/Analysis/AssumptionCache.h"
74#include "llvm/Analysis/MemoryLocation.h"
75#include "llvm/Analysis/ScalarEvolution.h"
76#include "llvm/Analysis/TargetTransformInfo.h"
77#include "llvm/Analysis/ValueTracking.h"
78#include "llvm/Analysis/VectorUtils.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
81#include "llvm/IR/ConstantRange.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
84#include "llvm/IR/DerivedTypes.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
87#include "llvm/IR/GetElementPtrTypeIterator.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
91#include "llvm/IR/Instructions.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
96#include "llvm/InitializePasses.h"
97#include "llvm/Pass.h"
98#include "llvm/Support/Alignment.h"
99#include "llvm/Support/Casting.h"
100#include "llvm/Support/Debug.h"
101#include "llvm/Support/KnownBits.h"
102#include "llvm/Support/MathExtras.h"
103#include "llvm/Support/ModRef.h"
104#include "llvm/Support/raw_ostream.h"
105#include "llvm/Transforms/Utils/Local.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from the this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, Comp: [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, Comp: [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
184 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
189Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190 SmallVector<Value *, 8> Values;
191 for (const ChainElem &E : C)
192 Values.emplace_back(Args: E.Inst);
193 return propagateMetadata(I, VL: Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(Val: I);
198 return LI != nullptr && LI->hasMetadata(KindID: LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205 SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.emplace_back(Args&: I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(Val: IW->getOperand(i: Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(Other: I)) {
224 InstructionsToMove.insert(Ptr: IM);
225 Worklist.emplace_back(Args&: IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(Ptr: IM))
234 continue;
235 IM->moveBefore(InsertPos: I->getIterator());
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 /// We could erase instrs right after vectorizing them, but that can mess up
250 /// our BB iterators, and also can make the equivalence class keys point to
251 /// freed memory. This is fixable, but it's simpler just to wait until we're
252 /// done with the BB and erase all at once.
253 SmallVector<Instruction *, 128> ToErase;
254
255 /// We insert load/store instructions and GEPs to fill gaps and extend chains
256 /// to enable vectorization. Keep track and delete them later.
257 DenseSet<Instruction *> ExtraElements;
258
259public:
260 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
261 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
262 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
263 DL(F.getDataLayout()), Builder(SE.getContext()) {}
264
265 bool run();
266
267private:
268 static const unsigned MaxDepth = 3;
269
270 /// Runs the vectorizer on a "pseudo basic block", which is a range of
271 /// instructions [Begin, End) within one BB all of which have
272 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
273 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
274
275 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
276 /// in the same BB with the same value for getUnderlyingObject() etc.
277 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
278 ArrayRef<Instruction *> EqClass);
279
280 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
281 /// where all instructions access a known, constant offset from the first
282 /// instruction.
283 bool runOnChain(Chain &C);
284
285 /// Splits the chain into subchains of instructions which read/write a
286 /// contiguous block of memory. Discards any length-1 subchains (because
287 /// there's nothing to vectorize in there). Also attempts to fill gaps with
288 /// "extra" elements to artificially make chains contiguous in some cases.
289 std::vector<Chain> splitChainByContiguity(Chain &C);
290
291 /// Splits the chain into subchains where it's safe to hoist loads up to the
292 /// beginning of the sub-chain and it's safe to sink loads up to the end of
293 /// the sub-chain. Discards any length-1 subchains. Also attempts to extend
294 /// non-power-of-two chains by adding "extra" elements in some cases.
295 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
296
297 /// Splits the chain into subchains that make legal, aligned accesses.
298 /// Discards any length-1 subchains.
299 std::vector<Chain> splitChainByAlignment(Chain &C);
300
301 /// Converts the instrs in the chain into a single vectorized load or store.
302 /// Adds the old scalar loads/stores to ToErase.
303 bool vectorizeChain(Chain &C);
304
305 /// Tries to compute the offset in bytes PtrB - PtrA.
306 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth = 0);
309 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
310 Instruction *ContextInst,
311 unsigned Depth);
312 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
313 Instruction *ContextInst,
314 unsigned Depth);
315
316 /// Gets the element type of the vector that the chain will load or store.
317 /// This is nontrivial because the chain may contain elements of different
318 /// types; e.g. it's legal to have a chain that contains both i32 and float.
319 Type *getChainElemTy(const Chain &C);
320
321 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
322 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
323 /// instructions.
324 ///
325 /// The map ChainElemOffsets must contain all of the elements in
326 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
327 /// address. It's ok if it contains additional entries.
328 template <bool IsLoadChain>
329 bool isSafeToMove(
330 Instruction *ChainElem, Instruction *ChainBegin,
331 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
332 BatchAAResults &BatchAA);
333
334 /// Merges the equivalence classes if they have underlying objects that differ
335 /// by one level of indirection (i.e., one is a getelementptr and the other is
336 /// the base pointer in that getelementptr).
337 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
338
339 /// Collects loads and stores grouped by "equivalence class", where:
340 /// - all elements in an eq class are a load or all are a store,
341 /// - they all load/store the same element size (it's OK to have e.g. i8 and
342 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
343 /// - they all have the same value for getUnderlyingObject().
344 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
345 BasicBlock::iterator End);
346
347 /// Partitions Instrs into "chains" where every instruction has a known
348 /// constant offset from the first instr in the chain.
349 ///
350 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
351 /// in the chain is the leader, and an instr touches distance 0 from itself.
352 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
353
354 /// Checks if a potential vector load/store with a given alignment is allowed
355 /// and fast. Aligned accesses are always allowed and fast, while misaligned
356 /// accesses depend on TTI checks to determine whether they can and should be
357 /// vectorized or kept as element-wise accesses.
358 bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
359 unsigned VecElemBits) const;
360
361 /// Create a new GEP and a new Load/Store instruction such that the GEP
362 /// is pointing at PrevElem + Offset. In the case of stores, store poison.
363 /// Extra elements will either be combined into a masked load/store or
364 /// deleted before the end of the pass.
365 ChainElem createExtraElementAfter(const ChainElem &PrevElem, Type *Ty,
366 APInt Offset, StringRef Prefix,
367 Align Alignment = Align());
368
369 /// Create a mask that masks off the extra elements in the chain, to be used
370 /// for the creation of a masked load/store vector.
371 Value *createMaskForExtraElements(const ArrayRef<ChainElem> C,
372 FixedVectorType *VecTy);
373
374 /// Delete dead GEPs and extra Load/Store instructions created by
375 /// createExtraElementAfter
376 void deleteExtraElements();
377};
378
379class LoadStoreVectorizerLegacyPass : public FunctionPass {
380public:
381 static char ID;
382
383 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
384 initializeLoadStoreVectorizerLegacyPassPass(
385 *PassRegistry::getPassRegistry());
386 }
387
388 bool runOnFunction(Function &F) override;
389
390 StringRef getPassName() const override {
391 return "GPU Load and Store Vectorizer";
392 }
393
394 void getAnalysisUsage(AnalysisUsage &AU) const override {
395 AU.addRequired<AAResultsWrapperPass>();
396 AU.addRequired<AssumptionCacheTracker>();
397 AU.addRequired<ScalarEvolutionWrapperPass>();
398 AU.addRequired<DominatorTreeWrapperPass>();
399 AU.addRequired<TargetTransformInfoWrapperPass>();
400 AU.setPreservesCFG();
401 }
402};
403
404} // end anonymous namespace
405
406char LoadStoreVectorizerLegacyPass::ID = 0;
407
408INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
409 "Vectorize load and Store instructions", false, false)
410INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
411INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
412INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
413INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
414INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
415INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
416INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
417 "Vectorize load and store instructions", false, false)
418
419Pass *llvm::createLoadStoreVectorizerPass() {
420 return new LoadStoreVectorizerLegacyPass();
421}
422
423bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
424 // Don't vectorize when the attribute NoImplicitFloat is used.
425 if (skipFunction(F) || F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
426 return false;
427
428 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
429 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
430 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
431 TargetTransformInfo &TTI =
432 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
433
434 AssumptionCache &AC =
435 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
436
437 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
438}
439
440PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
441 FunctionAnalysisManager &AM) {
442 // Don't vectorize when the attribute NoImplicitFloat is used.
443 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
444 return PreservedAnalyses::all();
445
446 AliasAnalysis &AA = AM.getResult<AAManager>(IR&: F);
447 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
448 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
449 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
450 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
451
452 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
453 PreservedAnalyses PA;
454 PA.preserveSet<CFGAnalyses>();
455 return Changed ? PA : PreservedAnalyses::all();
456}
457
458bool Vectorizer::run() {
459 bool Changed = false;
460 // Break up the BB if there are any instrs which aren't guaranteed to transfer
461 // execution to their successor.
462 //
463 // Consider, for example:
464 //
465 // def assert_arr_len(int n) { if (n < 2) exit(); }
466 //
467 // load arr[0]
468 // call assert_array_len(arr.length)
469 // load arr[1]
470 //
471 // Even though assert_arr_len does not read or write any memory, we can't
472 // speculate the second load before the call. More info at
473 // https://github.com/llvm/llvm-project/issues/52950.
474 for (BasicBlock *BB : post_order(G: &F)) {
475 // BB must at least have a terminator.
476 assert(!BB->empty());
477
478 SmallVector<BasicBlock::iterator, 8> Barriers;
479 Barriers.emplace_back(Args: BB->begin());
480 for (Instruction &I : *BB)
481 if (!isGuaranteedToTransferExecutionToSuccessor(I: &I))
482 Barriers.emplace_back(Args: I.getIterator());
483 Barriers.emplace_back(Args: BB->end());
484
485 for (auto It = Barriers.begin(), End = std::prev(x: Barriers.end()); It != End;
486 ++It)
487 Changed |= runOnPseudoBB(Begin: *It, End: *std::next(x: It));
488
489 for (Instruction *I : ToErase) {
490 // These will get deleted in deleteExtraElements.
491 // This is because ExtraElements will include both extra elements
492 // that *were* vectorized and extra elements that *were not*
493 // vectorized. ToErase will only include extra elements that *were*
494 // vectorized, so in order to avoid double deletion we skip them here and
495 // handle them in deleteExtraElements.
496 if (ExtraElements.contains(V: I))
497 continue;
498 auto *PtrOperand = getLoadStorePointerOperand(V: I);
499 if (I->use_empty())
500 I->eraseFromParent();
501 RecursivelyDeleteTriviallyDeadInstructions(V: PtrOperand);
502 }
503 ToErase.clear();
504 deleteExtraElements();
505 }
506
507 return Changed;
508}
509
510bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
511 BasicBlock::iterator End) {
512 LLVM_DEBUG({
513 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
514 if (End != Begin->getParent()->end())
515 dbgs() << *End;
516 else
517 dbgs() << "<BB end>";
518 dbgs() << ")\n";
519 });
520
521 bool Changed = false;
522 for (const auto &[EqClassKey, EqClass] :
523 collectEquivalenceClasses(Begin, End))
524 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
525
526 return Changed;
527}
528
529bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
530 ArrayRef<Instruction *> EqClass) {
531 bool Changed = false;
532
533 LLVM_DEBUG({
534 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
535 << " keyed on " << EqClassKey << ":\n";
536 for (Instruction *I : EqClass)
537 dbgs() << " " << *I << "\n";
538 });
539
540 std::vector<Chain> Chains = gatherChains(Instrs: EqClass);
541 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
542 << " nontrivial chains.\n";);
543 for (Chain &C : Chains)
544 Changed |= runOnChain(C);
545 return Changed;
546}
547
548bool Vectorizer::runOnChain(Chain &C) {
549 LLVM_DEBUG({
550 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
551 dumpChain(C);
552 });
553
554 // Split up the chain into increasingly smaller chains, until we can finally
555 // vectorize the chains.
556 //
557 // (Don't be scared by the depth of the loop nest here. These operations are
558 // all at worst O(n lg n) in the number of instructions, and splitting chains
559 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
560 bool Changed = false;
561 for (auto &C : splitChainByMayAliasInstrs(C))
562 for (auto &C : splitChainByContiguity(C))
563 for (auto &C : splitChainByAlignment(C))
564 Changed |= vectorizeChain(C);
565 return Changed;
566}
567
568std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
569 if (C.empty())
570 return {};
571
572 sortChainInBBOrder(C);
573
574 LLVM_DEBUG({
575 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
576 dumpChain(C);
577 });
578
579 // We know that elements in the chain with nonverlapping offsets can't
580 // alias, but AA may not be smart enough to figure this out. Use a
581 // hashtable.
582 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
583 for (const auto &E : C)
584 ChainOffsets.insert(KV: {&*E.Inst, E.OffsetFromLeader});
585
586 // Across a single invocation of this function the IR is not changing, so
587 // using a batched Alias Analysis is safe and can reduce compile time.
588 BatchAAResults BatchAA(AA);
589
590 // Loads get hoisted up to the first load in the chain. Stores get sunk
591 // down to the last store in the chain. Our algorithm for loads is:
592 //
593 // - Take the first element of the chain. This is the start of a new chain.
594 // - Take the next element of `Chain` and check for may-alias instructions
595 // up to the start of NewChain. If no may-alias instrs, add it to
596 // NewChain. Otherwise, start a new NewChain.
597 //
598 // For stores it's the same except in the reverse direction.
599 //
600 // We expect IsLoad to be an std::bool_constant.
601 auto Impl = [&](auto IsLoad) {
602 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
603 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
604 if constexpr (IsLoad())
605 return std::make_pair(x: C.begin(), y: C.end());
606 else
607 return std::make_pair(x: C.rbegin(), y: C.rend());
608 }(IsLoad);
609 assert(ChainBegin != ChainEnd);
610
611 std::vector<Chain> Chains;
612 SmallVector<ChainElem, 1> NewChain;
613 NewChain.emplace_back(*ChainBegin);
614 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
615 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
616 ChainOffsets, BatchAA)) {
617 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
618 << *ChainIt->Inst << " into " << *ChainBegin->Inst
619 << "\n");
620 NewChain.emplace_back(*ChainIt);
621 } else {
622 LLVM_DEBUG(
623 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
624 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
625 if (NewChain.size() > 1) {
626 LLVM_DEBUG({
627 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
628 dumpChain(NewChain);
629 });
630 Chains.emplace_back(args: std::move(NewChain));
631 }
632
633 // Start a new chain.
634 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
635 }
636 }
637 if (NewChain.size() > 1) {
638 LLVM_DEBUG({
639 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
640 dumpChain(NewChain);
641 });
642 Chains.emplace_back(args: std::move(NewChain));
643 }
644 return Chains;
645 };
646
647 if (isa<LoadInst>(Val: C[0].Inst))
648 return Impl(/*IsLoad=*/std::bool_constant<true>());
649
650 assert(isa<StoreInst>(C[0].Inst));
651 return Impl(/*IsLoad=*/std::bool_constant<false>());
652}
653
654std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
655 if (C.empty())
656 return {};
657
658 sortChainInOffsetOrder(C);
659
660 LLVM_DEBUG({
661 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
662 dumpChain(C);
663 });
664
665 // If the chain is not contiguous, we try to fill the gap with "extra"
666 // elements to artificially make it contiguous, to try to enable
667 // vectorization. We only fill gaps if there is potential to end up with a
668 // legal masked load/store given the target, address space, and element type.
669 // At this point, when querying the TTI, optimistically assume max alignment
670 // and max vector size, as splitChainByAlignment will ensure the final vector
671 // shape passes the legalization check.
672 unsigned AS = getLoadStoreAddressSpace(I: C[0].Inst);
673 Type *ElementType = getLoadStoreType(I: C[0].Inst)->getScalarType();
674 unsigned MaxVecRegBits = TTI.getLoadStoreVecRegBitWidth(AddrSpace: AS);
675 Align OptimisticAlign = Align(MaxVecRegBits / 8);
676 unsigned int MaxVectorNumElems =
677 MaxVecRegBits / DL.getTypeSizeInBits(Ty: ElementType);
678 // Note: This check decides whether to try to fill gaps based on the masked
679 // legality of the target's maximum vector size (getLoadStoreVecRegBitWidth).
680 // If a target *does not* support a masked load/store with this max vector
681 // size, but *does* support a masked load/store with a *smaller* vector size,
682 // that optimization will be missed. This does not occur in any of the targets
683 // that currently support this API.
684 FixedVectorType *OptimisticVectorType =
685 FixedVectorType::get(ElementType, NumElts: MaxVectorNumElems);
686 bool TryFillGaps =
687 isa<LoadInst>(Val: C[0].Inst)
688 ? TTI.isLegalMaskedLoad(DataType: OptimisticVectorType, Alignment: OptimisticAlign, AddressSpace: AS,
689 MaskKind: TTI::MaskKind::ConstantMask)
690 : TTI.isLegalMaskedStore(DataType: OptimisticVectorType, Alignment: OptimisticAlign, AddressSpace: AS,
691 MaskKind: TTI::MaskKind::ConstantMask);
692
693 // Cache the best aligned element in the chain for use when creating extra
694 // elements.
695 Align BestAlignedElemAlign = getLoadStoreAlignment(I: C[0].Inst);
696 APInt OffsetOfBestAlignedElemFromLeader = C[0].OffsetFromLeader;
697 for (const auto &E : C) {
698 Align ElementAlignment = getLoadStoreAlignment(I: E.Inst);
699 if (ElementAlignment > BestAlignedElemAlign) {
700 BestAlignedElemAlign = ElementAlignment;
701 OffsetOfBestAlignedElemFromLeader = E.OffsetFromLeader;
702 }
703 }
704
705 auto DeriveAlignFromBestAlignedElem = [&](APInt NewElemOffsetFromLeader) {
706 return commonAlignment(
707 A: BestAlignedElemAlign,
708 Offset: (NewElemOffsetFromLeader - OffsetOfBestAlignedElemFromLeader)
709 .abs()
710 .getLimitedValue());
711 };
712
713 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
714
715 std::vector<Chain> Ret;
716 Ret.push_back(x: {C.front()});
717
718 unsigned ChainElemTyBits = DL.getTypeSizeInBits(Ty: getChainElemTy(C));
719 ChainElem &Prev = C[0];
720 for (auto It = std::next(x: C.begin()), End = C.end(); It != End; ++It) {
721 auto &CurChain = Ret.back();
722
723 APInt PrevSzBytes =
724 APInt(ASPtrBits, DL.getTypeStoreSize(Ty: getLoadStoreType(I: Prev.Inst)));
725 APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes;
726 unsigned SzBytes = DL.getTypeStoreSize(Ty: getLoadStoreType(I: It->Inst));
727
728 // Add this instruction to the end of the current chain, or start a new one.
729 assert(
730 8 * SzBytes % ChainElemTyBits == 0 &&
731 "Every chain-element size must be a multiple of the element size after "
732 "vectorization.");
733 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
734 // Allow redundancy: partial or full overlap counts as contiguous.
735 bool AreContiguous = false;
736 if (It->OffsetFromLeader.sle(RHS: PrevReadEnd)) {
737 // Check overlap is a multiple of the element size after vectorization.
738 uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
739 if (8 * Overlap % ChainElemTyBits == 0)
740 AreContiguous = true;
741 }
742
743 LLVM_DEBUG(dbgs() << "LSV: Instruction is "
744 << (AreContiguous ? "contiguous" : "chain-breaker")
745 << *It->Inst << " (starts at offset "
746 << It->OffsetFromLeader << ")\n");
747
748 // If the chain is not contiguous, try to fill in gaps between Prev and
749 // Curr. For now, we aren't filling gaps between load/stores of different
750 // sizes. Additionally, as a conservative heuristic, we only fill gaps of
751 // 1-2 elements. Generating loads/stores with too many unused bytes has a
752 // side effect of increasing register pressure (on NVIDIA targets at least),
753 // which could cancel out the benefits of reducing number of load/stores.
754 bool GapFilled = false;
755 if (!AreContiguous && TryFillGaps && PrevSzBytes == SzBytes) {
756 APInt GapSzBytes = It->OffsetFromLeader - PrevReadEnd;
757 if (GapSzBytes == PrevSzBytes) {
758 // There is a single gap between Prev and Curr, create one extra element
759 ChainElem NewElem = createExtraElementAfter(
760 PrevElem: Prev, Ty: getLoadStoreType(I: Prev.Inst), Offset: PrevSzBytes, Prefix: "GapFill",
761 Alignment: DeriveAlignFromBestAlignedElem(PrevReadEnd));
762 CurChain.push_back(Elt: NewElem);
763 GapFilled = true;
764 }
765 // There are two gaps between Prev and Curr, only create two extra
766 // elements if Prev is the first element in a sequence of four.
767 // This has the highest chance of resulting in a beneficial vectorization.
768 if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) {
769 ChainElem NewElem1 = createExtraElementAfter(
770 PrevElem: Prev, Ty: getLoadStoreType(I: Prev.Inst), Offset: PrevSzBytes, Prefix: "GapFill",
771 Alignment: DeriveAlignFromBestAlignedElem(PrevReadEnd));
772 ChainElem NewElem2 = createExtraElementAfter(
773 PrevElem: NewElem1, Ty: getLoadStoreType(I: Prev.Inst), Offset: PrevSzBytes, Prefix: "GapFill",
774 Alignment: DeriveAlignFromBestAlignedElem(PrevReadEnd + PrevSzBytes));
775 CurChain.push_back(Elt: NewElem1);
776 CurChain.push_back(Elt: NewElem2);
777 GapFilled = true;
778 }
779 }
780
781 if (AreContiguous || GapFilled)
782 CurChain.push_back(Elt: *It);
783 else
784 Ret.push_back(x: {*It});
785 // In certain cases when handling redundant elements with partial overlaps,
786 // the previous element may still extend beyond the current element. Only
787 // update Prev if the current element is the new end of the chain.
788 if (ReadEnd.sge(RHS: PrevReadEnd))
789 Prev = *It;
790 }
791
792 // Filter out length-1 chains, these are uninteresting.
793 llvm::erase_if(C&: Ret, P: [](const auto &Chain) { return Chain.size() <= 1; });
794 return Ret;
795}
796
797Type *Vectorizer::getChainElemTy(const Chain &C) {
798 assert(!C.empty());
799 // The rules are:
800 // - If there are any pointer types in the chain, use an integer type.
801 // - Prefer an integer type if it appears in the chain.
802 // - Otherwise, use the first type in the chain.
803 //
804 // The rule about pointer types is a simplification when we merge e.g. a load
805 // of a ptr and a double. There's no direct conversion from a ptr to a
806 // double; it requires a ptrtoint followed by a bitcast.
807 //
808 // It's unclear to me if the other rules have any practical effect, but we do
809 // it to match this pass's previous behavior.
810 if (any_of(Range: C, P: [](const ChainElem &E) {
811 return getLoadStoreType(I: E.Inst)->getScalarType()->isPointerTy();
812 })) {
813 return Type::getIntNTy(
814 C&: F.getContext(),
815 N: DL.getTypeSizeInBits(Ty: getLoadStoreType(I: C[0].Inst)->getScalarType()));
816 }
817
818 for (const ChainElem &E : C)
819 if (Type *T = getLoadStoreType(I: E.Inst)->getScalarType(); T->isIntegerTy())
820 return T;
821 return getLoadStoreType(I: C[0].Inst)->getScalarType();
822}
823
824std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
825 // We use a simple greedy algorithm.
826 // - Given a chain of length N, find all prefixes that
827 // (a) are not longer than the max register length, and
828 // (b) are a power of 2.
829 // - Starting from the longest prefix, try to create a vector of that length.
830 // - If one of them works, great. Repeat the algorithm on any remaining
831 // elements in the chain.
832 // - If none of them work, discard the first element and repeat on a chain
833 // of length N-1.
834 if (C.empty())
835 return {};
836
837 sortChainInOffsetOrder(C);
838
839 LLVM_DEBUG({
840 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
841 dumpChain(C);
842 });
843
844 bool IsLoadChain = isa<LoadInst>(Val: C[0].Inst);
845 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
846 unsigned ChainSizeBytes, VectorType *VecTy) {
847 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadSize: LoadStoreSize,
848 ChainSizeInBytes: ChainSizeBytes, VecTy)
849 : TTI.getStoreVectorFactor(VF, StoreSize: LoadStoreSize,
850 ChainSizeInBytes: ChainSizeBytes, VecTy);
851 };
852
853#ifndef NDEBUG
854 for (const auto &E : C) {
855 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
856 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
857 "Should have filtered out non-power-of-two elements in "
858 "collectEquivalenceClasses.");
859 }
860#endif
861
862 unsigned AS = getLoadStoreAddressSpace(I: C[0].Inst);
863 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AddrSpace: AS) / 8;
864
865 // For compile time reasons, we cache whether or not the superset
866 // of all candidate chains contains any extra loads/stores from earlier gap
867 // filling.
868 bool CandidateChainsMayContainExtraLoadsStores = any_of(
869 Range&: C, P: [this](const ChainElem &E) { return ExtraElements.contains(V: E.Inst); });
870
871 std::vector<Chain> Ret;
872 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
873 // Find candidate chains of size not greater than the largest vector reg.
874 // These chains are over the closed interval [CBegin, CEnd].
875 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
876 CandidateChains;
877 // Need to compute the size of every candidate chain from its beginning
878 // because of possible overlapping among chain elements.
879 unsigned Sz = DL.getTypeStoreSize(Ty: getLoadStoreType(I: C[CBegin].Inst));
880 APInt PrevReadEnd = C[CBegin].OffsetFromLeader + Sz;
881 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
882 APInt ReadEnd = C[CEnd].OffsetFromLeader +
883 DL.getTypeStoreSize(Ty: getLoadStoreType(I: C[CEnd].Inst));
884 unsigned BytesAdded =
885 PrevReadEnd.sle(RHS: ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
886 Sz += BytesAdded;
887 if (Sz > VecRegBytes)
888 break;
889 CandidateChains.emplace_back(Args&: CEnd, Args&: Sz);
890 PrevReadEnd = APIntOps::smax(A: PrevReadEnd, B: ReadEnd);
891 }
892
893 // Consider the longest chain first.
894 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
895 It != End; ++It) {
896 auto [CEnd, SizeBytes] = *It;
897 LLVM_DEBUG(
898 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
899 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
900
901 Type *VecElemTy = getChainElemTy(C);
902 // Note, VecElemTy is a power of 2, but might be less than one byte. For
903 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
904 // VecElemTy would be i4.
905 unsigned VecElemBits = DL.getTypeSizeInBits(Ty: VecElemTy);
906
907 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
908 assert((8 * SizeBytes) % VecElemBits == 0);
909 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
910 FixedVectorType *VecTy = FixedVectorType::get(ElementType: VecElemTy, NumElts: NumVecElems);
911 unsigned VF = 8 * VecRegBytes / VecElemBits;
912
913 // Check that TTI is happy with this vectorization factor.
914 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
915 VecElemBits * NumVecElems / 8, VecTy);
916 if (TargetVF != VF && TargetVF < NumVecElems) {
917 LLVM_DEBUG(
918 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
919 "because TargetVF="
920 << TargetVF << " != VF=" << VF
921 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
922 continue;
923 }
924
925 // If we're loading/storing from an alloca, align it if possible.
926 //
927 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
928 // tells us this is beneficial. This feels a bit odd, but it matches
929 // existing tests. This isn't *so* bad, because at most we align to 4
930 // bytes (current value of StackAdjustedAlignment).
931 //
932 // FIXME: We will upgrade the alignment of the alloca even if it turns out
933 // we can't vectorize for some other reason.
934 Value *PtrOperand = getLoadStorePointerOperand(V: C[CBegin].Inst);
935 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
936 isa<AllocaInst>(Val: PtrOperand->stripPointerCasts());
937 Align Alignment = getLoadStoreAlignment(I: C[CBegin].Inst);
938 Align PrefAlign = Align(StackAdjustedAlignment);
939 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
940 accessIsAllowedAndFast(SizeBytes, AS, Alignment: PrefAlign, VecElemBits)) {
941 Align NewAlign = getOrEnforceKnownAlignment(
942 V: PtrOperand, PrefAlign, DL, CxtI: C[CBegin].Inst, AC: nullptr, DT: &DT);
943 if (NewAlign >= Alignment) {
944 LLVM_DEBUG(dbgs()
945 << "LSV: splitByChain upgrading alloca alignment from "
946 << Alignment.value() << " to " << NewAlign.value()
947 << "\n");
948 Alignment = NewAlign;
949 }
950 }
951
952 Chain ExtendingLoadsStores;
953 if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) {
954 // If we have a non-power-of-2 element count, attempt to extend the
955 // chain to the next power-of-2 if it makes the access allowed and
956 // fast.
957 bool AllowedAndFast = false;
958 if (NumVecElems < TargetVF && !isPowerOf2_32(Value: NumVecElems) &&
959 VecElemBits >= 8) {
960 // TargetVF may be a lot higher than NumVecElems,
961 // so only extend to the next power of 2.
962 assert(VecElemBits % 8 == 0);
963 unsigned VecElemBytes = VecElemBits / 8;
964 unsigned NewNumVecElems = PowerOf2Ceil(A: NumVecElems);
965 unsigned NewSizeBytes = VecElemBytes * NewNumVecElems;
966
967 assert(isPowerOf2_32(TargetVF) &&
968 "TargetVF expected to be a power of 2");
969 assert(NewNumVecElems <= TargetVF &&
970 "Should not extend past TargetVF");
971
972 LLVM_DEBUG(dbgs()
973 << "LSV: attempting to extend chain of " << NumVecElems
974 << " " << (IsLoadChain ? "loads" : "stores") << " to "
975 << NewNumVecElems << " elements\n");
976 bool IsLegalToExtend =
977 IsLoadChain ? TTI.isLegalMaskedLoad(
978 DataType: FixedVectorType::get(ElementType: VecElemTy, NumElts: NewNumVecElems),
979 Alignment, AddressSpace: AS, MaskKind: TTI::MaskKind::ConstantMask)
980 : TTI.isLegalMaskedStore(
981 DataType: FixedVectorType::get(ElementType: VecElemTy, NumElts: NewNumVecElems),
982 Alignment, AddressSpace: AS, MaskKind: TTI::MaskKind::ConstantMask);
983 // Only artificially increase the chain if it would be AllowedAndFast
984 // and if the resulting masked load/store will be legal for the
985 // target.
986 if (IsLegalToExtend &&
987 accessIsAllowedAndFast(SizeBytes: NewSizeBytes, AS, Alignment,
988 VecElemBits)) {
989 LLVM_DEBUG(dbgs()
990 << "LSV: extending " << (IsLoadChain ? "load" : "store")
991 << " chain of " << NumVecElems << " "
992 << (IsLoadChain ? "loads" : "stores")
993 << " with total byte size of " << SizeBytes << " to "
994 << NewNumVecElems << " "
995 << (IsLoadChain ? "loads" : "stores")
996 << " with total byte size of " << NewSizeBytes
997 << ", TargetVF=" << TargetVF << " \n");
998
999 // Create (NewNumVecElems - NumVecElems) extra elements.
1000 // We are basing each extra element on CBegin, which means the
1001 // offsets should be based on SizeBytes, which represents the offset
1002 // from CBegin to the current end of the chain.
1003 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1004 for (unsigned I = 0; I < (NewNumVecElems - NumVecElems); I++) {
1005 ChainElem NewElem = createExtraElementAfter(
1006 PrevElem: C[CBegin], Ty: VecElemTy,
1007 Offset: APInt(ASPtrBits, SizeBytes + I * VecElemBytes), Prefix: "Extend");
1008 ExtendingLoadsStores.push_back(Elt: NewElem);
1009 }
1010
1011 // Update the size and number of elements for upcoming checks.
1012 SizeBytes = NewSizeBytes;
1013 NumVecElems = NewNumVecElems;
1014 AllowedAndFast = true;
1015 }
1016 }
1017 if (!AllowedAndFast) {
1018 // We were not able to achieve legality by extending the chain.
1019 LLVM_DEBUG(dbgs()
1020 << "LSV: splitChainByAlignment discarding candidate chain "
1021 "because its alignment is not AllowedAndFast: "
1022 << Alignment.value() << "\n");
1023 continue;
1024 }
1025 }
1026
1027 if ((IsLoadChain &&
1028 !TTI.isLegalToVectorizeLoadChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS)) ||
1029 (!IsLoadChain &&
1030 !TTI.isLegalToVectorizeStoreChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS))) {
1031 LLVM_DEBUG(
1032 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
1033 "because !isLegalToVectorizeLoad/StoreChain.");
1034 continue;
1035 }
1036
1037 if (CandidateChainsMayContainExtraLoadsStores) {
1038 // If the candidate chain contains extra loads/stores from an earlier
1039 // optimization, confirm legality now. This filter is essential because
1040 // when filling gaps in splitChainByContiguity, we queried the API to
1041 // check that (for a given element type and address space) there *may*
1042 // have been a legal masked load/store we could possibly create. Now, we
1043 // need to check if the actual chain we ended up with is legal to turn
1044 // into a masked load/store. This is relevant for NVPTX, for example,
1045 // where a masked store is only legal if we have ended up with a 256-bit
1046 // vector.
1047 bool CurrCandContainsExtraLoadsStores = llvm::any_of(
1048 Range: ArrayRef<ChainElem>(C).slice(N: CBegin, M: CEnd - CBegin + 1),
1049 P: [this](const ChainElem &E) {
1050 return ExtraElements.contains(V: E.Inst);
1051 });
1052
1053 if (CurrCandContainsExtraLoadsStores &&
1054 (IsLoadChain ? !TTI.isLegalMaskedLoad(
1055 DataType: FixedVectorType::get(ElementType: VecElemTy, NumElts: NumVecElems),
1056 Alignment, AddressSpace: AS, MaskKind: TTI::MaskKind::ConstantMask)
1057 : !TTI.isLegalMaskedStore(
1058 DataType: FixedVectorType::get(ElementType: VecElemTy, NumElts: NumVecElems),
1059 Alignment, AddressSpace: AS, MaskKind: TTI::MaskKind::ConstantMask))) {
1060 LLVM_DEBUG(dbgs()
1061 << "LSV: splitChainByAlignment discarding candidate chain "
1062 "because it contains extra loads/stores that we cannot "
1063 "legally vectorize into a masked load/store \n");
1064 continue;
1065 }
1066 }
1067
1068 // Hooray, we can vectorize this chain!
1069 Chain &NewChain = Ret.emplace_back();
1070 for (unsigned I = CBegin; I <= CEnd; ++I)
1071 NewChain.emplace_back(Args&: C[I]);
1072 for (ChainElem E : ExtendingLoadsStores)
1073 NewChain.emplace_back(Args&: E);
1074 CBegin = CEnd; // Skip over the instructions we've added to the chain.
1075 break;
1076 }
1077 }
1078 return Ret;
1079}
1080
1081bool Vectorizer::vectorizeChain(Chain &C) {
1082 if (C.size() < 2)
1083 return false;
1084
1085 bool ChainContainsExtraLoadsStores = llvm::any_of(
1086 Range&: C, P: [this](const ChainElem &E) { return ExtraElements.contains(V: E.Inst); });
1087
1088 // If we are left with a two-element chain, and one of the elements is an
1089 // extra element, we don't want to vectorize
1090 if (C.size() == 2 && ChainContainsExtraLoadsStores)
1091 return false;
1092
1093 sortChainInOffsetOrder(C);
1094
1095 LLVM_DEBUG({
1096 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
1097 dumpChain(C);
1098 });
1099
1100 Type *VecElemTy = getChainElemTy(C);
1101 bool IsLoadChain = isa<LoadInst>(Val: C[0].Inst);
1102 unsigned AS = getLoadStoreAddressSpace(I: C[0].Inst);
1103 unsigned BytesAdded = DL.getTypeStoreSize(Ty: getLoadStoreType(I: &*C[0].Inst));
1104 APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
1105 unsigned ChainBytes = BytesAdded;
1106 for (auto It = std::next(x: C.begin()), End = C.end(); It != End; ++It) {
1107 unsigned SzBytes = DL.getTypeStoreSize(Ty: getLoadStoreType(I: &*It->Inst));
1108 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
1109 // Update ChainBytes considering possible overlap.
1110 BytesAdded =
1111 PrevReadEnd.sle(RHS: ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
1112 ChainBytes += BytesAdded;
1113 PrevReadEnd = APIntOps::smax(A: PrevReadEnd, B: ReadEnd);
1114 }
1115
1116 assert(8 * ChainBytes % DL.getTypeSizeInBits(VecElemTy) == 0);
1117 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
1118 // than 1 byte (e.g. VecTy == <32 x i1>).
1119 unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(Ty: VecElemTy);
1120 Type *VecTy = FixedVectorType::get(ElementType: VecElemTy, NumElts: NumElem);
1121
1122 Align Alignment = getLoadStoreAlignment(I: C[0].Inst);
1123 // If this is a load/store of an alloca, we might have upgraded the alloca's
1124 // alignment earlier. Get the new alignment.
1125 if (AS == DL.getAllocaAddrSpace()) {
1126 Alignment = std::max(
1127 a: Alignment,
1128 b: getOrEnforceKnownAlignment(V: getLoadStorePointerOperand(V: C[0].Inst),
1129 PrefAlign: MaybeAlign(), DL, CxtI: C[0].Inst, AC: nullptr, DT: &DT));
1130 }
1131
1132 // All elements of the chain must have the same scalar-type size.
1133#ifndef NDEBUG
1134 for (const ChainElem &E : C)
1135 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
1136 DL.getTypeStoreSize(VecElemTy));
1137#endif
1138
1139 Instruction *VecInst;
1140 if (IsLoadChain) {
1141 // Loads get hoisted to the location of the first load in the chain. We may
1142 // also need to hoist the (transitive) operands of the loads.
1143 Builder.SetInsertPoint(
1144 llvm::min_element(Range&: C, C: [](const auto &A, const auto &B) {
1145 return A.Inst->comesBefore(B.Inst);
1146 })->Inst);
1147
1148 // If the chain contains extra loads, we need to vectorize into a
1149 // masked load.
1150 if (ChainContainsExtraLoadsStores) {
1151 assert(TTI.isLegalMaskedLoad(VecTy, Alignment, AS,
1152 TTI::MaskKind::ConstantMask));
1153 Value *Mask = createMaskForExtraElements(C, VecTy: cast<FixedVectorType>(Val: VecTy));
1154 VecInst = Builder.CreateMaskedLoad(
1155 Ty: VecTy, Ptr: getLoadStorePointerOperand(V: C[0].Inst), Alignment, Mask);
1156 } else {
1157 // This can happen due to a chain of redundant loads.
1158 // In this case, just use the element-type, and avoid ExtractElement.
1159 if (NumElem == 1)
1160 VecTy = VecElemTy;
1161 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1162 // i.e. the root of the vector.
1163 VecInst = Builder.CreateAlignedLoad(
1164 Ty: VecTy, Ptr: getLoadStorePointerOperand(V: C[0].Inst), Align: Alignment);
1165 }
1166
1167 for (const ChainElem &E : C) {
1168 Instruction *I = E.Inst;
1169 Value *V;
1170 Type *T = getLoadStoreType(I);
1171 unsigned EOffset =
1172 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1173 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(Ty: VecElemTy);
1174 if (!VecTy->isVectorTy()) {
1175 V = VecInst;
1176 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: T)) {
1177 auto Mask = llvm::to_vector<8>(
1178 Range: llvm::seq<int>(Begin: VecIdx, End: VecIdx + VT->getNumElements()));
1179 V = Builder.CreateShuffleVector(V: VecInst, Mask, Name: I->getName());
1180 } else {
1181 V = Builder.CreateExtractElement(Vec: VecInst, Idx: Builder.getInt32(C: VecIdx),
1182 Name: I->getName());
1183 }
1184 if (V->getType() != I->getType())
1185 V = Builder.CreateBitOrPointerCast(V, DestTy: I->getType());
1186 I->replaceAllUsesWith(V);
1187 }
1188
1189 // Finally, we need to reorder the instrs in the BB so that the (transitive)
1190 // operands of VecInst appear before it. To see why, suppose we have
1191 // vectorized the following code:
1192 //
1193 // ptr1 = gep a, 1
1194 // load1 = load i32 ptr1
1195 // ptr0 = gep a, 0
1196 // load0 = load i32 ptr0
1197 //
1198 // We will put the vectorized load at the location of the earliest load in
1199 // the BB, i.e. load1. We get:
1200 //
1201 // ptr1 = gep a, 1
1202 // loadv = load <2 x i32> ptr0
1203 // load0 = extractelement loadv, 0
1204 // load1 = extractelement loadv, 1
1205 // ptr0 = gep a, 0
1206 //
1207 // Notice that loadv uses ptr0, which is defined *after* it!
1208 reorder(I: VecInst);
1209 } else {
1210 // Stores get sunk to the location of the last store in the chain.
1211 Builder.SetInsertPoint(llvm::max_element(Range&: C, C: [](auto &A, auto &B) {
1212 return A.Inst->comesBefore(B.Inst);
1213 })->Inst);
1214
1215 // Build the vector to store.
1216 Value *Vec = PoisonValue::get(T: VecTy);
1217 auto InsertElem = [&](Value *V, unsigned VecIdx) {
1218 if (V->getType() != VecElemTy)
1219 V = Builder.CreateBitOrPointerCast(V, DestTy: VecElemTy);
1220 Vec = Builder.CreateInsertElement(Vec, NewElt: V, Idx: Builder.getInt32(C: VecIdx));
1221 };
1222 for (const ChainElem &E : C) {
1223 auto *I = cast<StoreInst>(Val: E.Inst);
1224 unsigned EOffset =
1225 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1226 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(Ty: VecElemTy);
1227 if (FixedVectorType *VT =
1228 dyn_cast<FixedVectorType>(Val: getLoadStoreType(I))) {
1229 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
1230 InsertElem(Builder.CreateExtractElement(Vec: I->getValueOperand(),
1231 Idx: Builder.getInt32(C: J)),
1232 VecIdx++);
1233 }
1234 } else {
1235 InsertElem(I->getValueOperand(), VecIdx);
1236 }
1237 }
1238
1239 // If the chain originates from extra stores, we need to vectorize into a
1240 // masked store.
1241 if (ChainContainsExtraLoadsStores) {
1242 assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS,
1243 TTI::MaskKind::ConstantMask));
1244 Value *Mask =
1245 createMaskForExtraElements(C, VecTy: cast<FixedVectorType>(Val: Vec->getType()));
1246 VecInst = Builder.CreateMaskedStore(
1247 Val: Vec, Ptr: getLoadStorePointerOperand(V: C[0].Inst), Alignment, Mask);
1248 } else {
1249 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1250 // i.e. the root of the vector.
1251 VecInst = Builder.CreateAlignedStore(
1252 Val: Vec, Ptr: getLoadStorePointerOperand(V: C[0].Inst), Align: Alignment);
1253 }
1254 }
1255
1256 propagateMetadata(I: VecInst, C);
1257
1258 for (const ChainElem &E : C)
1259 ToErase.emplace_back(Args: E.Inst);
1260
1261 ++NumVectorInstructions;
1262 NumScalarsVectorized += C.size();
1263 return true;
1264}
1265
1266template <bool IsLoadChain>
1267bool Vectorizer::isSafeToMove(
1268 Instruction *ChainElem, Instruction *ChainBegin,
1269 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1270 BatchAAResults &BatchAA) {
1271 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
1272 << *ChainBegin << ")\n");
1273
1274 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1275 if (ChainElem == ChainBegin)
1276 return true;
1277
1278 // Invariant loads can always be reordered; by definition they are not
1279 // clobbered by stores.
1280 if (isInvariantLoad(I: ChainElem))
1281 return true;
1282
1283 auto BBIt = std::next([&] {
1284 if constexpr (IsLoadChain)
1285 return BasicBlock::reverse_iterator(ChainElem);
1286 else
1287 return BasicBlock::iterator(ChainElem);
1288 }());
1289 auto BBItEnd = std::next([&] {
1290 if constexpr (IsLoadChain)
1291 return BasicBlock::reverse_iterator(ChainBegin);
1292 else
1293 return BasicBlock::iterator(ChainBegin);
1294 }());
1295
1296 const APInt &ChainElemOffset = ChainOffsets.at(Val: ChainElem);
1297 const unsigned ChainElemSize =
1298 DL.getTypeStoreSize(Ty: getLoadStoreType(I: ChainElem));
1299
1300 for (; BBIt != BBItEnd; ++BBIt) {
1301 Instruction *I = &*BBIt;
1302
1303 if (!I->mayReadOrWriteMemory())
1304 continue;
1305
1306 // Loads can be reordered with other loads.
1307 if (IsLoadChain && isa<LoadInst>(Val: I))
1308 continue;
1309
1310 // Stores can be sunk below invariant loads.
1311 if (!IsLoadChain && isInvariantLoad(I))
1312 continue;
1313
1314 // If I is in the chain, we can tell whether it aliases ChainIt by checking
1315 // what offset ChainIt accesses. This may be better than AA is able to do.
1316 //
1317 // We should really only have duplicate offsets for stores (the duplicate
1318 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1319 // split the chain so we don't have to handle this case specially.
1320 if (auto OffsetIt = ChainOffsets.find(Val: I); OffsetIt != ChainOffsets.end()) {
1321 // I and ChainElem overlap if:
1322 // - I and ChainElem have the same offset, OR
1323 // - I's offset is less than ChainElem's, but I touches past the
1324 // beginning of ChainElem, OR
1325 // - ChainElem's offset is less than I's, but ChainElem touches past the
1326 // beginning of I.
1327 const APInt &IOffset = OffsetIt->second;
1328 unsigned IElemSize = DL.getTypeStoreSize(Ty: getLoadStoreType(I));
1329 if (IOffset == ChainElemOffset ||
1330 (IOffset.sle(RHS: ChainElemOffset) &&
1331 (IOffset + IElemSize).sgt(RHS: ChainElemOffset)) ||
1332 (ChainElemOffset.sle(RHS: IOffset) &&
1333 (ChainElemOffset + ChainElemSize).sgt(RHS: OffsetIt->second))) {
1334 LLVM_DEBUG({
1335 // Double check that AA also sees this alias. If not, we probably
1336 // have a bug.
1337 ModRefInfo MR =
1338 BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1339 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1340 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1341 });
1342 return false; // We found an aliasing instruction; bail.
1343 }
1344
1345 continue; // We're confident there's no alias.
1346 }
1347
1348 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1349 ModRefInfo MR = BatchAA.getModRefInfo(I, OptLoc: MemoryLocation::get(Inst: ChainElem));
1350 if (IsLoadChain ? isModSet(MRI: MR) : isModOrRefSet(MRI: MR)) {
1351 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1352 << " Aliasing instruction:\n"
1353 << " " << *I << '\n'
1354 << " Aliased instruction and pointer:\n"
1355 << " " << *ChainElem << '\n'
1356 << " " << *getLoadStorePointerOperand(ChainElem)
1357 << '\n');
1358
1359 return false;
1360 }
1361 }
1362 return true;
1363}

static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
                    << ", MatchingOpIdxB=" << MatchingOpIdxB
                    << ", Signed=" << Signed << "\n");
  // If both AddOpA and AddOpB are adds with NSW/NUW and share one operand, the
  // transformation is safe if we can prove that adding IdxDiff to the other
  // operand of AddOpA won't overflow.  For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
  // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
  // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
                    << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
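  //
  // For example (hypothetical IR), the kind of GEP pair handled here is:
  //   %extA = sext i32 %idxA to i64
  //   %gepA = getelementptr i32, ptr %base, i64 %extA
  //   %extB = sext i32 %idxB to i64
  //   %gepB = getelementptr i32, ptr %base, i64 %extB
  // i.e. identical except for the last index, whose operands are sign- or
  // zero-extended from a narrower type.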
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = GTIA.getSequentialElementStride(DL);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction.
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return std::nullopt;

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  if (IdxDiffSCEV == SE.getCouldNotCompute())
    return std::nullopt;

  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
  if (!IdxDiffRange.isSingleElement())
    return std::nullopt;
  APInt IdxDiff = *IdxDiffRange.getSingleElement();

  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
                    << "\n");

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is an operand
    // which is the same in those two instructions.  Below we account for
    // possible orders of the operands of these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  //
  // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
  // order bit other than the sign bit are known to be zero in ValA, we can add
  // Diff to it while guaranteeing no overflow of any sort.
  //
  // If IdxDiff is negative, do the same, but swap ValA and ValB.
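  //
  // For example (hypothetical values): with IdxDiff == 3 and ValA known to
  // have its two low bits clear (Known.Zero covers bits 0 and 1), adding 3
  // only fills those known-zero bits, so the add cannot overflow.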
  if (!Safe) {
    // When computing known bits, use the GEPs as context instructions, since
    // they likely are in the same BB as the load/store.
    KnownBits Known(BitWidth);
    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
                     &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}

std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;
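
  // A pair of selects with the same condition can still have a constant
  // offset if both arms do.  For example (hypothetical IR):
  //   %pA = select i1 %c, ptr %x, ptr %y
  //   %pB = select i1 %c, ptr %x.plus4, ptr %y.plus4
  // If %x.plus4 - %x == %y.plus4 - %y == 4, the overall offset is 4.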
  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", ContextInst="
                        << *ContextInst << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
      if (!TrueDiff)
        return std::nullopt;
      std::optional<APInt> FalseDiff =
          getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
                            ContextInst, Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}

void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
  if (EQClasses.size() < 2) // There is nothing to merge.
    return;

  // The reduced key has all elements of the EqClassKey except the underlying
  // object.  Check that EqClassKey has 4 elements and define the reduced key.
  static_assert(std::tuple_size_v<EqClassKey> == 4,
                "EqClassKey has changed - EqClassReducedKey needs changes too");
  using EqClassReducedKey =
      std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
                 std::tuple_element_t<2, EqClassKey> /* Element size */,
                 std::tuple_element_t<3, EqClassKey> /* IsLoad */>;
  using ECReducedKeyToUnderlyingObjectMap =
      MapVector<EqClassReducedKey,
                SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;

  // Form a map from the reduced key (without the underlying object) to the
  // underlying objects: 1 reduced key to many underlying objects, to form
  // groups of potentially merge-able equivalence classes.
  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
  bool FoundPotentiallyOptimizableEC = false;
  for (const auto &EC : EQClasses) {
    const auto &Key = EC.first;
    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
                             std::get<3>(Key)};
    auto &UOMap = RedKeyToUOMap[RedKey];
    UOMap.insert(std::get<0>(Key));
    if (UOMap.size() > 1)
      FoundPotentiallyOptimizableEC = true;
  }
  if (!FoundPotentiallyOptimizableEC)
    return;

  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << "  Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << "    Inst: " << *Inst << '\n';
    }
  });
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
    for (const auto &RedKeyToUO : RedKeyToUOMap) {
      dbgs() << "  Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
             << std::get<1>(RedKeyToUO.first) << ", "
             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
             << RedKeyToUO.second.size() << " underlying objects:\n";
      for (auto UObject : RedKeyToUO.second)
        dbgs() << "    " << *UObject << '\n';
    }
  });

  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;

  // Compute the ultimate targets for a set of underlying objects.
  auto GetUltimateTargets =
      [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
    UObjectToUObjectMap IndirectionMap;
    for (const auto *UObject : UObjects) {
      const unsigned MaxLookupDepth = 1; // Look for 1-level indirections only.
      const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
      if (UltimateTarget != UObject)
        IndirectionMap[UObject] = UltimateTarget;
    }
    UObjectToUObjectMap UltimateTargetsMap;
    for (const auto *UObject : UObjects) {
      auto Target = UObject;
      auto It = IndirectionMap.find(Target);
      for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
        Target = It->second;
      UltimateTargetsMap[UObject] = Target;
    }
    return UltimateTargetsMap;
  };

  // For each item in RedKeyToUOMap, if it has more than one underlying object,
  // try to merge the equivalence classes.
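  //
  // For example, if one class's key object is an intermediate pointer whose
  // own one-level underlying object is another class's key object, both
  // classes end up keyed by the same ultimate target below and get merged.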
  for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
    if (UObjects.size() < 2)
      continue;
    auto UTMap = GetUltimateTargets(UObjects);
    for (const auto &[UObject, UltimateTarget] : UTMap) {
      if (UObject == UltimateTarget)
        continue;

      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
                         std::get<2>(RedKey)};
      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
                       std::get<2>(RedKey)};
      // The entry for KeyFrom is guaranteed to exist, unlike KeyTo.  Thus,
      // request the reference to the instructions vector for KeyTo first.
      const auto &VecTo = EQClasses[KeyTo];
      const auto &VecFrom = EQClasses[KeyFrom];
      SmallVector<Instruction *, 8> MergedVec;
      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
                 std::back_inserter(MergedVec),
                 [](Instruction *A, Instruction *B) {
                   return A && B && A->comesBefore(B);
                 });
      EQClasses[KeyTo] = std::move(MergedVec);
      EQClasses.erase(KeyFrom);
    }
  }
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << "  Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << "    Inst: " << *Inst << '\n';
    }
  });
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for the true
      // and false values of the condition.  Using the selects themselves for
      // grouping would therefore put consecutive accesses into different
      // lists; they would never even be checked for being consecutive, and so
      // would not be vectorized.
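      //
      // For example (hypothetical IR):
      //   %p0 = select i1 %c, ptr %a, ptr %b
      //   %p1 = select i1 %c, ptr %a4, ptr %b4  ; both arms 4 bytes further on
      // Grouping by %c lets accesses through %p0 and %p1 land in the same
      // equivalence class.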
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes.  They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers.  The vectorizeLoadChain/vectorizeStoreChain
    // functions currently use an integer type for the vectorized load/store
    // and do not support casting between that integer type and a vector of
    // pointers (e.g. i64 to <2 x i16*>).
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;

    Ret[{GetUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .emplace_back(&I);
  }

  mergeEquivalenceClasses(Ret);
  return Ret;
}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

  // Compare each instruction in Instrs to the leaders of the N most
  // recently-used chains.  This limits the O(n^2) behavior of this pass while
  // still allowing us to build arbitrarily long chains.
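  //
  // With MaxChainsToTry leaders tried per instruction, the work here is
  // bounded by O(|Instrs| * MaxChainsToTry) offset computations rather than
  // O(|Instrs|^2).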
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      if (std::optional<APInt> Offset = getConstantOffset(
              getLoadStorePointerOperand(ChainIter->first),
              getLoadStorePointerOperand(I),
              /*ContextInst=*/
              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
        // `Offset` might not have the expected number of bits, if e.g. AS has
        // a different number of bits than opaque pointers.
        ChainIter->second.emplace_back(I, Offset.value());
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.emplace_back(I, ZeroOffset);
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.emplace_back(std::move(E.second));
  return Ret;
}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst=" << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
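  // For example (hypothetical IR): if PtrA is `gep i8, ptr %base, i64 4` and
  // PtrB is `gep i8, ptr %base, i64 12`, both strip to %base with OffsetA == 4
  // and OffsetB == 12, so the constant offset PtrB - PtrA is 8 bytes.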

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
  if (std::optional<APInt> Diff =
          getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}

bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
                                        Align Alignment,
                                        unsigned VecElemBits) const {
  // Aligned vector accesses are ALWAYS faster than element-wise accesses.
  if (Alignment.value() % SizeBytes == 0)
    return true;
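  // E.g. (hypothetical numbers): a 16-byte access with alignment 16 or 32 is
  // accepted here without consulting TTI; a 16-byte access with alignment 8
  // falls through to the misaligned-access query below.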

  // Ask TTI whether misaligned accesses are faster as vector or element-wise.
  unsigned VectorizedSpeed = 0;
  bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
      F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
  if (!AllowsMisaligned) {
    LLVM_DEBUG(
        dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS
               << " with alignment " << Alignment.value()
               << " is misaligned, and therefore can't be vectorized.\n");
    return false;
  }

  unsigned ElementwiseSpeed = 0;
  TTI.allowsMisalignedMemoryAccesses(F.getContext(), VecElemBits, AS,
                                     Alignment, &ElementwiseSpeed);
  if (VectorizedSpeed < ElementwiseSpeed) {
    LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace "
                      << AS << " with alignment " << Alignment.value()
                      << " has relative speed " << VectorizedSpeed
                      << ", which is lower than the elementwise speed of "
                      << ElementwiseSpeed
                      << ".  Therefore this access won't be vectorized.\n");
    return false;
  }
  return true;
}

ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev, Type *Ty,
                                              APInt Offset, StringRef Prefix,
                                              Align Alignment) {
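  // Create a "filler" access Offset bytes beyond Prev's pointer operand: a
  // real load for load chains, or a store of poison for store chains.  The new
  // element is recorded in ExtraElements so it can later be masked off (see
  // createMaskForExtraElements) or cleaned up (see deleteExtraElements).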
  Instruction *NewElement = nullptr;
  Builder.SetInsertPoint(Prev.Inst->getNextNode());
  if (LoadInst *PrevLoad = dyn_cast<LoadInst>(Prev.Inst)) {
    Value *NewGep = Builder.CreatePtrAdd(
        PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
    LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
    NewElement = Builder.CreateAlignedLoad(Ty, NewGep, Alignment, Prefix);
  } else {
    StoreInst *PrevStore = cast<StoreInst>(Prev.Inst);

    Value *NewGep = Builder.CreatePtrAdd(
        PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
    LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
    NewElement =
        Builder.CreateAlignedStore(PoisonValue::get(Ty), NewGep, Alignment);
  }

  // Attach all metadata to the new element.
  // propagateMetadata will fold it into the final vector when applicable.
  NewElement->copyMetadata(*Prev.Inst);

  // Cache created elements for tracking and cleanup.
  ExtraElements.insert(NewElement);

  APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset;
  LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n"
                    << *NewElement
                    << " OffsetFromLeader: " << NewOffsetFromLeader << "\n");
  return ChainElem{NewElement, NewOffsetFromLeader};
}

Value *Vectorizer::createMaskForExtraElements(const ArrayRef<ChainElem> C,
                                              FixedVectorType *VecTy) {
  // Start each mask element as false.
  SmallVector<Constant *, 64> MaskElts(VecTy->getNumElements(),
                                       Builder.getInt1(false));
  // Iterate over the chain and set the corresponding mask element to true for
  // each element that is not an extra element.
  for (const ChainElem &E : C) {
    if (ExtraElements.contains(E.Inst))
      continue;
    unsigned EOffset =
        (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
    unsigned VecIdx =
        8 * EOffset / DL.getTypeSizeInBits(VecTy->getScalarType());
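    // E.g. (hypothetical values): with 32-bit vector elements and an element
    // whose EOffset is 8 bytes, VecIdx == 8 * 8 / 32 == 2.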
    if (FixedVectorType *VT =
            dyn_cast<FixedVectorType>(getLoadStoreType(E.Inst)))
      for (unsigned J = 0; J < VT->getNumElements(); ++J)
        MaskElts[VecIdx + J] = Builder.getInt1(true);
    else
      MaskElts[VecIdx] = Builder.getInt1(true);
  }
  return ConstantVector::get(MaskElts);
}

void Vectorizer::deleteExtraElements() {
  for (auto *ExtraElement : ExtraElements) {
    if (isa<LoadInst>(ExtraElement)) {
      [[maybe_unused]] bool Deleted =
          RecursivelyDeleteTriviallyDeadInstructions(ExtraElement);
      assert(Deleted && "Extra Load should always be trivially dead");
    } else {
      // Unlike extra loads, extra stores won't be "dead", but they should all
      // be deleted regardless: they have either been combined into a masked
      // store, or were left behind and need to be cleaned up.
      auto *PtrOperand = getLoadStorePointerOperand(ExtraElement);
      ExtraElement->eraseFromParent();
      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
    }
  }

  ExtraElements.clear();
}