//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
// - Break up each basic block into pseudo-BBs, composed of instructions which
//   are guaranteed to transfer control to their successors.
// - Within a single pseudo-BB, find all loads, and group them into
//   "equivalence classes" according to getUnderlyingObject() and loaded
//   element size. Do the same for stores.
// - For each equivalence class, greedily build "chains". Each chain has a
//   leader instruction, and every other member of the chain has a known
//   constant offset from the first instr in the chain.
// - Break up chains so that they contain only contiguous accesses of legal
//   size with no intervening may-alias instrs.
// - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.

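//
// As an illustration (a hypothetical example, not taken from a test), the pass
// rewrites IR roughly like this:
//
//   %p0 = getelementptr i32, ptr %p, i64 0
//   %p1 = getelementptr i32, ptr %p, i64 1
//   %x0 = load i32, ptr %p0, align 8
//   %x1 = load i32, ptr %p1, align 4
//
// into one wide access plus extracts:
//
//   %vec = load <2 x i32>, ptr %p0, align 8
//   %x0 = extractelement <2 x i32> %vec, i32 0
//   %x1 = extractelement <2 x i32> %vec, i32 1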
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// Equivalence class key, the initial tuple by which we group loads/stores.
// Loads/stores with different EqClassKeys are never merged.
//
// (We could in theory remove element-size from this tuple. We'd just need
// to fix up the vector packing/unpacking code.)
using EqClassKey =
    std::tuple<const Value * /* result of getUnderlyingObject() */,
               unsigned /* AddrSpace */,
               unsigned /* Load/Store element size bits */,
               char /* IsLoad; char b/c bool can't be a DenseMap key */
               >;
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                               const EqClassKey &K) {
  const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
  return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
            << " of element size " << ElementSize << " bits in addrspace "
            << AddrSpace;
}

// A Chain is a set of instructions such that:
// - All instructions have the same equivalence class, so in particular all are
//   loads, or all are stores.
// - We know the address accessed by the i'th chain elem relative to the
//   chain's leader instruction, which is the first instr of the chain in BB
//   order.
//
// Chains have two canonical orderings:
// - BB order, sorted by Instr->comesBefore.
// - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
struct ChainElem {
  Instruction *Inst;
  APInt OffsetFromLeader;
  ChainElem(Instruction *Inst, APInt OffsetFromLeader)
      : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
};
using Chain = SmallVector<ChainElem, 1>;

void sortChainInBBOrder(Chain &C) {
  sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
}

void sortChainInOffsetOrder(Chain &C) {
  sort(C, [](const auto &A, const auto &B) {
    if (A.OffsetFromLeader != B.OffsetFromLeader)
      return A.OffsetFromLeader.slt(B.OffsetFromLeader);
    return A.Inst->comesBefore(B.Inst); // stable tiebreaker
  });
}

[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
  for (const auto &E : C) {
    dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
  }
}

using EquivalenceClassMap =
    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough
constexpr unsigned StackAdjustedAlignment = 4;

Instruction *propagateMetadata(Instruction *I, const Chain &C) {
  SmallVector<Value *, 8> Values;
  for (const ChainElem &E : C)
    Values.emplace_back(E.Inst);
  return propagateMetadata(I, Values);
}

bool isInvariantLoad(const Instruction *I) {
  const LoadInst *LI = dyn_cast<LoadInst>(I);
  return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
}

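// For example (illustrative), an i32 load and a <2 x i32> load from the same
// underlying object and address space share a key: both have 32-bit elements
// and IsLoad set.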
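// For example (illustrative), three i32 loads from addresses p, p+8, and p+4
// could form a single chain; if the load from p appears first in BB order it
// is the leader, and the OffsetFromLeader values are 0, 8, and 4.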
/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.emplace_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int Idx = 0; Idx < NumOperands; Idx++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      assert(IM != I && "Unexpected cycle while re-ordering instructions");

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.emplace_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
    Instruction *IM = &*(BBI++);
    if (!InstructionsToMove.contains(IM))
      continue;
    IM->moveBefore(I->getIterator());
  }
}

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

  // We could erase instrs right after vectorizing them, but that can mess up
  // our BB iterators, and also can make the equivalence class keys point to
  // freed memory. This is fixable, but it's simpler just to wait until we're
  // done with the BB and erase all at once.
  SmallVector<Instruction *, 128> ToErase;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  static const unsigned MaxDepth = 3;

  /// Runs the vectorizer on a "pseudo basic block", which is a range of
  /// instructions [Begin, End) within one BB all of which have
  /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
  bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);

  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
  /// in the same BB with the same value for getUnderlyingObject() etc.
  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
                             ArrayRef<Instruction *> EqClass);

  /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
  /// where all instructions access a known, constant offset from the first
  /// instruction.
  bool runOnChain(Chain &C);

  /// Splits the chain into subchains of instructions which read/write a
  /// contiguous block of memory. Discards any length-1 subchains (because
  /// there's nothing to vectorize in there).
  std::vector<Chain> splitChainByContiguity(Chain &C);

  /// Splits the chain into subchains where it's safe to hoist loads up to the
  /// beginning of the sub-chain and it's safe to sink stores down to the end
  /// of the sub-chain. Discards any length-1 subchains.
  std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

  /// Splits the chain into subchains that make legal, aligned accesses.
  /// Discards any length-1 subchains.
  std::vector<Chain> splitChainByAlignment(Chain &C);

  /// Converts the instrs in the chain into a single vectorized load or store.
  /// Adds the old scalar loads/stores to ToErase.
  bool vectorizeChain(Chain &C);

  /// Tries to compute the offset in bytes PtrB - PtrA.
  std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
                                         Instruction *ContextInst,
                                         unsigned Depth = 0);
  std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
                                                     Instruction *ContextInst,
                                                     unsigned Depth);
  std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
                                                Instruction *ContextInst,
                                                unsigned Depth);

  /// Gets the element type of the vector that the chain will load or store.
  /// This is nontrivial because the chain may contain elements of different
  /// types; e.g. it's legal to have a chain that contains both i32 and float.
  Type *getChainElemTy(const Chain &C);

  /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
  /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
  /// instructions.
  ///
  /// The map ChainElemOffsets must contain all of the elements in
  /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
  /// address. It's ok if it contains additional entries.
  template <bool IsLoadChain>
  bool isSafeToMove(
      Instruction *ChainElem, Instruction *ChainBegin,
      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

  /// Merges the equivalence classes if they have underlying objects that differ
  /// by one level of indirection (i.e., one is a getelementptr and the other is
  /// the base pointer in that getelementptr).
  void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;

  /// Collects loads and stores grouped by "equivalence class", where:
  /// - all elements in an eq class are a load or all are a store,
  /// - they all load/store the same element size (it's OK to have e.g. i8 and
  ///   <4 x i8> in the same class, but not i32 and <4 x i8>), and
  /// - they all have the same value for getUnderlyingObject().
  EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                BasicBlock::iterator End);

  /// Partitions Instrs into "chains" where every instruction has a known
  /// constant offset from the first instr in the chain.
  ///
  /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0, because the
  /// first instr in the chain is the leader, and an instr touches distance 0
  /// from itself.
  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

bool Vectorizer::run() {
  bool Changed = false;
  // Break up the BB if there are any instrs which aren't guaranteed to transfer
  // execution to their successor.
  //
  // Consider, for example:
  //
  //   def assert_arr_len(int n) { if (n < 2) exit(); }
  //
  //   load arr[0]
  //   call assert_array_len(arr.length)
  //   load arr[1]
  //
  // Even though assert_arr_len does not read or write any memory, we can't
  // speculate the second load before the call. More info at
  // https://github.com/llvm/llvm-project/issues/52950.
  for (BasicBlock *BB : post_order(&F)) {
    // BB must at least have a terminator.
    assert(!BB->empty());

    SmallVector<BasicBlock::iterator, 8> Barriers;
    Barriers.emplace_back(BB->begin());
    for (Instruction &I : *BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
        Barriers.emplace_back(I.getIterator());
    Barriers.emplace_back(BB->end());

    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
         ++It)
      Changed |= runOnPseudoBB(*It, *std::next(It));

    for (Instruction *I : ToErase) {
      auto *PtrOperand = getLoadStorePointerOperand(I);
      if (I->use_empty())
        I->eraseFromParent();
      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
    }
    ToErase.clear();
  }

  return Changed;
}

bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                               BasicBlock::iterator End) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
    if (End != Begin->getParent()->end())
      dbgs() << *End;
    else
      dbgs() << "<BB end>";
    dbgs() << ")\n";
  });

  bool Changed = false;
  for (const auto &[EqClassKey, EqClass] :
       collectEquivalenceClasses(Begin, End))
    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

  return Changed;
}

bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
                                       ArrayRef<Instruction *> EqClass) {
  bool Changed = false;

  LLVM_DEBUG({
    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
           << " keyed on " << EqClassKey << ":\n";
    for (Instruction *I : EqClass)
      dbgs() << " " << *I << "\n";
  });

  std::vector<Chain> Chains = gatherChains(EqClass);
  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
                    << " nontrivial chains.\n";);
  for (Chain &C : Chains)
    Changed |= runOnChain(C);
  return Changed;
}

bool Vectorizer::runOnChain(Chain &C) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  // Split up the chain into increasingly smaller chains, until we can finally
  // vectorize the chains.
  //
  // (Don't be scared by the depth of the loop nest here. These operations are
  // all at worst O(n lg n) in the number of instructions, and splitting chains
  // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
  bool Changed = false;
  for (auto &C : splitChainByMayAliasInstrs(C))
    for (auto &C : splitChainByContiguity(C))
      for (auto &C : splitChainByAlignment(C))
        Changed |= vectorizeChain(C);
  return Changed;
}

std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
  if (C.empty())
    return {};

  sortChainInBBOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
    dumpChain(C);
  });

  // We know that elements in the chain with nonoverlapping offsets can't
  // alias, but AA may not be smart enough to figure this out. Use a
  // hashtable.
  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
  for (const auto &E : C)
    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

  // Loads get hoisted up to the first load in the chain. Stores get sunk
  // down to the last store in the chain. Our algorithm for loads is:
  //
  // - Take the first element of the chain. This is the start of a new chain.
  // - Take the next element of `Chain` and check for may-alias instructions
  //   up to the start of NewChain. If no may-alias instrs, add it to
  //   NewChain. Otherwise, start a new NewChain.
  //
  // For stores it's the same except in the reverse direction.
  //
  // We expect IsLoad to be an std::bool_constant.
  auto Impl = [&](auto IsLoad) {
    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
      if constexpr (IsLoad())
        return std::make_pair(C.begin(), C.end());
      else
        return std::make_pair(C.rbegin(), C.rend());
    }(IsLoad);
    assert(ChainBegin != ChainEnd);

    std::vector<Chain> Chains;
    SmallVector<ChainElem, 1> NewChain;
    NewChain.emplace_back(*ChainBegin);
    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
                               ChainOffsets)) {
        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
                          << "\n");
        NewChain.emplace_back(*ChainIt);
      } else {
        LLVM_DEBUG(
            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
        if (NewChain.size() > 1) {
          LLVM_DEBUG({
            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
            dumpChain(NewChain);
          });
          Chains.emplace_back(std::move(NewChain));
        }

        // Start a new chain.
        NewChain = SmallVector<ChainElem, 1>({*ChainIt});
      }
    }
    if (NewChain.size() > 1) {
      LLVM_DEBUG({
        dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
        dumpChain(NewChain);
      });
      Chains.emplace_back(std::move(NewChain));
    }
    return Chains;
  };

  if (isa<LoadInst>(C[0].Inst))
    return Impl(/*IsLoad=*/std::bool_constant<true>());

  assert(isa<StoreInst>(C[0].Inst));
  return Impl(/*IsLoad=*/std::bool_constant<false>());
}

std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByContiguity considering chain:\n";
    dumpChain(C);
  });

  std::vector<Chain> Ret;
  Ret.push_back({C.front()});

  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
    // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
    auto &CurChain = Ret.back();
    const ChainElem &Prev = CurChain.back();
    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
                              "collectEquivalenceClass");
    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

    // Add this instruction to the end of the current chain, or start a new one.
    bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
                      << (AreContiguous ? "" : "not ") << "contiguous: "
                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
                      << ") -> " << *It->Inst << " (starts at offset "
                      << It->OffsetFromLeader << ")\n");
    if (AreContiguous)
      CurChain.push_back(*It);
    else
      Ret.push_back({*It});
  }

  // Filter out length-1 chains, these are uninteresting.
  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
  return Ret;
}

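  // Illustrative example (hypothetical offsets): with 4-byte elements at
  // offsets {0, 4, 8, 20, 24}, we produce two subchains, {0, 4, 8} and
  // {20, 24}, because of the gap between 8 + 4 and 20.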
Type *Vectorizer::getChainElemTy(const Chain &C) {
  assert(!C.empty());
  // The rules are:
  // - If there are any pointer types in the chain, use an integer type.
  // - Prefer an integer type if it appears in the chain.
  // - Otherwise, use the first type in the chain.
  //
  // The rule about pointer types is a simplification when we merge e.g. a load
  // of a ptr and a double. There's no direct conversion from a ptr to a
  // double; it requires a ptrtoint followed by a bitcast.
  //
  // It's unclear to me if the other rules have any practical effect, but we do
  // it to match this pass's previous behavior.
  if (any_of(C, [](const ChainElem &E) {
        return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
      })) {
    return Type::getIntNTy(
        F.getContext(),
        DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
  }

  for (const ChainElem &E : C)
    if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
      return T;
  return getLoadStoreType(C[0].Inst)->getScalarType();
}

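  //
  // For example (illustrative), a chain of {load double, load ptr} yields an
  // integer type of the first element's width (i64, assuming 64-bit pointers
  // and doubles), while {load float, load i32} yields i32 because an integer
  // type is present in the chain.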
std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
  // We use a simple greedy algorithm.
  //  - Given a chain of length N, find all prefixes that
  //    (a) are not longer than the max register length, and
  //    (b) are a power of 2.
  //  - Starting from the longest prefix, try to create a vector of that length.
  //  - If one of them works, great. Repeat the algorithm on any remaining
  //    elements in the chain.
  //  - If none of them work, discard the first element and repeat on a chain
  //    of length N-1.
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByAlignment considering chain:\n";
    dumpChain(C);
  });

  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
                             unsigned ChainSizeBytes, VectorType *VecTy) {
    return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
                                                 ChainSizeBytes, VecTy)
                       : TTI.getStoreVectorFactor(VF, LoadStoreSize,
                                                  ChainSizeBytes, VecTy);
  };

#ifndef NDEBUG
  for (const auto &E : C) {
    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
    assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
           "Should have filtered out non-power-of-two elements in "
           "collectEquivalenceClasses.");
  }
#endif

  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

  std::vector<Chain> Ret;
  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
    // Find candidate chains of size not greater than the largest vector reg.
    // These chains are over the closed interval [CBegin, CEnd].
    SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
        CandidateChains;
    for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
      APInt Sz = C[CEnd].OffsetFromLeader +
                 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
                 C[CBegin].OffsetFromLeader;
      if (Sz.sgt(VecRegBytes))
        break;
      CandidateChains.emplace_back(CEnd,
                                   static_cast<unsigned>(Sz.getLimitedValue()));
    }

    // Consider the longest chain first.
    for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
         It != End; ++It) {
      auto [CEnd, SizeBytes] = *It;
      LLVM_DEBUG(
          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
                 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");

      Type *VecElemTy = getChainElemTy(C);
      // Note, VecElemTy is a power of 2, but might be less than one byte. For
      // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
      // VecElemTy would be i4.
      unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);

      // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
      assert((8 * SizeBytes) % VecElemBits == 0);
      unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
      FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
      unsigned VF = 8 * VecRegBytes / VecElemBits;

      // Check that TTI is happy with this vectorization factor.
      unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
                                          VecElemBits * NumVecElems / 8, VecTy);
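  //
  // Illustrative example: with 16-byte vector registers and a chain of eight
  // contiguous i32 accesses, the longest candidate starting at element 0 is
  // the 4-element (16-byte) prefix; if it vectorizes, we continue with the
  // remaining four elements.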
      if (TargetVF != VF && TargetVF < NumVecElems) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because TargetVF="
                   << TargetVF << " != VF=" << VF
                   << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
        continue;
      }

      // Is a load/store with this alignment allowed by TTI and at least as fast
      // as an unvectorized load/store?
      //
      // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
      auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
                               &F = F](Align Alignment) {
        if (Alignment.value() % SizeBytes == 0)
          return true;
        unsigned VectorizedSpeed = 0;
        bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
            F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
        if (!AllowsMisaligned) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " is misaligned, and therefore can't be vectorized.\n");
          return false;
        }

        unsigned ElementwiseSpeed = 0;
        (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
                                             Alignment, &ElementwiseSpeed);
        if (VectorizedSpeed < ElementwiseSpeed) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " has relative speed " << VectorizedSpeed
                     << ", which is lower than the elementwise speed of "
                     << ElementwiseSpeed
                     << ". Therefore this access won't be vectorized.\n");
          return false;
        }
        return true;
      };

      // If we're loading/storing from an alloca, align it if possible.
      //
      // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
      // tells us this is beneficial. This feels a bit odd, but it matches
      // existing tests. This isn't *so* bad, because at most we align to 4
      // bytes (current value of StackAdjustedAlignment).
      //
      // FIXME: We will upgrade the alignment of the alloca even if it turns out
      // we can't vectorize for some other reason.
      Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
      bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
                            isa<AllocaInst>(PtrOperand->stripPointerCasts());
      Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
      Align PrefAlign = Align(StackAdjustedAlignment);
      if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
          IsAllowedAndFast(PrefAlign)) {
        Align NewAlign = getOrEnforceKnownAlignment(
            PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
        if (NewAlign >= Alignment) {
          LLVM_DEBUG(dbgs()
                     << "LSV: splitByChain upgrading alloca alignment from "
                     << Alignment.value() << " to " << NewAlign.value()
                     << "\n");
          Alignment = NewAlign;
        }
      }

      if (!IsAllowedAndFast(Alignment)) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because its alignment is not AllowedAndFast: "
                   << Alignment.value() << "\n");
        continue;
      }

      if ((IsLoadChain &&
           !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
          (!IsLoadChain &&
           !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because !isLegalToVectorizeLoad/StoreChain.");
        continue;
      }

      // Hooray, we can vectorize this chain!
      Chain &NewChain = Ret.emplace_back();
      for (unsigned I = CBegin; I <= CEnd; ++I)
        NewChain.emplace_back(C[I]);
      CBegin = CEnd; // Skip over the instructions we've added to the chain.
      break;
    }
  }
  return Ret;
}

bool Vectorizer::vectorizeChain(Chain &C) {
  if (C.size() < 2)
    return false;

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  Type *VecElemTy = getChainElemTy(C);
  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned ChainBytes = std::accumulate(
      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
      });
  assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
  // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
  // than 1 byte (e.g. VecTy == <32 x i1>).
  Type *VecTy = FixedVectorType::get(
      VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

  Align Alignment = getLoadStoreAlignment(C[0].Inst);
  // If this is a load/store of an alloca, we might have upgraded the alloca's
  // alignment earlier. Get the new alignment.
  if (AS == DL.getAllocaAddrSpace()) {
    Alignment = std::max(
        Alignment,
        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
  }

  // All elements of the chain must have the same scalar-type size.
#ifndef NDEBUG
  for (const ChainElem &E : C)
    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
           DL.getTypeStoreSize(VecElemTy));
#endif

  Instruction *VecInst;
  if (IsLoadChain) {
    // Loads get hoisted to the location of the first load in the chain. We may
    // also need to hoist the (transitive) operands of the loads.
    Builder.SetInsertPoint(
        llvm::min_element(C, [](const auto &A, const auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedLoad(VecTy,
                                        getLoadStorePointerOperand(C[0].Inst),
                                        Alignment);

    unsigned VecIdx = 0;
    for (const ChainElem &E : C) {
      Instruction *I = E.Inst;
      Value *V;
      Type *T = getLoadStoreType(I);
      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
        auto Mask = llvm::to_vector<8>(
            llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
        VecIdx += VT->getNumElements();
      } else {
        V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                         I->getName());
        ++VecIdx;
      }
      if (V->getType() != I->getType())
        V = Builder.CreateBitOrPointerCast(V, I->getType());
      I->replaceAllUsesWith(V);
    }

    // Finally, we need to reorder the instrs in the BB so that the (transitive)
    // operands of VecInst appear before it. To see why, suppose we have
    // vectorized the following code:
    //
    //   ptr1 = gep a, 1
    //   load1 = load i32 ptr1
    //   ptr0 = gep a, 0
    //   load0 = load i32 ptr0
    //
    // We will put the vectorized load at the location of the earliest load in
    // the BB, i.e. load1. We get:
    //
    //   ptr1 = gep a, 1
    //   loadv = load <2 x i32> ptr0
    //   load0 = extractelement loadv, 0
    //   load1 = extractelement loadv, 1
    //   ptr0 = gep a, 0
    //
    // Notice that loadv uses ptr0, which is defined *after* it!
    reorder(VecInst);
  } else {
    // Stores get sunk to the location of the last store in the chain.
    Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
                             return A.Inst->comesBefore(B.Inst);
                           })->Inst);

    // Build the vector to store.
    Value *Vec = PoisonValue::get(VecTy);
    unsigned VecIdx = 0;
    auto InsertElem = [&](Value *V) {
      if (V->getType() != VecElemTy)
        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
    };
    for (const ChainElem &E : C) {
      auto *I = cast<StoreInst>(E.Inst);
      if (FixedVectorType *VT =
              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
                                                  Builder.getInt32(J)));
        }
      } else {
        InsertElem(I->getValueOperand());
      }
    }

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedStore(
        Vec,
        getLoadStorePointerOperand(C[0].Inst),
        Alignment);
  }

  propagateMetadata(VecInst, C);

  for (const ChainElem &E : C)
    ToErase.emplace_back(E.Inst);

  ++NumVectorInstructions;
  NumScalarsVectorized += C.size();
  return true;
}

template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                    << *ChainBegin << ")\n");

  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
  if (ChainElem == ChainBegin)
    return true;

  // Invariant loads can always be reordered; by definition they are not
  // clobbered by stores.
  if (isInvariantLoad(ChainElem))
    return true;

  auto BBIt = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainElem);
    else
      return BasicBlock::iterator(ChainElem);
  }());
  auto BBItEnd = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainBegin);
    else
      return BasicBlock::iterator(ChainBegin);
  }());

  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
  const unsigned ChainElemSize =
      DL.getTypeStoreSize(getLoadStoreType(ChainElem));

  for (; BBIt != BBItEnd; ++BBIt) {
    Instruction *I = &*BBIt;

    if (!I->mayReadOrWriteMemory())
      continue;

    // Loads can be reordered with other loads.
    if (IsLoadChain && isa<LoadInst>(I))
      continue;

    // Stores can be sunk below invariant loads.
    if (!IsLoadChain && isInvariantLoad(I))
      continue;

    // If I is in the chain, we can tell whether it aliases ChainElem by
    // checking what offset ChainElem accesses. This may be better than AA is
    // able to do.
    //
    // We should really only have duplicate offsets for stores (the duplicate
    // loads should be CSE'ed), but in case we have a duplicate load, we'll
    // split the chain so we don't have to handle this case specially.
    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
      // I and ChainElem overlap if:
      //   - I and ChainElem have the same offset, OR
      //   - I's offset is less than ChainElem's, but I touches past the
      //     beginning of ChainElem, OR
      //   - ChainElem's offset is less than I's, but ChainElem touches past the
      //     beginning of I.
      const APInt &IOffset = OffsetIt->second;
      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
      if (IOffset == ChainElemOffset ||
          (IOffset.sle(ChainElemOffset) &&
           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
          (ChainElemOffset.sle(IOffset) &&
           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
        LLVM_DEBUG({
          // Double check that AA also sees this alias. If not, we probably
          // have a bug.
          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
        });
        return false; // We found an aliasing instruction; bail.
      }

      continue; // We're confident there's no alias.
    }

    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                        << " Aliasing instruction:\n"
                        << " " << *I << '\n'
                        << " Aliased instruction and pointer:\n"
                        << " " << *ChainElem << '\n'
                        << " " << *getLoadStorePointerOperand(ChainElem)
                        << '\n');

      return false;
    }
  }
  return true;
}

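      //
      // For example (illustrative), a 4-byte access at offset 0 and a 4-byte
      // access at offset 2 overlap, because 0 <= 2 and 0 + 4 > 2.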
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
                    << ", MatchingOpIdxB=" << MatchingOpIdxB
                    << ", Signed=" << Signed << "\n");
  // If both OpA and OpB are adds with NSW/NUW and with one of the operands
  // being the same, we can guarantee that the transformation is safe if we can
  // prove that OpA won't overflow when IdxDiff is added to the other operand
  // of OpA. For example:
  //
  //   %tmp7 = add nsw i32 %tmp2, %v0
  //   %tmp8 = sext i32 %tmp7 to i64
  //   ...
  //   %tmp11 = add nsw i32 %v0, 1
  //   %tmp12 = add nsw i32 %tmp2, %tmp11
  //   %tmp13 = sext i32 %tmp12 to i64
  //
  // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
  // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
  // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
                    << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = GTIA.getSequentialElementStride(DL);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return std::nullopt;

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  if (IdxDiffSCEV == SE.getCouldNotCompute())
    return std::nullopt;

  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
  if (!IdxDiffRange.isSingleElement())
    return std::nullopt;
  APInt IdxDiff = *IdxDiffRange.getSingleElement();

  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
                    << "\n");

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is an operand which
    // is the same in those two instructions. Below we account for possible
    // orders of the operands of these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  //
  // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
  // order bit other than the sign bit are known to be zero in ValA, we can add
  // Diff to it while guaranteeing no overflow of any sort.
  //
  // If IdxDiff is negative, do the same, but swap ValA and ValB.
  if (!Safe) {
    // When computing known bits, use the GEPs as context instructions, since
    // they likely are in the same BB as the load/store.
    KnownBits Known(BitWidth);
    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
                     &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}

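  //
  // For example (illustrative), `gep i32, ptr %base, i64 %i` and
  // `gep i32, ptr %base, i64 %j` match here, and the code below tries to prove
  // that %j - %i is a known constant.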
std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", ContextInst="
                        << *ContextInst << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
      if (!TrueDiff)
        return std::nullopt;
      std::optional<APInt> FalseDiff =
          getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
                            ContextInst, Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}

1311 | void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const { |
1312 | if (EQClasses.size() < 2) // There is nothing to merge. |
1313 | return; |
1314 | |
1315 | // The reduced key has all elements of the ECClassKey except the underlying |
1316 | // object. Check that EqClassKey has 4 elements and define the reduced key. |
1317 | static_assert(std::tuple_size_v<EqClassKey> == 4, |
1318 | "EqClassKey has changed - EqClassReducedKey needs changes too" ); |
1319 | using EqClassReducedKey = |
1320 | std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */, |
1321 | std::tuple_element_t<2, EqClassKey> /* Element size */, |
1322 | std::tuple_element_t<3, EqClassKey> /* IsLoad; */>; |
1323 | using ECReducedKeyToUnderlyingObjectMap = |
1324 | MapVector<EqClassReducedKey, |
1325 | SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>; |
1326 | |
1327 | // Form a map from the reduced key (without the underlying object) to the |
1328 | // underlying objects: 1 reduced key to many underlying objects, to form |
1329 | // groups of potentially merge-able equivalence classes. |
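  // For example (hypothetical values), classes keyed {%p, AS 0, 32, IsLoad}
  // and {%q, AS 0, 32, IsLoad} share the reduced key {AS 0, 32, IsLoad}; they
  // become merge candidates, and are actually merged below only if one
  // underlying object resolves to the other.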
  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
  bool FoundPotentiallyOptimizableEC = false;
  for (const auto &EC : EQClasses) {
    const auto &Key = EC.first;
    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
                             std::get<3>(Key)};
    auto &UOMap = RedKeyToUOMap[RedKey];
    UOMap.insert(std::get<0>(Key));
    if (UOMap.size() > 1)
      FoundPotentiallyOptimizableEC = true;
  }
  if (!FoundPotentiallyOptimizableEC)
    return;

  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << " Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << " Inst: " << *Inst << '\n';
    }
  });
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
    for (const auto &RedKeyToUO : RedKeyToUOMap) {
      dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
             << std::get<1>(RedKeyToUO.first) << ", "
             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
             << RedKeyToUO.second.size() << " underlying objects:\n";
      for (auto UObject : RedKeyToUO.second)
        dbgs() << " " << *UObject << '\n';
    }
  });

  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;

  // Compute the ultimate targets for a set of underlying objects.
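  // An "ultimate target" is where an underlying object ends up after following
  // the 1-level indirections collected below. For example (hypothetical), if
  // %a, %b, and %c are all underlying objects in the group, %a's underlying
  // object at depth 1 is %b, and %b's is %c, then %a, %b, and %c all resolve
  // to the ultimate target %c.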
  auto GetUltimateTargets =
      [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
    UObjectToUObjectMap IndirectionMap;
    for (const auto *UObject : UObjects) {
      const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
      const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
      if (UltimateTarget != UObject)
        IndirectionMap[UObject] = UltimateTarget;
    }
    UObjectToUObjectMap UltimateTargetsMap;
    for (const auto *UObject : UObjects) {
      auto Target = UObject;
      auto It = IndirectionMap.find(Target);
      for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
        Target = It->second;
      UltimateTargetsMap[UObject] = Target;
    }
    return UltimateTargetsMap;
  };

  // For each item in RedKeyToUOMap, if it has more than one underlying object,
  // try to merge the equivalence classes.
  for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
    if (UObjects.size() < 2)
      continue;
    auto UTMap = GetUltimateTargets(UObjects);
    for (const auto &[UObject, UltimateTarget] : UTMap) {
      if (UObject == UltimateTarget)
        continue;

      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
                         std::get<2>(RedKey)};
      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
                       std::get<2>(RedKey)};
      // The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
      // request the reference to the instructions vector for KeyTo first.
      const auto &VecTo = EQClasses[KeyTo];
      const auto &VecFrom = EQClasses[KeyFrom];
      SmallVector<Instruction *, 8> MergedVec;
      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
                 std::back_inserter(MergedVec),
                 [](Instruction *A, Instruction *B) {
                   return A && B && A->comesBefore(B);
                 });
      EQClasses[KeyTo] = std::move(MergedVec);
      EQClasses.erase(KeyFrom);
    }
  }
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << " Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << " Inst: " << *Inst << '\n';
    }
  });
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(Val: ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for the true
      // and false values of the condition. Therefore, using the selects
      // themselves for grouping instructions would put consecutive accesses
      // into different lists; they would never even be checked for being
      // consecutive, and so would not be vectorized.
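      //
      // For illustration (hypothetical IR):
      //   %p = select i1 %c, ptr %a, ptr %b
      //   %q = select i1 %c, ptr %a4, ptr %b4  ; %a4/%b4 = 4 bytes past %a/%b
      // Keying both accesses on the shared condition %c keeps them in one
      // equivalence class, so getConstantOffsetSelects can later prove they
      // are 4 bytes apart.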
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes. They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    // functions currently use an integer type for the vectorized load/store,
    // and do not support casting between the integer type and a vector of
    // pointers (e.g. i64 to <2 x i16*>).
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;

    Ret[{GetUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .emplace_back(&I);
  }

  mergeEquivalenceClasses(Ret);
  return Ret;
}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
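  //
  // An intrusive list lets us move a chain's entry to the front in O(1) via
  // remove + push_front, which is all the MRU policy below needs.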
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

  // Compare each instruction in `Instrs` to the leaders of the N most
  // recently-used chains. This limits the O(n^2) behavior of this pass while
  // also allowing us to build arbitrarily long chains.
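  //
  // Concretely, with MaxChainsToTry == 64 below, each instruction is compared
  // against at most 64 chain leaders before a new chain is started for it.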
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      if (std::optional<APInt> Offset = getConstantOffset(
              getLoadStorePointerOperand(ChainIter->first),
              getLoadStorePointerOperand(I),
              /*ContextInst=*/
              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
        // `Offset` might not have the expected number of bits, if e.g. AS has
        // a different number of bits than opaque pointers.
        ChainIter->second.emplace_back(I, Offset.value());
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.emplace_back(I, ZeroOffset);
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.emplace_back(std::move(E.second));
  return Ret;
}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst=" << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
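  // Simple illustrative case (hypothetical IR): if PtrA and PtrB are
  //   getelementptr inbounds i8, ptr %base, i64 4
  //   getelementptr inbounds i8, ptr %base, i64 12
  // then stripping constant offsets leaves PtrA == PtrB == %base with
  // OffsetA == 4 and OffsetB == 12, and the branch below returns 8.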
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
  if (std::optional<APInt> Diff =
          getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}