//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
// - Break up each basic block into pseudo-BBs, composed of instructions which
//   are guaranteed to transfer control to their successors.
// - Within a single pseudo-BB, find all loads, and group them into
//   "equivalence classes" according to getUnderlyingObject() and loaded
//   element size. Do the same for stores.
// - For each equivalence class, greedily build "chains". Each chain has a
//   leader instruction, and every other member of the chain has a known
//   constant offset from the first instr in the chain.
// - Break up chains so that they contain only contiguous accesses of legal
//   size with no intervening may-alias instrs.
// - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.

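//
// As an illustration (a hypothetical example, not taken from a test), the pass
// rewrites IR roughly like this:
//
//   %p0 = getelementptr i32, ptr %p, i64 0
//   %p1 = getelementptr i32, ptr %p, i64 1
//   %x0 = load i32, ptr %p0, align 8
//   %x1 = load i32, ptr %p1, align 4
//
// into one wide access plus extracts:
//
//   %vec = load <2 x i32>, ptr %p0, align 8
//   %x0 = extractelement <2 x i32> %vec, i32 0
//   %x1 = extractelement <2 x i32> %vec, i32 1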
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// Equivalence class key, the initial tuple by which we group loads/stores.
// Loads/stores with different EqClassKeys are never merged.
//
// (We could in theory remove element-size from this tuple. We'd just need
// to fix up the vector packing/unpacking code.)
using EqClassKey =
    std::tuple<const Value * /* result of getUnderlyingObject() */,
               unsigned /* AddrSpace */,
               unsigned /* Load/Store element size bits */,
               char /* IsLoad; char b/c bool can't be a DenseMap key */
               >;
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                               const EqClassKey &K) {
  const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
  return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
            << " of element size " << ElementSize << " bits in addrspace "
            << AddrSpace;
}

// A Chain is a set of instructions such that:
// - All instructions have the same equivalence class, so in particular all are
//   loads, or all are stores.
// - We know the address accessed by the i'th chain elem relative to the
//   chain's leader instruction, which is the first instr of the chain in BB
//   order.
//
// Chains have two canonical orderings:
// - BB order, sorted by Instr->comesBefore.
// - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
struct ChainElem {
  Instruction *Inst;
  APInt OffsetFromLeader;
  ChainElem(Instruction *Inst, APInt OffsetFromLeader)
      : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
};
using Chain = SmallVector<ChainElem, 1>;

void sortChainInBBOrder(Chain &C) {
  sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
}

void sortChainInOffsetOrder(Chain &C) {
  sort(C, [](const auto &A, const auto &B) {
    if (A.OffsetFromLeader != B.OffsetFromLeader)
      return A.OffsetFromLeader.slt(B.OffsetFromLeader);
    return A.Inst->comesBefore(B.Inst); // stable tiebreaker
  });
}

[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
  for (const auto &E : C) {
    dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
  }
}

using EquivalenceClassMap =
    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough
constexpr unsigned StackAdjustedAlignment = 4;

Instruction *propagateMetadata(Instruction *I, const Chain &C) {
  SmallVector<Value *, 8> Values;
  for (const ChainElem &E : C)
    Values.emplace_back(E.Inst);
  return propagateMetadata(I, Values);
}

bool isInvariantLoad(const Instruction *I) {
  const LoadInst *LI = dyn_cast<LoadInst>(I);
  return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
}

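// For example (illustrative), an i32 load and a <2 x i32> load from the same
// underlying object and address space share a key: both have 32-bit elements
// and IsLoad set.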
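// For example (illustrative), three i32 loads from addresses p, p+8, and p+4
// could form a single chain; if the load from p appears first in BB order it
// is the leader, and the OffsetFromLeader values are 0, 8, and 4.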
/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.emplace_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int Idx = 0; Idx < NumOperands; Idx++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      assert(IM != I && "Unexpected cycle while re-ordering instructions");

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.emplace_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
    Instruction *IM = &*(BBI++);
    if (!InstructionsToMove.contains(IM))
      continue;
    IM->moveBefore(I->getIterator());
  }
}

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

  // We could erase instrs right after vectorizing them, but that can mess up
  // our BB iterators, and also can make the equivalence class keys point to
  // freed memory. This is fixable, but it's simpler just to wait until we're
  // done with the BB and erase all at once.
  SmallVector<Instruction *, 128> ToErase;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  static const unsigned MaxDepth = 3;

  /// Runs the vectorizer on a "pseudo basic block", which is a range of
  /// instructions [Begin, End) within one BB all of which have
  /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
  bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);

  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
  /// in the same BB with the same value for getUnderlyingObject() etc.
  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
                             ArrayRef<Instruction *> EqClass);

  /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
  /// where all instructions access a known, constant offset from the first
  /// instruction.
  bool runOnChain(Chain &C);

  /// Splits the chain into subchains of instructions which read/write a
  /// contiguous block of memory. Discards any length-1 subchains (because
  /// there's nothing to vectorize in there).
  std::vector<Chain> splitChainByContiguity(Chain &C);

  /// Splits the chain into subchains where it's safe to hoist loads up to the
  /// beginning of the sub-chain and it's safe to sink stores down to the end
  /// of the sub-chain. Discards any length-1 subchains.
  std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

  /// Splits the chain into subchains that make legal, aligned accesses.
  /// Discards any length-1 subchains.
  std::vector<Chain> splitChainByAlignment(Chain &C);

  /// Converts the instrs in the chain into a single vectorized load or store.
  /// Adds the old scalar loads/stores to ToErase.
  bool vectorizeChain(Chain &C);

  /// Tries to compute the offset in bytes PtrB - PtrA.
  std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
                                         Instruction *ContextInst,
                                         unsigned Depth = 0);
  std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
                                                     Instruction *ContextInst,
                                                     unsigned Depth);
  std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
                                                Instruction *ContextInst,
                                                unsigned Depth);

  /// Gets the element type of the vector that the chain will load or store.
  /// This is nontrivial because the chain may contain elements of different
  /// types; e.g. it's legal to have a chain that contains both i32 and float.
  Type *getChainElemTy(const Chain &C);

  /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
  /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
  /// instructions.
  ///
  /// The map ChainElemOffsets must contain all of the elements in
  /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
  /// address. It's ok if it contains additional entries.
  template <bool IsLoadChain>
  bool isSafeToMove(
      Instruction *ChainElem, Instruction *ChainBegin,
      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

  /// Merges the equivalence classes if they have underlying objects that differ
  /// by one level of indirection (i.e., one is a getelementptr and the other is
  /// the base pointer in that getelementptr).
  void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;

  /// Collects loads and stores grouped by "equivalence class", where:
  /// - all elements in an eq class are a load or all are a store,
  /// - they all load/store the same element size (it's OK to have e.g. i8 and
  ///   <4 x i8> in the same class, but not i32 and <4 x i8>), and
  /// - they all have the same value for getUnderlyingObject().
  EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                BasicBlock::iterator End);

  /// Partitions Instrs into "chains" where every instruction has a known
  /// constant offset from the first instr in the chain.
  ///
  /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0, because the
  /// first instr in the chain is the leader, and an instr touches distance 0
  /// from itself.
  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

bool Vectorizer::run() {
  bool Changed = false;
  // Break up the BB if there are any instrs which aren't guaranteed to transfer
  // execution to their successor.
  //
  // Consider, for example:
  //
  //   def assert_arr_len(int n) { if (n < 2) exit(); }
  //
  //   load arr[0]
  //   call assert_array_len(arr.length)
  //   load arr[1]
  //
  // Even though assert_arr_len does not read or write any memory, we can't
  // speculate the second load before the call. More info at
  // https://github.com/llvm/llvm-project/issues/52950.
  for (BasicBlock *BB : post_order(&F)) {
    // BB must at least have a terminator.
    assert(!BB->empty());

    SmallVector<BasicBlock::iterator, 8> Barriers;
    Barriers.emplace_back(BB->begin());
    for (Instruction &I : *BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
        Barriers.emplace_back(I.getIterator());
    Barriers.emplace_back(BB->end());

    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
         ++It)
      Changed |= runOnPseudoBB(*It, *std::next(It));

    for (Instruction *I : ToErase) {
      auto *PtrOperand = getLoadStorePointerOperand(I);
      if (I->use_empty())
        I->eraseFromParent();
      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
    }
    ToErase.clear();
  }

  return Changed;
}

bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                               BasicBlock::iterator End) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
    if (End != Begin->getParent()->end())
      dbgs() << *End;
    else
      dbgs() << "<BB end>";
    dbgs() << ")\n";
  });

  bool Changed = false;
  for (const auto &[EqClassKey, EqClass] :
       collectEquivalenceClasses(Begin, End))
    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

  return Changed;
}

bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
                                       ArrayRef<Instruction *> EqClass) {
  bool Changed = false;

  LLVM_DEBUG({
    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
           << " keyed on " << EqClassKey << ":\n";
    for (Instruction *I : EqClass)
      dbgs() << " " << *I << "\n";
  });

  std::vector<Chain> Chains = gatherChains(EqClass);
  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
                    << " nontrivial chains.\n";);
  for (Chain &C : Chains)
    Changed |= runOnChain(C);
  return Changed;
}

bool Vectorizer::runOnChain(Chain &C) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  // Split up the chain into increasingly smaller chains, until we can finally
  // vectorize the chains.
  //
  // (Don't be scared by the depth of the loop nest here. These operations are
  // all at worst O(n lg n) in the number of instructions, and splitting chains
  // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
  bool Changed = false;
  for (auto &C : splitChainByMayAliasInstrs(C))
    for (auto &C : splitChainByContiguity(C))
      for (auto &C : splitChainByAlignment(C))
        Changed |= vectorizeChain(C);
  return Changed;
}

std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
  if (C.empty())
    return {};

  sortChainInBBOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
    dumpChain(C);
  });

  // We know that elements in the chain with nonoverlapping offsets can't
  // alias, but AA may not be smart enough to figure this out. Use a
  // hashtable.
  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
  for (const auto &E : C)
    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

  // Loads get hoisted up to the first load in the chain. Stores get sunk
  // down to the last store in the chain. Our algorithm for loads is:
  //
  // - Take the first element of the chain. This is the start of a new chain.
  // - Take the next element of `Chain` and check for may-alias instructions
  //   up to the start of NewChain. If no may-alias instrs, add it to
  //   NewChain. Otherwise, start a new NewChain.
  //
  // For stores it's the same except in the reverse direction.
  //
  // We expect IsLoad to be an std::bool_constant.
  auto Impl = [&](auto IsLoad) {
    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
      if constexpr (IsLoad())
        return std::make_pair(C.begin(), C.end());
      else
        return std::make_pair(C.rbegin(), C.rend());
    }(IsLoad);
    assert(ChainBegin != ChainEnd);

    std::vector<Chain> Chains;
    SmallVector<ChainElem, 1> NewChain;
    NewChain.emplace_back(*ChainBegin);
    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
                               ChainOffsets)) {
        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
                          << "\n");
        NewChain.emplace_back(*ChainIt);
      } else {
        LLVM_DEBUG(
            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
        if (NewChain.size() > 1) {
          LLVM_DEBUG({
            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
            dumpChain(NewChain);
          });
          Chains.emplace_back(std::move(NewChain));
        }

        // Start a new chain.
        NewChain = SmallVector<ChainElem, 1>({*ChainIt});
      }
    }
    if (NewChain.size() > 1) {
      LLVM_DEBUG({
        dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
        dumpChain(NewChain);
      });
      Chains.emplace_back(std::move(NewChain));
    }
    return Chains;
  };

  if (isa<LoadInst>(C[0].Inst))
    return Impl(/*IsLoad=*/std::bool_constant<true>());

  assert(isa<StoreInst>(C[0].Inst));
  return Impl(/*IsLoad=*/std::bool_constant<false>());
}

std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByContiguity considering chain:\n";
    dumpChain(C);
  });

  std::vector<Chain> Ret;
  Ret.push_back({C.front()});

  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
    // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
    auto &CurChain = Ret.back();
    const ChainElem &Prev = CurChain.back();
    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
                              "collectEquivalenceClass");
    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

    // Add this instruction to the end of the current chain, or start a new one.
    bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
                      << (AreContiguous ? "" : "not ") << "contiguous: "
                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
                      << ") -> " << *It->Inst << " (starts at offset "
                      << It->OffsetFromLeader << ")\n");
    if (AreContiguous)
      CurChain.push_back(*It);
    else
      Ret.push_back({*It});
  }

  // Filter out length-1 chains, these are uninteresting.
  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
  return Ret;
}

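  // Illustrative example (hypothetical offsets): with 4-byte elements at
  // offsets {0, 4, 8, 20, 24}, we produce two subchains, {0, 4, 8} and
  // {20, 24}, because of the gap between 8 + 4 and 20.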
Type *Vectorizer::getChainElemTy(const Chain &C) {
  assert(!C.empty());
  // The rules are:
  // - If there are any pointer types in the chain, use an integer type.
  // - Prefer an integer type if it appears in the chain.
  // - Otherwise, use the first type in the chain.
  //
  // The rule about pointer types is a simplification when we merge e.g. a load
  // of a ptr and a double. There's no direct conversion from a ptr to a
  // double; it requires a ptrtoint followed by a bitcast.
  //
  // It's unclear to me if the other rules have any practical effect, but we do
  // it to match this pass's previous behavior.
  if (any_of(C, [](const ChainElem &E) {
        return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
      })) {
    return Type::getIntNTy(
        F.getContext(),
        DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
  }

  for (const ChainElem &E : C)
    if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
      return T;
  return getLoadStoreType(C[0].Inst)->getScalarType();
}

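  //
  // For example (illustrative), a chain of {load double, load ptr} yields an
  // integer type of the first element's width (i64, assuming 64-bit pointers
  // and doubles), while {load float, load i32} yields i32 because an integer
  // type is present in the chain.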
std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
  // We use a simple greedy algorithm.
  //  - Given a chain of length N, find all prefixes that
  //    (a) are not longer than the max register length, and
  //    (b) are a power of 2.
  //  - Starting from the longest prefix, try to create a vector of that length.
  //  - If one of them works, great. Repeat the algorithm on any remaining
  //    elements in the chain.
  //  - If none of them work, discard the first element and repeat on a chain
  //    of length N-1.
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByAlignment considering chain:\n";
    dumpChain(C);
  });

  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
                             unsigned ChainSizeBytes, VectorType *VecTy) {
    return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
                                                 ChainSizeBytes, VecTy)
                       : TTI.getStoreVectorFactor(VF, LoadStoreSize,
                                                  ChainSizeBytes, VecTy);
  };

#ifndef NDEBUG
  for (const auto &E : C) {
    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
    assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
           "Should have filtered out non-power-of-two elements in "
           "collectEquivalenceClasses.");
  }
#endif

  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

  std::vector<Chain> Ret;
  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
    // Find candidate chains of size not greater than the largest vector reg.
    // These chains are over the closed interval [CBegin, CEnd].
    SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
        CandidateChains;
    for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
      APInt Sz = C[CEnd].OffsetFromLeader +
                 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
                 C[CBegin].OffsetFromLeader;
      if (Sz.sgt(VecRegBytes))
        break;
      CandidateChains.emplace_back(CEnd,
                                   static_cast<unsigned>(Sz.getLimitedValue()));
    }

    // Consider the longest chain first.
    for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
         It != End; ++It) {
      auto [CEnd, SizeBytes] = *It;
      LLVM_DEBUG(
          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
                 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");

      Type *VecElemTy = getChainElemTy(C);
      // Note, VecElemTy is a power of 2, but might be less than one byte. For
      // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
      // VecElemTy would be i4.
      unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);

      // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
      assert((8 * SizeBytes) % VecElemBits == 0);
      unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
      FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
      unsigned VF = 8 * VecRegBytes / VecElemBits;

      // Check that TTI is happy with this vectorization factor.
      unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
                                          VecElemBits * NumVecElems / 8, VecTy);
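  //
  // Illustrative example: with 16-byte vector registers and a chain of eight
  // contiguous i32 accesses, the longest candidate starting at element 0 is
  // the 4-element (16-byte) prefix; if it vectorizes, we continue with the
  // remaining four elements.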
      if (TargetVF != VF && TargetVF < NumVecElems) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because TargetVF="
                   << TargetVF << " != VF=" << VF
                   << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
        continue;
      }

      // Is a load/store with this alignment allowed by TTI and at least as fast
      // as an unvectorized load/store?
      //
      // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
      auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
                               &F = F](Align Alignment) {
        if (Alignment.value() % SizeBytes == 0)
          return true;
        unsigned VectorizedSpeed = 0;
        bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
            F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
        if (!AllowsMisaligned) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " is misaligned, and therefore can't be vectorized.\n");
          return false;
        }

        unsigned ElementwiseSpeed = 0;
        (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
                                             Alignment, &ElementwiseSpeed);
        if (VectorizedSpeed < ElementwiseSpeed) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " has relative speed " << VectorizedSpeed
                     << ", which is lower than the elementwise speed of "
                     << ElementwiseSpeed
                     << ". Therefore this access won't be vectorized.\n");
          return false;
        }
        return true;
      };

      // If we're loading/storing from an alloca, align it if possible.
      //
      // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
      // tells us this is beneficial. This feels a bit odd, but it matches
      // existing tests. This isn't *so* bad, because at most we align to 4
      // bytes (current value of StackAdjustedAlignment).
      //
      // FIXME: We will upgrade the alignment of the alloca even if it turns out
      // we can't vectorize for some other reason.
      Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
      bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
                            isa<AllocaInst>(PtrOperand->stripPointerCasts());
      Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
      Align PrefAlign = Align(StackAdjustedAlignment);
      if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
          IsAllowedAndFast(PrefAlign)) {
        Align NewAlign = getOrEnforceKnownAlignment(
            PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
        if (NewAlign >= Alignment) {
          LLVM_DEBUG(dbgs()
                     << "LSV: splitByChain upgrading alloca alignment from "
                     << Alignment.value() << " to " << NewAlign.value()
                     << "\n");
          Alignment = NewAlign;
        }
      }

      if (!IsAllowedAndFast(Alignment)) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because its alignment is not AllowedAndFast: "
                   << Alignment.value() << "\n");
        continue;
      }

      if ((IsLoadChain &&
           !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
          (!IsLoadChain &&
           !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because !isLegalToVectorizeLoad/StoreChain.");
        continue;
      }

      // Hooray, we can vectorize this chain!
      Chain &NewChain = Ret.emplace_back();
      for (unsigned I = CBegin; I <= CEnd; ++I)
        NewChain.emplace_back(C[I]);
      CBegin = CEnd; // Skip over the instructions we've added to the chain.
      break;
    }
  }
  return Ret;
}

bool Vectorizer::vectorizeChain(Chain &C) {
  if (C.size() < 2)
    return false;

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  Type *VecElemTy = getChainElemTy(C);
  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned ChainBytes = std::accumulate(
      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
      });
  assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
  // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
  // than 1 byte (e.g. VecTy == <32 x i1>).
  Type *VecTy = FixedVectorType::get(
      VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

  Align Alignment = getLoadStoreAlignment(C[0].Inst);
  // If this is a load/store of an alloca, we might have upgraded the alloca's
  // alignment earlier. Get the new alignment.
  if (AS == DL.getAllocaAddrSpace()) {
    Alignment = std::max(
        Alignment,
        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
  }

  // All elements of the chain must have the same scalar-type size.
#ifndef NDEBUG
  for (const ChainElem &E : C)
    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
           DL.getTypeStoreSize(VecElemTy));
#endif

  Instruction *VecInst;
  if (IsLoadChain) {
    // Loads get hoisted to the location of the first load in the chain. We may
    // also need to hoist the (transitive) operands of the loads.
    Builder.SetInsertPoint(
        llvm::min_element(C, [](const auto &A, const auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedLoad(VecTy,
                                        getLoadStorePointerOperand(C[0].Inst),
                                        Alignment);

    unsigned VecIdx = 0;
    for (const ChainElem &E : C) {
      Instruction *I = E.Inst;
      Value *V;
      Type *T = getLoadStoreType(I);
      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
        auto Mask = llvm::to_vector<8>(
            llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
        VecIdx += VT->getNumElements();
      } else {
        V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                         I->getName());
        ++VecIdx;
      }
      if (V->getType() != I->getType())
        V = Builder.CreateBitOrPointerCast(V, I->getType());
      I->replaceAllUsesWith(V);
    }

    // Finally, we need to reorder the instrs in the BB so that the (transitive)
    // operands of VecInst appear before it. To see why, suppose we have
    // vectorized the following code:
    //
    //   ptr1 = gep a, 1
    //   load1 = load i32 ptr1
    //   ptr0 = gep a, 0
    //   load0 = load i32 ptr0
    //
    // We will put the vectorized load at the location of the earliest load in
    // the BB, i.e. load1. We get:
    //
    //   ptr1 = gep a, 1
    //   loadv = load <2 x i32> ptr0
    //   load0 = extractelement loadv, 0
    //   load1 = extractelement loadv, 1
    //   ptr0 = gep a, 0
    //
    // Notice that loadv uses ptr0, which is defined *after* it!
    reorder(VecInst);
  } else {
    // Stores get sunk to the location of the last store in the chain.
    Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
                             return A.Inst->comesBefore(B.Inst);
                           })->Inst);

    // Build the vector to store.
    Value *Vec = PoisonValue::get(VecTy);
    unsigned VecIdx = 0;
    auto InsertElem = [&](Value *V) {
      if (V->getType() != VecElemTy)
        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
    };
    for (const ChainElem &E : C) {
      auto *I = cast<StoreInst>(E.Inst);
      if (FixedVectorType *VT =
              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
                                                  Builder.getInt32(J)));
        }
      } else {
        InsertElem(I->getValueOperand());
      }
    }

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedStore(
        Vec,
        getLoadStorePointerOperand(C[0].Inst),
        Alignment);
  }

  propagateMetadata(VecInst, C);

  for (const ChainElem &E : C)
    ToErase.emplace_back(E.Inst);

  ++NumVectorInstructions;
  NumScalarsVectorized += C.size();
  return true;
}

template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                    << *ChainBegin << ")\n");

  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
  if (ChainElem == ChainBegin)
    return true;

  // Invariant loads can always be reordered; by definition they are not
  // clobbered by stores.
  if (isInvariantLoad(ChainElem))
    return true;

  auto BBIt = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainElem);
    else
      return BasicBlock::iterator(ChainElem);
  }());
  auto BBItEnd = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainBegin);
    else
      return BasicBlock::iterator(ChainBegin);
  }());

  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
  const unsigned ChainElemSize =
      DL.getTypeStoreSize(getLoadStoreType(ChainElem));

  for (; BBIt != BBItEnd; ++BBIt) {
    Instruction *I = &*BBIt;

    if (!I->mayReadOrWriteMemory())
      continue;

    // Loads can be reordered with other loads.
    if (IsLoadChain && isa<LoadInst>(I))
      continue;

    // Stores can be sunk below invariant loads.
    if (!IsLoadChain && isInvariantLoad(I))
      continue;

    // If I is in the chain, we can tell whether it aliases ChainElem by
    // checking what offset ChainElem accesses. This may be better than AA is
    // able to do.
    //
    // We should really only have duplicate offsets for stores (the duplicate
    // loads should be CSE'ed), but in case we have a duplicate load, we'll
    // split the chain so we don't have to handle this case specially.
    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
      // I and ChainElem overlap if:
      //   - I and ChainElem have the same offset, OR
      //   - I's offset is less than ChainElem's, but I touches past the
      //     beginning of ChainElem, OR
      //   - ChainElem's offset is less than I's, but ChainElem touches past the
      //     beginning of I.
      const APInt &IOffset = OffsetIt->second;
      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
      if (IOffset == ChainElemOffset ||
          (IOffset.sle(ChainElemOffset) &&
           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
          (ChainElemOffset.sle(IOffset) &&
           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
        LLVM_DEBUG({
          // Double check that AA also sees this alias. If not, we probably
          // have a bug.
          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
        });
        return false; // We found an aliasing instruction; bail.
      }

      continue; // We're confident there's no alias.
    }

    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                        << " Aliasing instruction:\n"
                        << " " << *I << '\n'
                        << " Aliased instruction and pointer:\n"
                        << " " << *ChainElem << '\n'
                        << " " << *getLoadStorePointerOperand(ChainElem)
                        << '\n');

      return false;
    }
  }
  return true;
}

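      //
      // For example (illustrative), a 4-byte access at offset 0 and a 4-byte
      // access at offset 2 overlap, because 0 <= 2 and 0 + 4 > 2.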
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
                    << ", MatchingOpIdxB=" << MatchingOpIdxB
                    << ", Signed=" << Signed << "\n");
  // If both OpA and OpB are adds with NSW/NUW and with one of the operands
  // being the same, we can guarantee that the transformation is safe if we can
  // prove that OpA won't overflow when IdxDiff is added to the other operand
  // of OpA. For example:
  //
  //   %tmp7 = add nsw i32 %tmp2, %v0
  //   %tmp8 = sext i32 %tmp7 to i64
  //   ...
  //   %tmp11 = add nsw i32 %v0, 1
  //   %tmp12 = add nsw i32 %tmp2, %tmp11
  //   %tmp13 = sext i32 %tmp12 to i64
  //
  // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
  // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
  // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
                    << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = GTIA.getSequentialElementStride(DL);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return std::nullopt;

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  if (IdxDiffSCEV == SE.getCouldNotCompute())
    return std::nullopt;

  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
  if (!IdxDiffRange.isSingleElement())
    return std::nullopt;
  APInt IdxDiff = *IdxDiffRange.getSingleElement();

  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
                    << "\n");

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is an operand which
    // is the same in those two instructions. Below we account for possible
    // orders of the operands of these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  //
  // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
  // order bit other than the sign bit are known to be zero in ValA, we can add
  // Diff to it while guaranteeing no overflow of any sort.
  //
  // If IdxDiff is negative, do the same, but swap ValA and ValB.
  if (!Safe) {
    // When computing known bits, use the GEPs as context instructions, since
    // they likely are in the same BB as the load/store.
    KnownBits Known(BitWidth);
    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
                     &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}

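  //
  // For example (illustrative), `gep i32, ptr %base, i64 %i` and
  // `gep i32, ptr %base, i64 %j` match here, and the code below tries to prove
  // that %j - %i is a known constant.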
std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", ContextInst="
                        << *ContextInst << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
      if (!TrueDiff)
        return std::nullopt;
      std::optional<APInt> FalseDiff =
          getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
                            ContextInst, Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}

1311 | void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const { |
1312 | if (EQClasses.size() < 2) // There is nothing to merge. |
1313 | return; |
1314 | |
1315 | // The reduced key has all elements of the ECClassKey except the underlying |
1316 | // object. Check that EqClassKey has 4 elements and define the reduced key. |
1317 | static_assert(std::tuple_size_v<EqClassKey> == 4, |
1318 | "EqClassKey has changed - EqClassReducedKey needs changes too" ); |
1319 | using EqClassReducedKey = |
1320 | std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */, |
1321 | std::tuple_element_t<2, EqClassKey> /* Element size */, |
1322 | std::tuple_element_t<3, EqClassKey> /* IsLoad; */>; |
1323 | using ECReducedKeyToUnderlyingObjectMap = |
1324 | MapVector<EqClassReducedKey, |
1325 | SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>; |
1326 | |
1327 | // Form a map from the reduced key (without the underlying object) to the |
1328 | // underlying objects: 1 reduced key to many underlying objects, to form |
1329 | // groups of potentially merge-able equivalence classes. |
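  // For example (hypothetical values), classes keyed {%p, AS 0, 32, IsLoad}
  // and {%q, AS 0, 32, IsLoad} share the reduced key {AS 0, 32, IsLoad}; they
  // become merge candidates, and are actually merged below only if one
  // underlying object resolves to the other.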
  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
  bool FoundPotentiallyOptimizableEC = false;
  for (const auto &EC : EQClasses) {
    const auto &Key = EC.first;
    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
                             std::get<3>(Key)};
    auto &UOMap = RedKeyToUOMap[RedKey];
    UOMap.insert(std::get<0>(Key));
    if (UOMap.size() > 1)
      FoundPotentiallyOptimizableEC = true;
  }
  if (!FoundPotentiallyOptimizableEC)
    return;

  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << " Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << " Inst: " << *Inst << '\n';
    }
  });
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
    for (const auto &RedKeyToUO : RedKeyToUOMap) {
      dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
             << std::get<1>(RedKeyToUO.first) << ", "
             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
             << RedKeyToUO.second.size() << " underlying objects:\n";
      for (auto UObject : RedKeyToUO.second)
        dbgs() << " " << *UObject << '\n';
    }
  });

  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;

  // Compute the ultimate targets for a set of underlying objects.
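  // An "ultimate target" is where an underlying object ends up after following
  // the 1-level indirections collected below. For example (hypothetical), if
  // %a, %b, and %c are all underlying objects in the group, %a's underlying
  // object at depth 1 is %b, and %b's is %c, then %a, %b, and %c all resolve
  // to the ultimate target %c.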
  auto GetUltimateTargets =
      [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
    UObjectToUObjectMap IndirectionMap;
    for (const auto *UObject : UObjects) {
      const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
      const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
      if (UltimateTarget != UObject)
        IndirectionMap[UObject] = UltimateTarget;
    }
    UObjectToUObjectMap UltimateTargetsMap;
    for (const auto *UObject : UObjects) {
      auto Target = UObject;
      auto It = IndirectionMap.find(Target);
      for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
        Target = It->second;
      UltimateTargetsMap[UObject] = Target;
    }
    return UltimateTargetsMap;
  };

  // For each item in RedKeyToUOMap, if it has more than one underlying object,
  // try to merge the equivalence classes.
  for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
    if (UObjects.size() < 2)
      continue;
    auto UTMap = GetUltimateTargets(UObjects);
    for (const auto &[UObject, UltimateTarget] : UTMap) {
      if (UObject == UltimateTarget)
        continue;

      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
                         std::get<2>(RedKey)};
      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
                       std::get<2>(RedKey)};
      // The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
      // request the reference to the instructions vector for KeyTo first.
      const auto &VecTo = EQClasses[KeyTo];
      const auto &VecFrom = EQClasses[KeyFrom];
      SmallVector<Instruction *, 8> MergedVec;
      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
                 std::back_inserter(MergedVec),
                 [](Instruction *A, Instruction *B) {
                   return A && B && A->comesBefore(B);
                 });
      EQClasses[KeyTo] = std::move(MergedVec);
      EQClasses.erase(KeyFrom);
    }
  }
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << " Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << " Inst: " << *Inst << '\n';
    }
  });
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(Val: ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for the true
      // and false values of the condition. Therefore, using the selects
      // themselves for grouping instructions would put consecutive accesses
      // into different lists; they would never even be checked for being
      // consecutive, and so would not be vectorized.
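      //
      // For illustration (hypothetical IR):
      //   %p = select i1 %c, ptr %a, ptr %b
      //   %q = select i1 %c, ptr %a4, ptr %b4  ; %a4/%b4 = 4 bytes past %a/%b
      // Keying both accesses on the shared condition %c keeps them in one
      // equivalence class, so getConstantOffsetSelects can later prove they
      // are 4 bytes apart.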
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes. They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    // functions currently use an integer type for the vectorized load/store,
    // and do not support casting between the integer type and a vector of
    // pointers (e.g. i64 to <2 x i16*>).
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;

    Ret[{GetUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .emplace_back(&I);
  }

  mergeEquivalenceClasses(Ret);
  return Ret;
}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
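  //
  // An intrusive list lets us move a chain's entry to the front in O(1) via
  // remove + push_front, which is all the MRU policy below needs.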
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

  // Compare each instruction in `Instrs` to the leaders of the N most
  // recently-used chains. This limits the O(n^2) behavior of this pass while
  // also allowing us to build arbitrarily long chains.
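  //
  // Concretely, with MaxChainsToTry == 64 below, each instruction is compared
  // against at most 64 chain leaders before a new chain is started for it.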
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      if (std::optional<APInt> Offset = getConstantOffset(
              getLoadStorePointerOperand(ChainIter->first),
              getLoadStorePointerOperand(I),
              /*ContextInst=*/
              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
        // `Offset` might not have the expected number of bits, if e.g. AS has
        // a different number of bits than opaque pointers.
        ChainIter->second.emplace_back(I, Offset.value());
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.emplace_back(I, ZeroOffset);
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.emplace_back(std::move(E.second));
  return Ret;
}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst=" << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
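  // Simple illustrative case (hypothetical IR): if PtrA and PtrB are
  //   getelementptr inbounds i8, ptr %base, i64 4
  //   getelementptr inbounds i8, ptr %base, i64 12
  // then stripping constant offsets leaves PtrA == PtrB == %base with
  // OffsetA == 4 and OffsetB == 12, and the branch below returns 8.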
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
  if (std::optional<APInt> Diff =
          getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}