InterleavedLoadCombinePass.cpp source code [llvm_projects/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp]

//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---- C++ --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
//
// This file defines the interleaved-load-combine pass. The pass searches for
// ShuffleVectorInst instructions that perform interleaving loads. If a
// matching pattern is found, it adds a combined load and further instructions
// in a pattern that is detectable by InterleavedAccessPass. The old
// instructions are left dead to be removed later. The pass is specifically
// designed to be executed just before InterleavedAccessPass to find any
// left-over instances that were not detected by earlier passes.
//
//===----------------------------------------------------------------------===//
20
21	#include "llvm/ADT/Statistic.h"
22	#include "llvm/Analysis/MemorySSA.h"
23	#include "llvm/Analysis/MemorySSAUpdater.h"
24	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
25	#include "llvm/Analysis/TargetTransformInfo.h"
26	#include "llvm/CodeGen/InterleavedLoadCombine.h"
27	#include "llvm/CodeGen/Passes.h"
28	#include "llvm/CodeGen/TargetLowering.h"
29	#include "llvm/CodeGen/TargetPassConfig.h"
30	#include "llvm/CodeGen/TargetSubtargetInfo.h"
31	#include "llvm/IR/DataLayout.h"
32	#include "llvm/IR/Dominators.h"
33	#include "llvm/IR/Function.h"
34	#include "llvm/IR/IRBuilder.h"
35	#include "llvm/IR/Instructions.h"
36	#include "llvm/IR/Module.h"
37	#include "llvm/InitializePasses.h"
38	#include "llvm/Pass.h"
39	#include "llvm/Support/Debug.h"
40	#include "llvm/Support/ErrorHandling.h"
41	#include "llvm/Support/raw_ostream.h"
42	#include "llvm/Target/TargetMachine.h"
43
44	#include <algorithm>
45	#include <cassert>
46	#include <list>
47
48	using namespace llvm;
49
50	#define DEBUG_TYPE "interleaved-load-combine"
51
52	namespace {
53
54	/// Statistic counter
55	STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
56
57	/// Option to disable the pass
58	static cl::opt<bool> DisableInterleavedLoadCombine(
59	"disable-" DEBUG_TYPE, cl::init(Val: false), cl::Hidden,
60	cl::desc ("Disable combining of interleaved loads"));
61
62	struct VectorInfo;
63
64	struct InterleavedLoadCombineImpl {
65	public:
66	InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
67	const TargetTransformInfo &TTI,
68	const TargetMachine &TM)
69	: F(F), DT(DT), MSSA(MSSA),
70	TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}
71
72	/// Scan the function for interleaved load candidates and execute the
73	/// replacement if applicable.
74	bool run();
75
76	private:
77	/// Function this pass is working on
78	Function &F;
79
80	/// Dominator Tree Analysis
81	DominatorTree &DT;
82
83	/// Memory Alias Analyses
84	MemorySSA &MSSA;
85
86	/// Target Lowering Information
87	const TargetLowering &TLI;
88
89	/// Target Transform Information
90	const TargetTransformInfo &TTI;
91
92	/// Find the instruction in sets LIs that dominates all others, return nullptr
93	/// if there is none.
94	LoadInst findFirstLoad(const* std::set<LoadInst *> &LIs);
95
96	/// Replace interleaved load candidates. It does additional
97	/// analyses if this makes sense. Returns true on success and false
98	/// of nothing has been changed.
99	bool combine(std::list<VectorInfo> &InterleavedLoad,
100	OptimizationRemarkEmitter &ORE);
101
102	/// Given a set of VectorInfo containing candidates for a given interleave
103	/// factor, find a set that represents a 'factor' interleaved load.
104	bool findPattern(std::list<VectorInfo> &Candidates,
105	std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
106	const DataLayout &DL);
107	}; // InterleavedLoadCombine
108
/// First Order Polynomial on an n-Bit Integer Value
///
/// Polynomial(Value) = Value * B + A + E*2^(n-e)
///
/// A and B are the coefficients. E*2^(n-e) is an error within the 'e' most
/// significant bits. It is introduced if an exact computation cannot be proven
/// (e.g. division by 2).
///
/// As part of this optimization multiple loads will be combined. It is
/// necessary to prove that loads are within some relative offset to each
/// other. This class is used to prove relative offsets of values loaded from
/// memory.
///
/// Representing an integer in this form is sound since addition in two's
/// complement is associative (trivial) and multiplication distributes over the
/// addition (see Proof(1) in Polynomial::mul). Further, both operations
/// commute.
//
// Example:
// declare @fn(i64 %IDX, <4 x float>* %PTR) {
//   %Pa1 = add i64 %IDX, 2
//   %Pa2 = lshr i64 %Pa1, 1
//   %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
//   %Va = load <4 x float>, <4 x float>* %Pa3
//
//   %Pb1 = add i64 %IDX, 4
//   %Pb2 = lshr i64 %Pb1, 1
//   %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
//   %Vb = load <4 x float>, <4 x float>* %Pb3
// ... }
//
// The goal is to prove that two loads load consecutive addresses.
//
// In this case the polynomials are constructed by the following
// steps.
//
// The number tag #e specifies the error bits.
//
// Pa_0 = %IDX              #0
// Pa_1 = %IDX + 2          #0 | add 2
// Pa_2 = %IDX/2 + 1        #1 | lshr 1
// Pa_3 = %IDX/2 + 1        #1 | GEP, step signext to i64
// Pa_4 = (%IDX/2)*16 + 16  #0 | GEP, multiply index by sizeof(<4 x float>)
// Pa_5 = (%IDX/2)*16 + 16  #0 | GEP, add offset of leading components
//
// Pb_0 = %IDX              #0
// Pb_1 = %IDX + 4          #0 | add 4
// Pb_2 = %IDX/2 + 2        #1 | lshr 1
// Pb_3 = %IDX/2 + 2        #1 | GEP, step signext to i64
// Pb_4 = (%IDX/2)*16 + 32  #0 | GEP, multiply index by sizeof(<4 x float>)
// Pb_5 = (%IDX/2)*16 + 32  #0 | GEP, add offset of leading components
//
// Pb_5 - Pa_5 = 16         #0 | subtract to get the offset
//
// Remark: %PTR is not maintained within this class. So in this instance the
// offset of 16 can only be assumed if the pointers are equal.
//
165	class Polynomial {
166	/// Operations on B
167	enum BOps {
168	LShr,
169	Mul,
170	SExt,
171	Trunc,
172	};
173
174	/// Number of Error Bits e
175	unsigned ErrorMSBs = (unsigned)-`1`;
176
177	/// Value
178	Value V = nullptr*;
179
180	/// Coefficient B
181	SmallVector<std::pair<BOps, APInt>, `4`> B;
182
183	/// Coefficient A
184	APInt A;
185
186	public:
187	Polynomial(Value *V) : V(V) {
188	IntegerType *Ty = dyn_cast<IntegerType>(Val: V->getType());
189	if (Ty) {
190	ErrorMSBs = `0`;
191	this->V = V;
192	A = APInt (Ty->getBitWidth(), `0`);
193	}
194	}
195
196	Polynomial(const APInt &A, unsigned ErrorMSBs = `0`)
197	: ErrorMSBs(ErrorMSBs), A (A) {}
198
199	Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = `0`)
200	: ErrorMSBs(ErrorMSBs), A (BitWidth, A) {}
201
202	Polynomial() = default;
203
204	/// Increment and clamp the number of undefined bits.
205	void incErrorMSBs(unsigned amt) {
206	if (ErrorMSBs == (unsigned)-`1`)
207	return;
208
209	ErrorMSBs += amt;
210	if (ErrorMSBs > A.getBitWidth())
211	ErrorMSBs = A.getBitWidth();
212	}
213
214	/// Decrement and clamp the number of undefined bits.
215	void decErrorMSBs(unsigned amt) {
216	if (ErrorMSBs == (unsigned)-`1`)
217	return;
218
219	if (ErrorMSBs > amt)
220	ErrorMSBs -= amt;
221	else
222	ErrorMSBs = `0`;
223	}
224
225	/// Apply an add on the polynomial
226	Polynomial &add(const APInt &C) {
227	// Note: Addition is associative in two's complement even when in case of
228	// signed overflow.
229	//
230	// Error bits can only propagate into higher significant bits. As these are
231	// already regarded as undefined, there is no change.
232	//
233	// Theorem: Adding a constant to a polynomial does not change the error
234	// term.
235	//
236	// Proof:
237	//
238	// Since the addition is associative and commutes:
239	//
240	// (B + A + E2^(n-e)) + C = B + (A + C) + E2^(n-e)
241	// [qed]
242
243	if (C.getBitWidth() != A.getBitWidth()) {
244	ErrorMSBs = (unsigned)-`1`;
245	return *this;
246	}
247
248	A += C;
249	return *this;
250	}
251
252	/// Apply a multiplication onto the polynomial.
253	Polynomial &mul(const APInt &C) {
254	// Note: Multiplication distributes over the addition
255	//
256	// Theorem: Multiplication distributes over the addition
257	//
258	// Proof(1):
259	//
260	// (B+A)C =-*
261	// = (B + A) + (B + A) + .. {C Times}
262	// addition is associative and commutes, hence
263	// = B + B + .. {C Times} .. + A + A + .. {C times}
264	// = BC + AC
265	// (see (function add) for signed values and overflows)
266	// [qed]
267	//
268	// Theorem: If C has c trailing zeros, errors bits in A or B are shifted out
269	// to the left.
270	//
271	// Proof(2):
272	//
273	// Let B' and A' be the n-Bit inputs with some unknown errors EA,
274	// EB at e leading bits. B' and A' can be written down as:
275	//
276	// B' = B + 2^(n-e)EB*
277	// A' = A + 2^(n-e)EA*
278	//
279	// Let C' be an input with c trailing zero bits. C' can be written as
280	//
281	// C' = C2^c*
282	//
283	// Therefore we can compute the result by using distributivity and
284	// commutativity.
285	//
286	// (B'C' + A'C') = [B + 2^(n-e)EB] * C' + [A + 2^(n-e)EA] C' =*
287	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] C' =*
288	// = (B'+A') C' =*
289	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] C' =*
290	// = [B + A + 2^(n-e)EB + 2^(n-e)EA] C' =*
291	// = (B + A) C' + [2^(n-e)EB + 2^(n-e)EA)] * C' =*
292	// = (B + A) C' + [2^(n-e)EB + 2^(n-e)EA)] * C2^c =
293	// = (B + A) C' + C(EB + EA)2^(n-e)2^c =
294	//
295	// Let EC be the final error with EC = C(EB + EA)*
296	//
297	// = (B + A)C' + EC2^(n-e)2^c =*
298	// = (B + A)C' + EC2^(n-(e-c))
299	//
300	// Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
301	// less error bits than the input. c bits are shifted out to the left.
302	// [qed]
303
304	if (C.getBitWidth() != A.getBitWidth()) {
305	ErrorMSBs = (unsigned)-`1`;
306	return *this;
307	}
308
309	// Multiplying by one is a no-op.
310	if (C.isOne()) {
311	return *this;
312	}
313
314	// Multiplying by zero removes the coefficient B and defines all bits.
315	if (C.isZero()) {
316	ErrorMSBs = `0`;
317	deleteB();
318	}
319
320	// See Proof(2): Trailing zero bits indicate a left shift. This removes
321	// leading bits from the result even if they are undefined.
322	decErrorMSBs(amt: C.countr_zero());
323
324	A *= C;
325	pushBOperation(Op: Mul, C);
326	return *this;
327	}
328
329	/// Apply a logical shift right on the polynomial
330	Polynomial &lshr(const APInt &C) {
331	// Theorem(1): (B + A + E2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'2^(n-e')
332	// where
333	// e' = e + 1,
334	// E is a e-bit number,
335	// E' is a e'-bit number,
336	// holds under the following precondition:
337	// pre(1): A % 2 = 0
338	// pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
339	// where >> expresses a logical shift to the right, with adding zeros.
340	//
341	// We need to show that for every, E there is a E'
342	//
343	// B = b_h 2^(n-1) + b_m * 2 + b_l*
344	// A = a_h 2^(n-1) + a_m * 2 (pre(1))*
345	//
346	// where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
347	//
348	// Let X = (B + A + E2^(n-e)) >> 1*
349	// Let Y = (B >> 1) + (A >> 1) + E2^(n-e) >> 1*
350	//
351	// X = [B + A + E2^(n-e)] >> 1 =*
352	// = [ b_h 2^(n-1) + b_m * 2 + b_l +*
353	// + a_h 2^(n-1) + a_m * 2 +*
354	// + E 2^(n-e) ] >> 1 =*
355	//
356	// The sum is built by putting the overflow of [a_m + b+n] into the term
357	// 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
358	// this bit is discarded. This is expressed by % 2.
359	//
360	// The bit in position 0 cannot overflow into the term (b_m + a_m).
361	//
362	// = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-1) +*
363	// + ((b_m + a_m) % 2^(n-2)) 2 +*
364	// + b_l + E 2^(n-e) ] >> 1 =*
365	//
366	// The shift is computed by dividing the terms by 2 and by cutting off
367	// b_l.
368	//
369	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
370	// + ((b_m + a_m) % 2^(n-2)) +
371	// + E 2^(n-(e+1)) =*
372	//
373	// by the definition in the Theorem e+1 = e'
374	//
375	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
376	// + ((b_m + a_m) % 2^(n-2)) +
377	// + E 2^(n-e') =*
378	//
379	// Compute Y by applying distributivity first
380	//
381	// Y = (B >> 1) + (A >> 1) + E2^(n-e') =*
382	// = (b_h 2^(n-1) + b_m * 2 + b_l) >> 1 +*
383	// + (a_h 2^(n-1) + a_m * 2) >> 1 +*
384	// + E 2^(n-e) >> 1 =*
385	//
386	// Again, the shift is computed by dividing the terms by 2 and by cutting
387	// off b_l.
388	//
389	// = b_h 2^(n-2) + b_m +*
390	// + a_h 2^(n-2) + a_m +*
391	// + E 2^(n-(e+1)) =*
392	//
393	// Again, the sum is built by putting the overflow of [a_m + b+n] into
394	// the term 2^(n-1). But this time there is room for a second bit in the
395	// term 2^(n-2) we add this bit to a new term and denote it o_h in a
396	// second step.
397	//
398	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) 2^(n-1) +*
399	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
400	// + ((b_m + a_m) % 2^(n-2)) +
401	// + E 2^(n-(e+1)) =*
402	//
403	// Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
404	// Further replace e+1 by e'.
405	//
406	// = o_h 2^(n-1) +*
407	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
408	// + ((b_m + a_m) % 2^(n-2)) +
409	// + E 2^(n-e') =*
410	//
411	// Move o_h into the error term and construct E'. To ensure that there is
412	// no 2^x with negative x, this step requires pre(2) (e < n).
413	//
414	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
415	// + ((b_m + a_m) % 2^(n-2)) +
416	// + o_h 2^(e'-1) * 2^(n-e') + \| pre(2), move 2^(e'-1)*
417	// \| out of the old exponent
418	// + E 2^(n-e') =*
419	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
420	// + ((b_m + a_m) % 2^(n-2)) +
421	// + [o_h 2^(e'-1) + E] * 2^(n-e') + \| move 2^(e'-1) out of*
422	// \| the old exponent
423	//
424	// Let E' = o_h 2^(e'-1) + E*
425	//
426	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
427	// + ((b_m + a_m) % 2^(n-2)) +
428	// + E' 2^(n-e')*
429	//
430	// Because X and Y are distinct only in there error terms and E' can be
431	// constructed as shown the theorem holds.
432	// [qed]
433	//
434	// For completeness in case of the case e=n it is also required to show that
435	// distributivity can be applied.
436	//
437	// In this case Theorem(1) transforms to (the pre-condition on A can also be
438	// dropped)
439	//
440	// Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
441	// where
442	// A, B, E, E' are two's complement numbers with the same bit
443	// width
444	//
445	// Let A + B + E = X
446	// Let (B >> 1) + (A >> 1) = Y
447	//
448	// Therefore we need to show that for every X and Y there is an E' which
449	// makes the equation
450	//
451	// X = Y + E'
452	//
453	// hold. This is trivially the case for E' = X - Y.
454	//
455	// [qed]
456	//
457	// Remark: Distributing lshr with and arbitrary number n can be expressed as
458	// ((((B + A) lshr 1) lshr 1) ... ) {n times}.
459	// This construction induces n additional error bits at the left.
460
461	if (C.getBitWidth() != A.getBitWidth()) {
462	ErrorMSBs = (unsigned)-`1`;
463	return *this;
464	}
465
466	if (C.isZero())
467	return *this;
468
469	// Test if the result will be zero
470	unsigned shiftAmt = C.getZExtValue();
471	if (shiftAmt >= C.getBitWidth())
472	return mul(C: APInt (C.getBitWidth(), `0`));
473
474	// The proof that shiftAmt LSBs are zero for at least one summand is only
475	// possible for the constant number.
476	//
477	// If this can be proven add shiftAmt to the error counter
478	// `ErrorMSBs`. Otherwise set all bits as undefined.
479	if (A.countr_zero() < shiftAmt)
480	ErrorMSBs = A.getBitWidth();
481	else
482	incErrorMSBs(amt: shiftAmt);
483
484	// Apply the operation.
485	pushBOperation(Op: LShr, C);
486	A = A.lshr(shiftAmt);
487
488	return *this;
489	}
490
491	/// Apply a sign-extend or truncate operation on the polynomial.
492	Polynomial &sextOrTrunc(unsigned n) {
493	if (n < A.getBitWidth()) {
494	// Truncate: Clearly undefined Bits on the MSB side are removed
495	// if there are any.
496	decErrorMSBs(amt: A.getBitWidth() - n);
497	A = A.trunc(width: n);
498	pushBOperation(Op: Trunc, C: APInt (sizeof(n) * `8`, n));
499	}
500	if (n > A.getBitWidth()) {
501	// Extend: Clearly extending first and adding later is different
502	// to adding first and extending later in all extended bits.
503	incErrorMSBs(amt: n - A.getBitWidth());
504	A = A.sext(width: n);
505	pushBOperation(Op: SExt, C: APInt (sizeof(n) * `8`, n));
506	}
507
508	return *this;
509	}
510
511	/// Test if there is a coefficient B.
512	bool isFirstOrder() const { return V != nullptr; }
513
514	/// Test coefficient B of two Polynomials are equal.
515	bool isCompatibleTo(const Polynomial &o) const {
516	// The polynomial use different bit width.
517	if (A.getBitWidth() != o.A.getBitWidth())
518	return false;
519
520	// If neither Polynomial has the Coefficient B.
521	if (!isFirstOrder() && !o.isFirstOrder())
522	return true;
523
524	// The index variable is different.
525	if (V != o.V)
526	return false;
527
528	// Check the operations.
529	if (B.size() != o.B.size())
530	return false;
531
532	auto *ob = o.B.begin();
533	for (const auto &b : B) {
534	if (b != *ob)
535	return false;
536	ob++;
537	}
538
539	return true;
540	}
541
542	/// Subtract two polynomials, return an undefined polynomial if
543	/// subtraction is not possible.
544	Polynomial operator-(const Polynomial &o) const {
545	// Return an undefined polynomial if incompatible.
546	if (!isCompatibleTo(o))
547	return Polynomial ();
548
549	// If the polynomials are compatible (meaning they have the same
550	// coefficient on B), B is eliminated. Thus a polynomial solely
551	// containing A is returned
552	return Polynomial (A - o.A, std::max(a: ErrorMSBs, b: o.ErrorMSBs));
553	}
554
555	/// Subtract a constant from a polynomial,
556	Polynomial operator-(uint64_t C) const {
557	Polynomial Result(*this);
558	Result.A -= C;
559	return Result;
560	}
561
562	/// Add a constant to a polynomial,
563	Polynomial operator+(uint64_t C) const {
564	Polynomial Result(*this);
565	Result.A += C;
566	return Result;
567	}
568
569	/// Returns true if it can be proven that two Polynomials are equal.
570	bool isProvenEqualTo(const Polynomial &o) {
571	// Subtract both polynomials and test if it is fully defined and zero.
572	Polynomial r = *this - o;
573	return (r.ErrorMSBs == `0`) && (!r.isFirstOrder()) && (r.A.isZero());
574	}
575
576	/// Print the polynomial into a stream.
577	void print(raw_ostream &OS) const {
578	OS << "[{#ErrBits:" << ErrorMSBs << "} ";
579
580	if (V) {
581	for (auto b : B)
582	OS << "(";
583	OS << "(" << *V << ") ";
584
585	for (auto b : B) {
586	switch (b.first) {
587	case LShr:
588	OS << "LShr ";
589	break;
590	case Mul:
591	OS << "Mul ";
592	break;
593	case SExt:
594	OS << "SExt ";
595	break;
596	case Trunc:
597	OS << "Trunc ";
598	break;
599	}
600
601	OS << b.second << ") ";
602	}
603	}
604
605	OS << "+ " << A << "]";
606	}
607
608	private:
609	void deleteB() {
610	V = nullptr;
611	B.clear();
612	}
613
614	void pushBOperation(const BOps Op, const APInt &C) {
615	if (isFirstOrder()) {
616	B.push_back(Elt: std::make_pair(x: Op, y: C));
617	return;
618	}
619	}
620	};
621
622	#ifndef NDEBUG
623	static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
624	S.print(OS);
625	return OS;
626	}
627	#endif
628
629	/// VectorInfo stores abstract the following information for each vector
630	/// element:
631	///
632	/// 1) The memory address loaded into the element as Polynomial
633	/// 2) a set of load instruction necessary to construct the vector,
634	/// 3) a set of all other instructions that are necessary to create the vector and
635	/// 4) a pointer value that can be used as relative base for all elements.
636	struct VectorInfo {
637	private:
638	VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
639	llvm_unreachable(
640	"Copying VectorInfo is neither implemented nor necessary,");
641	}
642
643	public:
644	/// Information of a Vector Element
645	struct ElementInfo {
646	/// Offset Polynomial.
647	Polynomial Ofs;
648
649	/// The Load Instruction used to Load the entry. LI is null if the pointer
650	/// of the load instruction does not point on to the entry
651	LoadInst *LI;
652
653	ElementInfo(Polynomial Offset = Polynomial (), LoadInst LI = nullptr*)
654	: Ofs (Offset), LI(LI) {}
655	};
656
657	/// Basic-block the load instructions are within
658	BasicBlock BB = nullptr*;
659
660	/// Pointer value of all participation load instructions
661	Value PV = nullptr*;
662
663	/// Participating load instructions
664	std::set<LoadInst *> LIs;
665
666	/// Participating instructions
667	std::set<Instruction *> Is;
668
669	/// Final shuffle-vector instruction
670	ShuffleVectorInst SVI = nullptr*;
671
672	/// Information of the offset for each vector element
673	ElementInfo *EI;
674
675	/// Vector Type
676	FixedVectorType *const VTy;
677
678	VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
679	EI = new ElementInfo[VTy->getNumElements()];
680	}
681
682	VectorInfo &operator=(const VectorInfo &other) = delete;
683
684	virtual ~VectorInfo() { delete[] EI; }
685
686	unsigned getDimension() const { return VTy->getNumElements(); }
687
688	/// Test if the VectorInfo can be part of an interleaved load with the
689	/// specified factor.
690	///
691	/// \param Factor of the interleave
692	/// \param DL Targets Datalayout
693	///
694	/// \returns true if this is possible and false if not
695	bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
696	unsigned Size = DL.getTypeAllocSize(Ty: VTy->getElementType());
697	for (unsigned i = `1`; i < getDimension(); i++) {
698	if (!EI[i].Ofs.isProvenEqualTo(o: EI[`0`].Ofs + i * Factor * Size)) {
699	return false;
700	}
701	}
702	return true;
703	}
704
705	/// Recursively computes the vector information stored in V.
706	///
707	/// This function delegates the work to specialized implementations
708	///
709	/// \param V Value to operate on
710	/// \param Result Result of the computation
711	///
712	/// \returns false if no sensible information can be gathered.
713	static bool compute(Value V, VectorInfo &Result, const* DataLayout &DL) {
714	ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Val: V);
715	if (SVI)
716	return computeFromSVI(SVI, Result, DL);
717	LoadInst *LI = dyn_cast<LoadInst>(Val: V);
718	if (LI)
719	return computeFromLI(LI, Result, DL);
720	BitCastInst *BCI = dyn_cast<BitCastInst>(Val: V);
721	if (BCI)
722	return computeFromBCI(BCI, Result, DL);
723	return false;
724	}
725
726	/// BitCastInst specialization to compute the vector information.
727	///
728	/// \param BCI BitCastInst to operate on
729	/// \param Result Result of the computation
730	///
731	/// \returns false if no sensible information can be gathered.
732	static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
733	const DataLayout &DL) {
734	Instruction *Op = dyn_cast<Instruction>(Val: BCI->getOperand(i_nocapture: `0`));
735
736	if (!Op)
737	return false;
738
739	FixedVectorType *VTy = dyn_cast<FixedVectorType>(Val: Op->getType());
740	if (!VTy)
741	return false;
742
743	// We can only cast from large to smaller vectors
744	if (Result.VTy->getNumElements() % VTy->getNumElements())
745	return false;
746
747	unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
748	unsigned NewSize = DL.getTypeAllocSize(Ty: Result.VTy->getElementType());
749	unsigned OldSize = DL.getTypeAllocSize(Ty: VTy->getElementType());
750
751	if (NewSize * Factor != OldSize)
752	return false;
753
754	VectorInfo Old(VTy);
755	if (!compute(V: Op, Result&: Old, DL))
756	return false;
757
758	for (unsigned i = `0`; i < Result.VTy->getNumElements(); i += Factor) {
759	for (unsigned j = `0`; j < Factor; j++) {
760	Result.EI[i + j] =
761	ElementInfo (Old.EI[i / Factor].Ofs + j * NewSize,
762	j == `0` ? Old.EI[i / Factor].LI : nullptr);
763	}
764	}
765
766	Result.BB = Old.BB;
767	Result.PV = Old.PV;
768	Result.LIs.insert(first: Old.LIs.begin(), last: Old.LIs.end());
769	Result.Is.insert(first: Old.Is.begin(), last: Old.Is.end());
770	Result.Is.insert(x: BCI);
771	Result.SVI = nullptr;
772
773	return true;
774	}
775
776	/// ShuffleVectorInst specialization to compute vector information.
777	///
778	/// \param SVI ShuffleVectorInst to operate on
779	/// \param Result Result of the computation
780	///
781	/// Compute the left and the right side vector information and merge them by
782	/// applying the shuffle operation. This function also ensures that the left
783	/// and right side have compatible loads. This means that all loads are with
784	/// in the same basic block and are based on the same pointer.
785	///
786	/// \returns false if no sensible information can be gathered.
787	static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
788	const DataLayout &DL) {
789	FixedVectorType *ArgTy =
790	cast<FixedVectorType>(Val: SVI->getOperand(i_nocapture: `0`)->getType());
791
792	// Compute the left hand vector information.
793	VectorInfo LHS(ArgTy);
794	if (!compute(V: SVI->getOperand(i_nocapture: `0`), Result&: LHS, DL))
795	LHS.BB = nullptr;
796
797	// Compute the right hand vector information.
798	VectorInfo RHS(ArgTy);
799	if (!compute(V: SVI->getOperand(i_nocapture: `1`), Result&: RHS, DL))
800	RHS.BB = nullptr;
801
802	// Neither operand produced sensible results?
803	if (!LHS.BB && !RHS.BB)
804	return false;
805	// Only RHS produced sensible results?
806	else if (!LHS.BB) {
807	Result.BB = RHS.BB;
808	Result.PV = RHS.PV;
809	}
810	// Only LHS produced sensible results?
811	else if (!RHS.BB) {
812	Result.BB = LHS.BB;
813	Result.PV = LHS.PV;
814	}
815	// Both operands produced sensible results?
816	else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
817	Result.BB = LHS.BB;
818	Result.PV = LHS.PV;
819	}
820	// Both operands produced sensible results but they are incompatible.
821	else {
822	return false;
823	}
824
825	// Merge and apply the operation on the offset information.
826	if (LHS.BB) {
827	Result.LIs.insert(first: LHS.LIs.begin(), last: LHS.LIs.end());
828	Result.Is.insert(first: LHS.Is.begin(), last: LHS.Is.end());
829	}
830	if (RHS.BB) {
831	Result.LIs.insert(first: RHS.LIs.begin(), last: RHS.LIs.end());
832	Result.Is.insert(first: RHS.Is.begin(), last: RHS.Is.end());
833	}
834	Result.Is.insert(x: SVI);
835	Result.SVI = SVI;
836
837	int j = `0`;
838	for (int i : SVI->getShuffleMask()) {
839	assert((i < `2` * (signed)ArgTy->getNumElements()) &&
840	"Invalid ShuffleVectorInst (index out of bounds)");
841
842	if (i < `0`)
843	Result.EI[j] = ElementInfo ();
844	else if (i < (signed)ArgTy->getNumElements()) {
845	if (LHS.BB)
846	Result.EI[j] = LHS.EI[i];
847	else
848	Result.EI[j] = ElementInfo ();
849	} else {
850	if (RHS.BB)
851	Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
852	else
853	Result.EI[j] = ElementInfo ();
854	}
855	j++;
856	}
857
858	return true;
859	}
860
861	/// LoadInst specialization to compute vector information.
862	///
863	/// This function also acts as abort condition to the recursion.
864	///
865	/// \param LI LoadInst to operate on
866	/// \param Result Result of the computation
867	///
868	/// \returns false if no sensible information can be gathered.
869	static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
870	const DataLayout &DL) {
871	Value *BasePtr;
872	Polynomial Offset;
873
874	if (LI->isVolatile())
875	return false;
876
877	if (LI->isAtomic())
878	return false;
879
880	if (!DL.typeSizeEqualsStoreSize(Ty: Result.VTy->getElementType()))
881	return false;
882
883	// Get the base polynomial
884	computePolynomialFromPointer(Ptr&: *LI->getPointerOperand(), Result&: Offset, BasePtr, DL);
885
886	Result.BB = LI->getParent();
887	Result.PV = BasePtr;
888	Result.LIs.insert(x: LI);
889	Result.Is.insert(x: LI);
890
891	for (unsigned i = `0`; i < Result.getDimension(); i++) {
892	Value *Idx[`2`] = {
893	ConstantInt::get(Ty: Type::getInt32Ty(C&: LI->getContext()), V: `0`),
894	ConstantInt::get(Ty: Type::getInt32Ty(C&: LI->getContext()), V: i),
895	};
896	int64_t Ofs = DL.getIndexedOffsetInType(ElemTy: Result.VTy, Indices: Idx);
897	Result.EI[i] = ElementInfo (Offset + Ofs, i == `0` ? LI : nullptr);
898	}
899
900	return true;
901	}
902
903	/// Recursively compute polynomial of a value.
904	///
905	/// \param BO Input binary operation
906	/// \param Result Result polynomial
907	static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
908	Value *LHS = BO.getOperand(i_nocapture: `0`);
909	Value *RHS = BO.getOperand(i_nocapture: `1`);
910
911	// Find the RHS Constant if any
912	ConstantInt *C = dyn_cast<ConstantInt>(Val: RHS);
913	if ((!C) && BO.isCommutative()) {
914	C = dyn_cast<ConstantInt>(Val: LHS);
915	if (C)
916	std::swap(a&: LHS, b&: RHS);
917	}
918
919	switch (BO.getOpcode()) {
920	case Instruction::Add:
921	if (!C)
922	break;
923
924	computePolynomial(V&: *LHS, Result);
925	Result.add(C: C->getValue());
926	return;
927
928	case Instruction::LShr:
929	if (!C)
930	break;
931
932	computePolynomial(V&: *LHS, Result);
933	Result.lshr(C: C->getValue());
934	return;
935
936	default:
937	break;
938	}
939
940	Result = Polynomial (&BO);
941	}
942
943	/// Recursively compute polynomial of a value
944	///
945	/// \param V input value
946	/// \param Result result polynomial
947	static void computePolynomial(Value &V, Polynomial &Result) {
948	if (auto *BO = dyn_cast<BinaryOperator>(Val: &V))
949	computePolynomialBinOp(BO&: *BO, Result);
950	else
951	Result = Polynomial (&V);
952	}
953
954	/// Compute the Polynomial representation of a Pointer type.
955	///
956	/// \param Ptr input pointer value
957	/// \param Result result polynomial
958	/// \param BasePtr pointer the polynomial is based on
959	/// \param DL Datalayout of the target machine
960	static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
961	Value *&BasePtr,
962	const DataLayout &DL) {
963	// Not a pointer type? Return an undefined polynomial
964	PointerType *PtrTy = dyn_cast<PointerType>(Val: Ptr.getType());
965	if (!PtrTy) {
966	Result = Polynomial ();
967	BasePtr = nullptr;
968	return;
969	}
970	unsigned PointerBits =
971	DL.getIndexSizeInBits(AS: PtrTy->getPointerAddressSpace());
972
973	/// Skip pointer casts. Return Zero polynomial otherwise
974	if (isa<CastInst>(Val: &Ptr)) {
975	CastInst &CI = *cast<CastInst>(Val: &Ptr);
976	switch (CI.getOpcode()) {
977	case Instruction::BitCast:
978	computePolynomialFromPointer(Ptr&: *CI.getOperand(i_nocapture: `0`), Result, BasePtr, DL);
979	break;
980	default:
981	BasePtr = &Ptr;
982	Polynomial (PointerBits, `0`);
983	break;
984	}
985	}
986	/// Resolve GetElementPtrInst.
987	else if (isa<GetElementPtrInst>(Val: &Ptr)) {
988	GetElementPtrInst &GEP = *cast<GetElementPtrInst>(Val: &Ptr);
989
990	APInt BaseOffset(PointerBits, `0`);
991
992	// Check if we can compute the Offset with accumulateConstantOffset
993	if (GEP.accumulateConstantOffset(DL, Offset&: BaseOffset)) {
994	Result = Polynomial (BaseOffset);
995	BasePtr = GEP.getPointerOperand();
996	return;
997	} else {
998	// Otherwise we allow that the last index operand of the GEP is
999	// non-constant.
1000	unsigned idxOperand, e;
1001	SmallVector<Value *, `4`> Indices;
1002	for (idxOperand = `1`, e = GEP.getNumOperands(); idxOperand < e;
1003	idxOperand++) {
1004	ConstantInt *IDX = dyn_cast<ConstantInt>(Val: GEP.getOperand(i_nocapture: idxOperand));
1005	if (!IDX)
1006	break;
1007	Indices.push_back(Elt: IDX);
1008	}
1009
1010	// It must also be the last operand.
1011	if (idxOperand + `1` != e) {
1012	Result = Polynomial ();
1013	BasePtr = nullptr;
1014	return;
1015	}
1016
1017	// Compute the polynomial of the index operand.
1018	computePolynomial(V&: *GEP.getOperand(i_nocapture: idxOperand), Result);
1019
1020	// Compute base offset from zero based index, excluding the last
1021	// variable operand.
1022	BaseOffset =
1023	DL.getIndexedOffsetInType(ElemTy: GEP.getSourceElementType(), Indices);
1024
1025	// Apply the operations of GEP to the polynomial.
1026	unsigned ResultSize = DL.getTypeAllocSize(Ty: GEP.getResultElementType());
1027	Result.sextOrTrunc(n: PointerBits);
1028	Result.mul(C: APInt (PointerBits, ResultSize));
1029	Result.add(C: BaseOffset);
1030	BasePtr = GEP.getPointerOperand();
1031	}
1032	}
1033	// All other instructions are handled by using the value as base pointer and
1034	// a zero polynomial.
1035	else {
1036	BasePtr = &Ptr;
1037	Polynomial (DL.getIndexSizeInBits(AS: PtrTy->getPointerAddressSpace()), `0`);
1038	}
1039	}
1040
1041	#ifndef NDEBUG
1042	void print(raw_ostream &OS) const {
1043	if (PV)
1044	OS << *PV;
1045	else
1046	OS << "(none)";
1047	OS << " + ";
1048	for (unsigned i = `0`; i < getDimension(); i++)
1049	OS << ((i == `0`) ? "[" : ", ") << EI[i].Ofs;
1050	OS << "]";
1051	}
1052	#endif
1053	};
1054
1055	} // anonymous namespace
1056
1057	bool InterleavedLoadCombineImpl::findPattern(
1058	std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1059	unsigned Factor, const DataLayout &DL) {
1060	for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1061	unsigned i;
1062	// Try to find an interleaved load using the front of Worklist as first line
1063	unsigned Size = DL.getTypeAllocSize(Ty: C0 ->VTy->getElementType());
1064
1065	// List containing iterators pointing to the VectorInfos of the candidates
1066	std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1067
1068	for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C ++) {
1069	if (C ->VTy != C0 ->VTy)
1070	continue;
1071	if (C ->BB != C0 ->BB)
1072	continue;
1073	if (C ->PV != C0 ->PV)
1074	continue;
1075
1076	// Check the current value matches any of factor - 1 remaining lines
1077	for (i = `1`; i < Factor; i++) {
1078	if (C ->EI[`0`].Ofs.isProvenEqualTo(o: C0 ->EI[`0`].Ofs + i * Size)) {
1079	Res [i] = C;
1080	}
1081	}
1082
1083	for (i = `1`; i < Factor; i++) {
1084	if (Res [i] == Candidates.end())
1085	break;
1086	}
1087	if (i == Factor) {
1088	Res [`0`] = C0;
1089	break;
1090	}
1091	}
1092
1093	if (Res [`0`] != Candidates.end()) {
1094	// Move the result into the output
1095	for (unsigned i = `0`; i < Factor; i++) {
1096	InterleavedLoad.splice(position: InterleavedLoad.end(), x&: Candidates, i: Res [i]);
1097	}
1098
1099	return true;
1100	}
1101	}
1102	return false;
1103	}
1104
1105	LoadInst *
1106	InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
1107	assert(!LIs.empty() && "No load instructions given.");
1108
1109	// All LIs are within the same BB. Select the first for a reference.
1110	BasicBlock BB = (LIs.begin())->getParent();
1111	BasicBlock::iterator FLI = llvm::find_if(
1112	Range&: BB, P: [&LIs](Instruction &I) -> bool* { return is_contained(Range: LIs, Element: &I); });
1113	assert(FLI != BB->end());
1114
1115	return cast<LoadInst>(Val&: FLI);
1116	}
1117
1118	bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1119	OptimizationRemarkEmitter &ORE) {
1120	LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
1121
1122	// The insertion point is the LoadInst which loads the first values. The
1123	// following tests are used to proof that the combined load can be inserted
1124	// just before InsertionPoint.
1125	LoadInst *InsertionPoint = InterleavedLoad.front().EI[`0`].LI;
1126
1127	// Test if the offset is computed
1128	if (!InsertionPoint)
1129	return false;
1130
1131	std::set<LoadInst *> LIs;
1132	std::set<Instruction *> Is;
1133	std::set<Instruction *> SVIs;
1134
1135	InstructionCost InterleavedCost;
1136	InstructionCost InstructionCost = `0`;
1137	const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;
1138
1139	// Get the interleave factor
1140	unsigned Factor = InterleavedLoad.size();
1141
1142	// Merge all input sets used in analysis
1143	for (auto &VI : InterleavedLoad) {
1144	// Generate a set of all load instructions to be combined
1145	LIs.insert(first: VI.LIs.begin(), last: VI.LIs.end());
1146
1147	// Generate a set of all instructions taking part in load
1148	// interleaved. This list excludes the instructions necessary for the
1149	// polynomial construction.
1150	Is.insert(first: VI.Is.begin(), last: VI.Is.end());
1151
1152	// Generate the set of the final ShuffleVectorInst.
1153	SVIs.insert(x: VI.SVI);
1154	}
1155
1156	// There is nothing to combine.
1157	if (LIs.size() < `2`)
1158	return false;
1159
1160	// Test if all participating instruction will be dead after the
1161	// transformation. If intermediate results are used, no performance gain can
1162	// be expected. Also sum the cost of the Instructions beeing left dead.
1163	for (const auto &I : Is) {
1164	// Compute the old cost
1165	InstructionCost += TTI.getInstructionCost(U: I, CostKind);
1166
1167	// The final SVIs are allowed not to be dead, all uses will be replaced
1168	if (SVIs.find(x: I) != SVIs.end())
1169	continue;
1170
1171	// If there are users outside the set to be eliminated, we abort the
1172	// transformation. No gain can be expected.
1173	for (auto *U : I->users()) {
1174	if (Is.find(x: dyn_cast<Instruction>(Val: U)) == Is.end())
1175	return false;
1176	}
1177	}
1178
1179	// We need to have a valid cost in order to proceed.
1180	if (!InstructionCost.isValid())
1181	return false;
1182
1183	// We know that all LoadInst are within the same BB. This guarantees that
1184	// either everything or nothing is loaded.
1185	LoadInst *First = findFirstLoad(LIs);
1186
1187	// To be safe that the loads can be combined, iterate over all loads and test
1188	// that the corresponding defining access dominates first LI. This guarantees
1189	// that there are no aliasing stores in between the loads.
1190	auto FMA = MSSA.getMemoryAccess(I: First);
1191	for (auto *LI : LIs) {
1192	auto MADef = MSSA.getMemoryAccess(I: LI)->getDefiningAccess();
1193	if (!MSSA.dominates(A: MADef, B: FMA))
1194	return false;
1195	}
1196	assert(!LIs.empty() && "There are no LoadInst to combine");
1197
1198	// It is necessary that insertion point dominates all final ShuffleVectorInst.
1199	for (auto &VI : InterleavedLoad) {
1200	if (!DT.dominates(Def: InsertionPoint, User: VI.SVI))
1201	return false;
1202	}
1203
1204	// All checks are done. Add instructions detectable by InterleavedAccessPass
1205	// The old instruction will are left dead.
1206	IRBuilder<> Builder(InsertionPoint);
1207	Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1208	unsigned ElementsPerSVI =
1209	cast<FixedVectorType>(Val: InterleavedLoad.front().SVI->getType())
1210	->getNumElements();
1211	FixedVectorType ILTy = FixedVectorType::get(ElementType: ETy, NumElts: Factor ElementsPerSVI);
1212
1213	auto Indices = llvm::to_vector<`4`>(Range: llvm::seq<unsigned>(Begin: `0`, End: Factor));
1214	InterleavedCost = TTI.getInterleavedMemoryOpCost(
1215	Opcode: Instruction::Load, VecTy: ILTy, Factor, Indices, Alignment: InsertionPoint->getAlign(),
1216	AddressSpace: InsertionPoint->getPointerAddressSpace(), CostKind);
1217
1218	if (InterleavedCost >= InstructionCost) {
1219	return false;
1220	}
1221
1222	// Create the wide load and update the MemorySSA.
1223	auto Ptr = InsertionPoint->getPointerOperand();
1224	auto LI = Builder.CreateAlignedLoad(Ty: ILTy, Ptr, Align: InsertionPoint->getAlign(),
1225	Name: "interleaved.wide.load");
1226	auto MSSAU = MemorySSAUpdater (&MSSA);
1227	MemoryUse *MSSALoad = cast<MemoryUse>(Val: MSSAU.createMemoryAccessBefore(
1228	I: LI, Definition: nullptr, InsertPt: MSSA.getMemoryAccess(I: InsertionPoint)));
1229	MSSAU.insertUse(Use: MSSALoad, /RenameUses=/ true);
1230
1231	// Create the final SVIs and replace all uses.
1232	int i = `0`;
1233	for (auto &VI : InterleavedLoad) {
1234	SmallVector<int, `4`> Mask;
1235	for (unsigned j = `0`; j < ElementsPerSVI; j++)
1236	Mask.push_back(Elt: i + j * Factor);
1237
1238	Builder.SetInsertPoint(VI.SVI);
1239	auto SVI = Builder.CreateShuffleVector(V: LI, Mask, Name: "interleaved.shuffle");
1240	VI.SVI->replaceAllUsesWith(V: SVI);
1241	i++;
1242	}
1243
1244	NumInterleavedLoadCombine ++;
1245	ORE.emit(RemarkBuilder: [&]() {
1246	return OptimizationRemark (DEBUG_TYPE, "Combined Interleaved Load", LI)
1247	<< "Load interleaved combined with factor "
1248	<< ore::NV ("Factor", Factor);
1249	});
1250
1251	return true;
1252	}
1253
1254	bool InterleavedLoadCombineImpl::run() {
1255	OptimizationRemarkEmitter ORE(&F);
1256	bool changed = false;
1257	unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1258
1259	auto &DL = F.getDataLayout();
1260
1261	// Start with the highest factor to avoid combining and recombining.
1262	for (unsigned Factor = MaxFactor; Factor >= `2`; Factor--) {
1263	std::list<VectorInfo> Candidates;
1264
1265	for (BasicBlock &BB : F) {
1266	for (Instruction &I : BB) {
1267	if (auto SVI = dyn_cast<ShuffleVectorInst>(Val: &I)) {
1268	// We don't support scalable vectors in this pass.
1269	if (isa<ScalableVectorType>(Val: SVI->getType()))
1270	continue;
1271
1272	Candidates.emplace_back(args: cast<FixedVectorType>(Val: SVI->getType()));
1273
1274	if (!VectorInfo::computeFromSVI(SVI, Result&: Candidates.back(), DL)) {
1275	Candidates.pop_back();
1276	continue;
1277	}
1278
1279	if (!Candidates.back().isInterleaved(Factor, DL)) {
1280	Candidates.pop_back();
1281	}
1282	}
1283	}
1284	}
1285
1286	std::list<VectorInfo> InterleavedLoad;
1287	while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
1288	if (combine(InterleavedLoad, ORE)) {
1289	changed = true;
1290	} else {
1291	// Remove the first element of the Interleaved Load but put the others
1292	// back on the list and continue searching
1293	Candidates.splice(position: Candidates.begin(), x&: InterleavedLoad,
1294	first: std::next(x: InterleavedLoad.begin()),
1295	last: InterleavedLoad.end());
1296	}
1297	InterleavedLoad.clear();
1298	}
1299	}
1300
1301	return changed;
1302	}
1303
1304	namespace {
1305	/// This pass combines interleaved loads into a pattern detectable by
1306	/// InterleavedAccessPass.
1307	struct InterleavedLoadCombine : public FunctionPass {
1308	static char ID;
1309
1310	InterleavedLoadCombine() : FunctionPass (ID) {
1311	initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
1312	}
1313
1314	StringRef getPassName() const override {
1315	return "Interleaved Load Combine Pass";
1316	}
1317
1318	bool runOnFunction(Function &F) override {
1319	if (DisableInterleavedLoadCombine)
1320	return false;
1321
1322	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1323	if (!TPC)
1324	return false;
1325
1326	LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
1327	<< "\n");
1328
1329	return InterleavedLoadCombineImpl (
1330	F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1331	getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1332	getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
1333	TPC->getTM<TargetMachine>())
1334	.run();
1335	}
1336
1337	void getAnalysisUsage(AnalysisUsage &AU) const override {
1338	AU.addRequired<MemorySSAWrapperPass>();
1339	AU.addRequired<DominatorTreeWrapperPass>();
1340	AU.addRequired<TargetTransformInfoWrapperPass>();
1341	FunctionPass::getAnalysisUsage(AU);
1342	}
1343
1344	private:
1345	};
1346	} // anonymous namespace
1347
1348	PreservedAnalyses
1349	InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
1350
1351	auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
1352	auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(IR&: F).getMSSA();
1353	auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
1354	bool Changed = InterleavedLoadCombineImpl (F, DT, MemSSA, TTI, *TM).run();
1355	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
1356	}
1357
1358	char InterleavedLoadCombine::ID = `0`;
1359
1360	INITIALIZE_PASS_BEGIN(
1361	InterleavedLoadCombine, DEBUG_TYPE,
1362	"Combine interleaved loads into wide loads and shufflevector instructions",
1363	false, false)
1364	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1365	INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
1366	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1367	INITIALIZE_PASS_END(
1368	InterleavedLoadCombine, DEBUG_TYPE,
1369	"Combine interleaved loads into wide loads and shufflevector instructions",
1370	false, false)
1371
1372	FunctionPass *
1373	llvm::createInterleavedLoadCombinePass() {
1374	auto P = new InterleavedLoadCombine ();
1375	return P;
1376	}
1377

Browse the source code of llvm_projects/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp