HexagonISelDAGToDAGHVX.cpp source code [llvm_projects/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp]

1	//===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "HexagonISelDAGToDAG.h"
10	#include "HexagonISelLowering.h"
11	#include "llvm/ADT/BitVector.h"
12	#include "llvm/ADT/SetVector.h"
13	#include "llvm/CodeGen/SelectionDAGISel.h"
14	#include "llvm/IR/Intrinsics.h"
15	#include "llvm/IR/IntrinsicsHexagon.h"
16	#include "llvm/Support/Debug.h"
17	#include "llvm/Support/MathExtras.h"
18
19	#include <algorithm>
20	#include <cmath>
21	#include <deque>
22	#include <functional>
23	#include <map>
24	#include <optional>
25	#include <set>
26	#include <unordered_map>
27	#include <utility>
28	#include <vector>
29
30	#define DEBUG_TYPE "hexagon-isel"
31	using namespace llvm;
32
33	namespace {
34
35	// --------------------------------------------------------------------
36	// Implementation of permutation networks.
37
38	// Implementation of the node routing through butterfly networks:
39	// - Forward delta.
40	// - Reverse delta.
41	// - Benes.
42	//
43	//
44	// Forward delta network consists of log(N) steps, where N is the number
45	// of inputs. In each step, an input can stay in place, or it can get
46	// routed to another position[1]. The step after that consists of two
47	// networks, each half in size in terms of the number of nodes. In those
48	// terms, in the given step, an input can go to either the upper or the
49	// lower network in the next step.
50	//
51	// [1] Hexagon's vdelta/vrdelta allow an element to be routed to both
52	// positions as long as there is no conflict.
53
54	// Here's a delta network for 8 inputs, only the switching routes are
55	// shown:
56	//
57	// Steps:
58	// \|- 1 ---------------\|- 2 -----\|- 3 -\|
59	//
60	// Inp[0] * * * Out[0]*
61	// \ / \ / \ /
62	// \ / \ / X
63	// \ / \ / / \
64	// Inp[1] \ / * X * * Out[1]*
65	// \ \ / / \ / \ /
66	// \ \ / / X X
67	// \ \ / / / \ / \
68	// Inp[2] \ \ / / * X * * Out[2]*
69	// \ \ X / / / \ \ /
70	// \ \ / \ / / / \ X
71	// \ X X / / \ / \
72	// Inp[3] \ / \ / \ / * * * Out[3]*
73	// \ X X X /
74	// \ / \ / \ / \ /
75	// X X X X
76	// / \ / \ / \ / \
77	// / X X X \
78	// Inp[4] / \ / \ / \ * * * Out[4]*
79	// / X X \ \ / \ /
80	// / / \ / \ \ \ / X
81	// / / X \ \ \ / / \
82	// Inp[5] / / \ \ * X * * Out[5]*
83	// / / \ \ \ / \ /
84	// / / \ \ X X
85	// / / \ \ / \ / \
86	// Inp[6] / \ * X * * Out[6]*
87	// / \ / \ \ /
88	// / \ / \ X
89	// / \ / \ / \
90	// Inp[7] * * * Out[7]*
91	//
92	//
93	// Reverse delta network is same as delta network, with the steps in
94	// the opposite order.
95	//
96	//
97	// Benes network is a forward delta network immediately followed by
98	// a reverse delta network.
99
100	enum class ColorKind { None, Red, Black };
101
102	// Graph coloring utility used to partition nodes into two groups:
103	// they will correspond to nodes routed to the upper and lower networks.
104	struct Coloring {
105	using Node = int;
106	using MapType = std::map<Node, ColorKind>;
107	static constexpr Node Ignore = Node(-`1`);
108
109	Coloring(ArrayRef<Node> Ord) : Order (Ord) {
110	build();
111	if (!color())
112	Colors.clear();
113	}
114
115	const MapType &colors() const {
116	return Colors;
117	}
118
119	ColorKind other(ColorKind Color) {
120	if (Color == ColorKind::None)
121	return ColorKind::Red;
122	return Color == ColorKind::Red ? ColorKind::Black : ColorKind::Red;
123	}
124
125	LLVM_DUMP_METHOD void dump() const;
126
127	private:
128	ArrayRef<Node> Order;
129	MapType Colors;
130	std::set<Node> Needed;
131
132	using NodeSet = std::set<Node>;
133	std::map<Node,NodeSet> Edges;
134
135	Node conj(Node Pos) {
136	Node Num = Order.size();
137	return (Pos < Num/`2`) ? Pos + Num/`2` : Pos - Num/`2`;
138	}
139
140	ColorKind getColor(Node N) {
141	auto F = Colors.find(x: N);
142	return F != Colors.end() ? F ->second : ColorKind::None;
143	}
144
145	std::pair<bool, ColorKind> getUniqueColor(const NodeSet &Nodes);
146
147	void build();
148	bool color();
149	};
150	} // namespace
151
152	std::pair<bool, ColorKind> Coloring::getUniqueColor(const NodeSet &Nodes) {
153	auto Color = ColorKind::None;
154	for (Node N : Nodes) {
155	ColorKind ColorN = getColor(N);
156	if (ColorN == ColorKind::None)
157	continue;
158	if (Color == ColorKind::None)
159	Color = ColorN;
160	else if (Color != ColorKind::None && Color != ColorN)
161	return { false, ColorKind::None };
162	}
163	return { true, Color };
164	}
165
166	void Coloring::build() {
167	// Add Order[P] and Order[conj(P)] to Edges.
168	for (unsigned P = `0`; P != Order.size(); ++P) {
169	Node I = Order [P];
170	if (I != Ignore) {
171	Needed.insert(x: I);
172	Node PC = Order [conj(Pos: P)];
173	if (PC != Ignore && PC != I)
174	Edges [I].insert(x: PC);
175	}
176	}
177	// Add I and conj(I) to Edges.
178	for (unsigned I = `0`; I != Order.size(); ++I) {
179	if (!Needed.count(x: I))
180	continue;
181	Node C = conj(Pos: I);
182	// This will create an entry in the edge table, even if I is not
183	// connected to any other node. This is necessary, because it still
184	// needs to be colored.
185	NodeSet &Is = Edges [I];
186	if (Needed.count(x: C))
187	Is.insert(x: C);
188	}
189	}
190
191	bool Coloring::color() {
192	SetVector<Node> FirstQ;
193	auto Enqueue = [this,&FirstQ] (Node N) {
194	SetVector<Node> Q;
195	Q.insert(X: N);
196	for (unsigned I = `0`; I != Q.size(); ++I) {
197	NodeSet &Ns = Edges [Q [I]];
198	Q.insert_range(R&: Ns);
199	}
200	FirstQ.insert_range(R&: Q);
201	};
202	for (Node N : Needed)
203	Enqueue (N);
204
205	for (Node N : FirstQ) {
206	if (Colors.count(x: N))
207	continue;
208	NodeSet &Ns = Edges [N];
209	auto P = getUniqueColor(Nodes: Ns);
210	if (!P.first)
211	return false;
212	Colors [N] = other(Color: P.second);
213	}
214
215	// First, color nodes that don't have any dups.
216	for (auto E : Edges) {
217	Node N = E.first;
218	if (!Needed.count(x: conj(Pos: N)) \|\| Colors.count(x: N))
219	continue;
220	auto P = getUniqueColor(Nodes: E.second);
221	if (!P.first)
222	return false;
223	Colors [N] = other(Color: P.second);
224	}
225
226	// Now, nodes that are still uncolored. Since the graph can be modified
227	// in this step, create a work queue.
228	std::vector<Node> WorkQ;
229	for (auto E : Edges) {
230	Node N = E.first;
231	if (!Colors.count(x: N))
232	WorkQ.push_back(x: N);
233	}
234
235	for (Node N : WorkQ) {
236	NodeSet &Ns = Edges [N];
237	auto P = getUniqueColor(Nodes: Ns);
238	if (P.first) {
239	Colors [N] = other(Color: P.second);
240	continue;
241	}
242
243	// Coloring failed. Split this node.
244	Node C = conj(Pos: N);
245	ColorKind ColorN = other(Color: ColorKind::None);
246	ColorKind ColorC = other(Color: ColorN);
247	NodeSet &Cs = Edges [C];
248	NodeSet CopyNs = Ns;
249	for (Node M : CopyNs) {
250	ColorKind ColorM = getColor(N: M);
251	if (ColorM == ColorC) {
252	// Connect M with C, disconnect M from N.
253	Cs.insert(x: M);
254	Edges [M].insert(x: C);
255	Ns.erase(x: M);
256	Edges [M].erase(x: N);
257	}
258	}
259	Colors [N] = ColorN;
260	Colors [C] = ColorC;
261	}
262
263	// Explicitly assign "None" to all uncolored nodes.
264	for (unsigned I = `0`; I != Order.size(); ++I)
265	Colors.try_emplace(k: I, args: ColorKind::None);
266
267	return true;
268	}
269
270	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
271	void Coloring::dump() const {
272	dbgs() << "{ Order: {";
273	for (Node P : Order) {
274	if (P != Ignore)
275	dbgs() << `' '` << P;
276	else
277	dbgs() << " -";
278	}
279	dbgs() << " }\n";
280	dbgs() << " Needed: {";
281	for (Node N : Needed)
282	dbgs() << `' '` << N;
283	dbgs() << " }\n";
284
285	dbgs() << " Edges: {\n";
286	for (auto E : Edges) {
287	dbgs() << " " << E.first << " -> {";
288	for (auto N : E.second)
289	dbgs() << `' '` << N;
290	dbgs() << " }\n";
291	}
292	dbgs() << " }\n";
293
294	auto ColorKindToName = [](ColorKind C) {
295	switch (C) {
296	case ColorKind::None:
297	return "None";
298	case ColorKind::Red:
299	return "Red";
300	case ColorKind::Black:
301	return "Black";
302	}
303	llvm_unreachable("all ColorKinds should be handled by the switch above");
304	};
305
306	dbgs() << " Colors: {\n";
307	for (auto C : Colors)
308	dbgs() << " " << C.first << " -> " << ColorKindToName(C.second) << "\n";
309	dbgs() << " }\n}\n";
310	}
311	#endif
312
313	namespace {
314	// Base class of for reordering networks. They don't strictly need to be
315	// permutations, as outputs with repeated occurrences of an input element
316	// are allowed.
317	struct PermNetwork {
318	using Controls = std::vector<uint8_t>;
319	using ElemType = int;
320	static constexpr ElemType Ignore = ElemType(-`1`);
321
322	enum : uint8_t {
323	None,
324	Pass,
325	Switch
326	};
327	enum : uint8_t {
328	Forward,
329	Reverse
330	};
331
332	PermNetwork(ArrayRef<ElemType> Ord, unsigned Mult = `1`) {
333	Order.assign(first: Ord.data(), last: Ord.data()+Ord.size());
334	Log = `0`;
335
336	unsigned S = Order.size();
337	while (S >>= `1`)
338	++Log;
339
340	Table.resize(new_size: Order.size());
341	for (RowType &Row : Table)
342	Row.resize(new_size: Mult*Log, x: None);
343	}
344
345	void getControls(Controls &V, unsigned StartAt, uint8_t Dir) const {
346	unsigned Size = Order.size();
347	V.resize(new_size: Size);
348	for (unsigned I = `0`; I != Size; ++I) {
349	unsigned W = `0`;
350	for (unsigned L = `0`; L != Log; ++L) {
351	unsigned C = ctl(Pos: I, Step: StartAt+L) == Switch;
352	if (Dir == Forward)
353	W \|= C << (Log-`1`-L);
354	else
355	W \|= C << L;
356	}
357	assert(isUInt<`8`>(W));
358	V [I] = uint8_t(W);
359	}
360	}
361
362	uint8_t ctl(ElemType Pos, unsigned Step) const {
363	return Table [Pos][Step];
364	}
365	unsigned size() const {
366	return Order.size();
367	}
368	unsigned steps() const {
369	return Log;
370	}
371
372	protected:
373	unsigned Log;
374	std::vector<ElemType> Order;
375	using RowType = std::vector<uint8_t>;
376	std::vector<RowType> Table;
377	};
378
379	struct ForwardDeltaNetwork : public PermNetwork {
380	ForwardDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork (Ord) {}
381
382	bool run(Controls &V) {
383	if (!route(P: Order.data(), T: Table.data(), Size: size(), Step: `0`))
384	return false;
385	getControls(V, StartAt: `0`, Dir: Forward);
386	return true;
387	}
388
389	private:
390	bool route(ElemType P, RowType T, unsigned Size, unsigned Step);
391	};
392
393	struct ReverseDeltaNetwork : public PermNetwork {
394	ReverseDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork (Ord) {}
395
396	bool run(Controls &V) {
397	if (!route(P: Order.data(), T: Table.data(), Size: size(), Step: `0`))
398	return false;
399	getControls(V, StartAt: `0`, Dir: Reverse);
400	return true;
401	}
402
403	private:
404	bool route(ElemType P, RowType T, unsigned Size, unsigned Step);
405	};
406
407	struct BenesNetwork : public PermNetwork {
408	BenesNetwork(ArrayRef<ElemType> Ord) : PermNetwork (Ord, `2`) {}
409
410	bool run(Controls &F, Controls &R) {
411	if (!route(P: Order.data(), T: Table.data(), Size: size(), Step: `0`))
412	return false;
413
414	getControls(V&: F, StartAt: `0`, Dir: Forward);
415	getControls(V&: R, StartAt: Log, Dir: Reverse);
416	return true;
417	}
418
419	private:
420	bool route(ElemType P, RowType T, unsigned Size, unsigned Step);
421	};
422	} // namespace
423
424	bool ForwardDeltaNetwork::route(ElemType P, RowType T, unsigned Size,
425	unsigned Step) {
426	bool UseUp = false, UseDown = false;
427	ElemType Num = Size;
428
429	// Cannot use coloring here, because coloring is used to determine
430	// the "big" switch, i.e. the one that changes halves, and in a forward
431	// network, a color can be simultaneously routed to both halves in the
432	// step we're working on.
433	for (ElemType J = `0`; J != Num; ++J) {
434	ElemType I = P[J];
435	// I is the position in the input,
436	// J is the position in the output.
437	if (I == Ignore)
438	continue;
439	uint8_t S;
440	if (I < Num/`2`)
441	S = (J < Num/`2`) ? Pass : Switch;
442	else
443	S = (J < Num/`2`) ? Switch : Pass;
444
445	// U is the element in the table that needs to be updated.
446	ElemType U = (S == Pass) ? I : (I < Num/`2` ? I+Num/`2` : I-Num/`2`);
447	if (U < Num/`2`)
448	UseUp = true;
449	else
450	UseDown = true;
451	if (T[U][Step] != S && T[U][Step] != None)
452	return false;
453	T[U][Step] = S;
454	}
455
456	for (ElemType J = `0`; J != Num; ++J)
457	if (P[J] != Ignore && P[J] >= Num/`2`)
458	P[J] -= Num/`2`;
459
460	if (Step+`1` < Log) {
461	if (UseUp && !route(P, T, Size: Size/`2`, Step: Step+`1`))
462	return false;
463	if (UseDown && !route(P: P+Size/`2`, T: T+Size/`2`, Size: Size/`2`, Step: Step+`1`))
464	return false;
465	}
466	return true;
467	}
468
469	bool ReverseDeltaNetwork::route(ElemType P, RowType T, unsigned Size,
470	unsigned Step) {
471	unsigned Pets = Log-`1` - Step;
472	bool UseUp = false, UseDown = false;
473	ElemType Num = Size;
474
475	// In this step half-switching occurs, so coloring can be used.
476	Coloring G({P,Size});
477	const Coloring::MapType &M = G.colors();
478	if (M.empty())
479	return false;
480
481	ColorKind ColorUp = ColorKind::None;
482	for (ElemType J = `0`; J != Num; ++J) {
483	ElemType I = P[J];
484	// I is the position in the input,
485	// J is the position in the output.
486	if (I == Ignore)
487	continue;
488	ColorKind C = M.at(k: I);
489	if (C == ColorKind::None)
490	continue;
491	// During "Step", inputs cannot switch halves, so if the "up" color
492	// is still unknown, make sure that it is selected in such a way that
493	// "I" will stay in the same half.
494	bool InpUp = I < Num/`2`;
495	if (ColorUp == ColorKind::None)
496	ColorUp = InpUp ? C : G.other(Color: C);
497	if ((C == ColorUp) != InpUp) {
498	// If I should go to a different half than where is it now, give up.
499	return false;
500	}
501
502	uint8_t S;
503	if (InpUp) {
504	S = (J < Num/`2`) ? Pass : Switch;
505	UseUp = true;
506	} else {
507	S = (J < Num/`2`) ? Switch : Pass;
508	UseDown = true;
509	}
510	T[J][Pets] = S;
511	}
512
513	// Reorder the working permutation according to the computed switch table
514	// for the last step (i.e. Pets).
515	for (ElemType J = `0`, E = Size / `2`; J != E; ++J) {
516	ElemType PJ = P[J]; // Current values of P[J]
517	ElemType PC = P[J+Size/`2`]; // and P[conj(J)]
518	ElemType QJ = PJ; // New values of P[J]
519	ElemType QC = PC; // and P[conj(J)]
520	if (T[J][Pets] == Switch)
521	QC = PJ;
522	if (T[J+Size/`2`][Pets] == Switch)
523	QJ = PC;
524	P[J] = QJ;
525	P[J+Size/`2`] = QC;
526	}
527
528	for (ElemType J = `0`; J != Num; ++J)
529	if (P[J] != Ignore && P[J] >= Num/`2`)
530	P[J] -= Num/`2`;
531
532	if (Step+`1` < Log) {
533	if (UseUp && !route(P, T, Size: Size/`2`, Step: Step+`1`))
534	return false;
535	if (UseDown && !route(P: P+Size/`2`, T: T+Size/`2`, Size: Size/`2`, Step: Step+`1`))
536	return false;
537	}
538	return true;
539	}
540
541	bool BenesNetwork::route(ElemType P, RowType T, unsigned Size,
542	unsigned Step) {
543	Coloring G({P,Size});
544	const Coloring::MapType &M = G.colors();
545	if (M.empty())
546	return false;
547	ElemType Num = Size;
548
549	unsigned Pets = `2`*Log-`1` - Step;
550	bool UseUp = false, UseDown = false;
551
552	// Both assignments, i.e. Red->Up and Red->Down are valid, but they will
553	// result in different controls. Let's pick the one where the first
554	// control will be "Pass".
555	ColorKind ColorUp = ColorKind::None;
556	for (ElemType J = `0`; J != Num; ++J) {
557	ElemType I = P[J];
558	if (I == Ignore)
559	continue;
560	ColorKind C = M.at(k: I);
561	if (C == ColorKind::None)
562	continue;
563	if (ColorUp == ColorKind::None) {
564	ColorUp = (I < Num / `2`) ? ColorKind::Red : ColorKind::Black;
565	}
566	unsigned CI = (I < Num/`2`) ? I+Num/`2` : I-Num/`2`;
567	if (C == ColorUp) {
568	if (I < Num/`2`)
569	T[I][Step] = Pass;
570	else
571	T[CI][Step] = Switch;
572	T[J][Pets] = (J < Num/`2`) ? Pass : Switch;
573	UseUp = true;
574	} else { // Down
575	if (I < Num/`2`)
576	T[CI][Step] = Switch;
577	else
578	T[I][Step] = Pass;
579	T[J][Pets] = (J < Num/`2`) ? Switch : Pass;
580	UseDown = true;
581	}
582	}
583
584	// Reorder the working permutation according to the computed switch table
585	// for the last step (i.e. Pets).
586	for (ElemType J = `0`; J != Num/`2`; ++J) {
587	ElemType PJ = P[J]; // Current values of P[J]
588	ElemType PC = P[J+Num/`2`]; // and P[conj(J)]
589	ElemType QJ = PJ; // New values of P[J]
590	ElemType QC = PC; // and P[conj(J)]
591	if (T[J][Pets] == Switch)
592	QC = PJ;
593	if (T[J+Num/`2`][Pets] == Switch)
594	QJ = PC;
595	P[J] = QJ;
596	P[J+Num/`2`] = QC;
597	}
598
599	for (ElemType J = `0`; J != Num; ++J)
600	if (P[J] != Ignore && P[J] >= Num/`2`)
601	P[J] -= Num/`2`;
602
603	if (Step+`1` < Log) {
604	if (UseUp && !route(P, T, Size: Size/`2`, Step: Step+`1`))
605	return false;
606	if (UseDown && !route(P: P+Size/`2`, T: T+Size/`2`, Size: Size/`2`, Step: Step+`1`))
607	return false;
608	}
609	return true;
610	}
611
612	// --------------------------------------------------------------------
613	// Support for building selection results (output instructions that are
614	// parts of the final selection).
615
616	namespace {
617	struct OpRef {
618	OpRef(SDValue V) : OpV (V) {}
619	bool isValue() const { return OpV.getNode() != nullptr; }
620	bool isValid() const { return isValue() \|\| !(OpN & Invalid); }
621	bool isUndef() const { return OpN & Undef; }
622	static OpRef res(int N) { return OpRef (Whole \| (N & Index)); }
623	static OpRef fail() { return OpRef (Invalid); }
624
625	static OpRef lo(const OpRef &R) {
626	assert(!R.isValue());
627	return OpRef (R.OpN & (Undef \| Index \| LoHalf));
628	}
629	static OpRef hi(const OpRef &R) {
630	assert(!R.isValue());
631	return OpRef (R.OpN & (Undef \| Index \| HiHalf));
632	}
633	static OpRef undef(MVT Ty) { return OpRef (Undef \| Ty.SimpleTy); }
634
635	// Direct value.
636	SDValue OpV = SDValue ();
637
638	// Reference to the operand of the input node:
639	// If the 31st bit is 1, it's undef, otherwise, bits 28..0 are the
640	// operand index:
641	// If bit 30 is set, it's the high half of the operand.
642	// If bit 29 is set, it's the low half of the operand.
643	unsigned OpN = `0`;
644
645	enum : unsigned {
646	Invalid = `0x10000000`,
647	LoHalf = `0x20000000`,
648	HiHalf = `0x40000000`,
649	Whole = LoHalf \| HiHalf,
650	Undef = `0x80000000`,
651	Index = `0x0FFFFFFF`, // Mask of the index value.
652	IndexBits = `28`,
653	};
654
655	LLVM_DUMP_METHOD
656	void print(raw_ostream &OS, const SelectionDAG &G) const;
657
658	private:
659	OpRef(unsigned N) : OpN(N) {}
660	};
661
662	struct NodeTemplate {
663	NodeTemplate() = default;
664	unsigned Opc = `0`;
665	MVT Ty = MVT::Other;
666	std::vector<OpRef> Ops;
667
668	LLVM_DUMP_METHOD void print(raw_ostream &OS, const SelectionDAG &G) const;
669	};
670
671	struct ResultStack {
672	ResultStack(SDNode *Inp)
673	: InpNode(Inp), InpTy(Inp->getValueType(ResNo: `0`).getSimpleVT()) {}
674	SDNode *InpNode;
675	MVT InpTy;
676	unsigned push(const NodeTemplate &Res) {
677	List.push_back(x: Res);
678	return List.size()-`1`;
679	}
680	unsigned push(unsigned Opc, MVT Ty, std::vector<OpRef> &&Ops) {
681	NodeTemplate Res;
682	Res.Opc = Opc;
683	Res.Ty = Ty;
684	Res.Ops = Ops;
685	return push(Res);
686	}
687	bool empty() const { return List.empty(); }
688	unsigned size() const { return List.size(); }
689	unsigned top() const { return size()-`1`; }
690	const NodeTemplate &operator[](unsigned I) const { return List [I]; }
691	unsigned reset(unsigned NewTop) {
692	List.resize(new_size: NewTop+`1`);
693	return NewTop;
694	}
695
696	using BaseType = std::vector<NodeTemplate>;
697	BaseType::iterator begin() { return List.begin(); }
698	BaseType::iterator end() { return List.end(); }
699	BaseType::const_iterator begin() const { return List.begin(); }
700	BaseType::const_iterator end() const { return List.end(); }
701
702	BaseType List;
703
704	LLVM_DUMP_METHOD
705	void print(raw_ostream &OS, const SelectionDAG &G) const;
706	};
707	} // namespace
708
709	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
710	void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const {
711	if (isValue()) {
712	OpV.getNode()->print(OS, &G);
713	return;
714	}
715	if (OpN & Invalid) {
716	OS << "invalid";
717	return;
718	}
719	if (OpN & Undef) {
720	OS << "undef";
721	return;
722	}
723	if ((OpN & Whole) != Whole) {
724	assert((OpN & Whole) == LoHalf \|\| (OpN & Whole) == HiHalf);
725	if (OpN & LoHalf)
726	OS << "lo ";
727	else
728	OS << "hi ";
729	}
730	OS << `'#'` << SignExtend32(OpN & Index, IndexBits);
731	}
732
733	void NodeTemplate::print(raw_ostream &OS, const SelectionDAG &G) const {
734	const TargetInstrInfo &TII = *G.getSubtarget().getInstrInfo();
735	OS << format("%8s", EVT(Ty).getEVTString().c_str()) << " "
736	<< TII.getName(Opc);
737	bool Comma = false;
738	for (const auto &R : Ops) {
739	if (Comma)
740	OS << `','`;
741	Comma = true;
742	OS << `' '`;
743	R.print(OS, G);
744	}
745	}
746
747	void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const {
748	OS << "Input node:\n";
749	#ifndef NDEBUG
750	InpNode->dumpr(&G);
751	#endif
752	OS << "Result templates:\n";
753	for (unsigned I = `0`, E = List.size(); I != E; ++I) {
754	OS << `'['` << I << "] ";
755	List[I].print(OS, G);
756	OS << `'\n'`;
757	}
758	}
759	#endif
760
761	namespace {
762	struct ShuffleMask {
763	ShuffleMask(ArrayRef<int> M) : Mask (M) {
764	for (int M : Mask) {
765	if (M == -`1`)
766	continue;
767	MinSrc = (MinSrc == -`1`) ? M : std::min(a: MinSrc, b: M);
768	MaxSrc = (MaxSrc == -`1`) ? M : std::max(a: MaxSrc, b: M);
769	}
770	}
771
772	ArrayRef<int> Mask;
773	int MinSrc = -`1`, MaxSrc = -`1`;
774
775	ShuffleMask lo() const {
776	size_t H = Mask.size()/`2`;
777	return ShuffleMask (Mask.take_front(N: H));
778	}
779	ShuffleMask hi() const {
780	size_t H = Mask.size()/`2`;
781	return ShuffleMask (Mask.take_back(N: H));
782	}
783
784	void print(raw_ostream &OS) const {
785	OS << "MinSrc:" << MinSrc << ", MaxSrc:" << MaxSrc << " {";
786	for (int M : Mask)
787	OS << `' '` << M;
788	OS << " }";
789	}
790	};
791
792	LLVM_ATTRIBUTE_UNUSED
793	raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &SM) {
794	SM.print(OS);
795	return OS;
796	}
797	} // namespace
798
799	namespace shuffles {
800	using MaskT = SmallVector<int, `128`>;
801	// Vdd = vshuffvdd(Vu, Vv, Rt)
802	// Vdd = vdealvdd(Vu, Vv, Rt)
803	// Vd = vpack(Vu, Vv, Size, TakeOdd)
804	// Vd = vshuff(Vu, Vv, Size, TakeOdd)
805	// Vd = vdeal(Vu, Vv, Size, TakeOdd)
806	// Vd = vdealb4w(Vu, Vv)
807
808	ArrayRef<int> lo(ArrayRef<int> Vuu) { return Vuu.take_front(N: Vuu.size() / `2`); }
809	ArrayRef<int> hi(ArrayRef<int> Vuu) { return Vuu.take_back(N: Vuu.size() / `2`); }
810
811	MaskT vshuffvdd(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Rt) {
812	int Len = Vu.size();
813	MaskT Vdd(`2` * Len);
814	std::copy(first: Vv.begin(), last: Vv.end(), result: Vdd.begin());
815	std::copy(first: Vu.begin(), last: Vu.end(), result: Vdd.begin() + Len);
816
817	auto Vd0 = MutableArrayRef<int>(Vdd).take_front(N: Len);
818	auto Vd1 = MutableArrayRef<int>(Vdd).take_back(N: Len);
819
820	for (int Offset = `1`; Offset < Len; Offset *= `2`) {
821	if ((Rt & Offset) == `0`)
822	continue;
823	for (int i = `0`; i != Len; ++i) {
824	if ((i & Offset) == `0`)
825	std::swap(a&: Vd1 [i], b&: Vd0 [i + Offset]);
826	}
827	}
828	return Vdd;
829	}
830
831	MaskT vdealvdd(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Rt) {
832	int Len = Vu.size();
833	MaskT Vdd(`2` * Len);
834	std::copy(first: Vv.begin(), last: Vv.end(), result: Vdd.begin());
835	std::copy(first: Vu.begin(), last: Vu.end(), result: Vdd.begin() + Len);
836
837	auto Vd0 = MutableArrayRef<int>(Vdd).take_front(N: Len);
838	auto Vd1 = MutableArrayRef<int>(Vdd).take_back(N: Len);
839
840	for (int Offset = Len / `2`; Offset > `0`; Offset /= `2`) {
841	if ((Rt & Offset) == `0`)
842	continue;
843	for (int i = `0`; i != Len; ++i) {
844	if ((i & Offset) == `0`)
845	std::swap(a&: Vd1 [i], b&: Vd0 [i + Offset]);
846	}
847	}
848	return Vdd;
849	}
850
851	MaskT vpack(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Size, bool TakeOdd) {
852	int Len = Vu.size();
853	MaskT Vd(Len);
854	auto Odd = static_cast<int>(TakeOdd);
855	for (int i = `0`, e = Len / (`2` * Size); i != e; ++i) {
856	for (int b = `0`; b != static_cast<int>(Size); ++b) {
857	// clang-format off
858	Vd [i * Size + b] = Vv [(`2` * i + Odd) * Size + b];
859	Vd [i * Size + b + Len / `2`] = Vu [(`2` * i + Odd) * Size + b];
860	// clang-format on
861	}
862	}
863	return Vd;
864	}
865
866	MaskT vshuff(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Size, bool TakeOdd) {
867	int Len = Vu.size();
868	MaskT Vd(Len);
869	auto Odd = static_cast<int>(TakeOdd);
870	for (int i = `0`, e = Len / (`2` * Size); i != e; ++i) {
871	for (int b = `0`; b != static_cast<int>(Size); ++b) {
872	Vd [(`2` * i + `0`) * Size + b] = Vv [(`2` * i + Odd) * Size + b];
873	Vd [(`2` * i + `1`) * Size + b] = Vu [(`2` * i + Odd) * Size + b];
874	}
875	}
876	return Vd;
877	}
878
879	MaskT vdeal(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Size, bool TakeOdd) {
880	int Len = Vu.size();
881	MaskT T = vdealvdd(Vu, Vv, Rt: Len - `2` * Size);
882	return vpack(Vu: hi(Vuu: T), Vv: lo(Vuu: T), Size, TakeOdd);
883	}
884
885	MaskT vdealb4w(ArrayRef<int> Vu, ArrayRef<int> Vv) {
886	int Len = Vu.size();
887	MaskT Vd(Len);
888	for (int i = `0`, e = Len / `4`; i != e; ++i) {
889	Vd [`0` * (Len / `4`) + i] = Vv [`4` * i + `0`];
890	Vd [`1` * (Len / `4`) + i] = Vv [`4` * i + `2`];
891	Vd [`2` * (Len / `4`) + i] = Vu [`4` * i + `0`];
892	Vd [`3` * (Len / `4`) + i] = Vu [`4` * i + `2`];
893	}
894	return Vd;
895	}
896
897	template <typename ShuffFunc, typename... OptArgs>
898	auto mask(ShuffFunc S, unsigned Length, OptArgs... args) -> MaskT {
899	MaskT Vu(Length), Vv(Length);
900	std::iota(first: Vu.begin(), last: Vu.end(), value: Length); // High
901	std::iota(first: Vv.begin(), last: Vv.end(), value: `0`); // Low
902	return S(Vu, Vv, args...);
903	}
904
905	} // namespace shuffles
906
907	// --------------------------------------------------------------------
908	// The HvxSelector class.
909
910	static const HexagonTargetLowering &getHexagonLowering(SelectionDAG &G) {
911	return static_cast<const HexagonTargetLowering&>(G.getTargetLoweringInfo());
912	}
913	static const HexagonSubtarget &getHexagonSubtarget(SelectionDAG &G) {
914	return G.getSubtarget<HexagonSubtarget>();
915	}
916
917	namespace llvm {
918	struct HvxSelector {
919	const HexagonTargetLowering &Lower;
920	HexagonDAGToDAGISel &ISel;
921	SelectionDAG &DAG;
922	const HexagonSubtarget &HST;
923	const unsigned HwLen;
924
925	HvxSelector(HexagonDAGToDAGISel &HS, SelectionDAG &G)
926	: Lower(getHexagonLowering(G)), ISel(HS), DAG(G),
927	HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {}
928
929	MVT getSingleVT(MVT ElemTy) const {
930	assert(ElemTy != MVT::i1 && "Use getBoolVT for predicates");
931	unsigned NumElems = HwLen / (ElemTy.getSizeInBits() / `8`);
932	return MVT::getVectorVT(VT: ElemTy, NumElements: NumElems);
933	}
934
935	MVT getPairVT(MVT ElemTy) const {
936	assert(ElemTy != MVT::i1); // Suspicious: there are no predicate pairs.
937	unsigned NumElems = (`2` * HwLen) / (ElemTy.getSizeInBits() / `8`);
938	return MVT::getVectorVT(VT: ElemTy, NumElements: NumElems);
939	}
940
941	MVT getBoolVT() const {
942	// Return HwLen x i1.
943	return MVT::getVectorVT(VT: MVT::i1, NumElements: HwLen);
944	}
945
946	void selectExtractSubvector(SDNode *N);
947	void selectShuffle(SDNode *N);
948	void selectRor(SDNode *N);
949	void selectVAlign(SDNode *N);
950
951	static SmallVector<uint32_t, `8`> getPerfectCompletions(ShuffleMask SM,
952	unsigned Width);
953	static SmallVector<uint32_t, `8`> completeToPerfect(
954	ArrayRef<uint32_t> Completions, unsigned Width);
955	static std::optional<int> rotationDistance(ShuffleMask SM, unsigned WrapAt);
956
957	private:
958	void select(SDNode *ISelN);
959	void materialize(const ResultStack &Results);
960
961	SDValue getConst32(int Val, const SDLoc &dl);
962	SDValue getVectorConstant(ArrayRef<uint8_t> Data, const SDLoc &dl);
963
964	enum : unsigned {
965	None,
966	PackMux,
967	};
968	OpRef concats(OpRef Va, OpRef Vb, ResultStack &Results);
969	OpRef funnels(OpRef Va, OpRef Vb, int Amount, ResultStack &Results);
970
971	OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
972	MutableArrayRef<int> NewMask, unsigned Options = None);
973	OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
974	MutableArrayRef<int> NewMask);
975	OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
976	ResultStack &Results);
977	OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
978	ResultStack &Results);
979
980	OpRef shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results);
981	OpRef shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
982	OpRef shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results);
983	OpRef shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
984
985	OpRef butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results);
986	OpRef contracting(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
987	OpRef expanding(ShuffleMask SM, OpRef Va, ResultStack &Results);
988	OpRef perfect(ShuffleMask SM, OpRef Va, ResultStack &Results);
989
990	bool selectVectorConstants(SDNode *N);
991	bool scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl, MVT ResTy,
992	SDValue Va, SDValue Vb, SDNode *N);
993	};
994	} // namespace llvm
995
996	static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,
997	MutableArrayRef<int> MaskR) {
998	unsigned VecLen = Mask.size();
999	assert(MaskL.size() == VecLen && MaskR.size() == VecLen);
1000	for (unsigned I = `0`; I != VecLen; ++I) {
1001	int M = Mask [I];
1002	if (M < `0`) {
1003	MaskL [I] = MaskR [I] = -`1`;
1004	} else if (unsigned(M) < VecLen) {
1005	MaskL [I] = M;
1006	MaskR [I] = -`1`;
1007	} else {
1008	MaskL [I] = -`1`;
1009	MaskR [I] = M-VecLen;
1010	}
1011	}
1012	}
1013
1014	static std::pair<int,unsigned> findStrip(ArrayRef<int> A, int Inc,
1015	unsigned MaxLen) {
1016	assert(A.size() > `0` && A.size() >= MaxLen);
1017	int F = A [`0`];
1018	int E = F;
1019	for (unsigned I = `1`; I != MaxLen; ++I) {
1020	if (A [I] - E != Inc)
1021	return { F, I };
1022	E = A [I];
1023	}
1024	return { F, MaxLen };
1025	}
1026
1027	static bool isUndef(ArrayRef<int> Mask) {
1028	for (int Idx : Mask)
1029	if (Idx != -`1`)
1030	return false;
1031	return true;
1032	}
1033
1034	static bool isIdentity(ArrayRef<int> Mask) {
1035	for (int I = `0`, E = Mask.size(); I != E; ++I) {
1036	int M = Mask [I];
1037	if (M >= `0` && M != I)
1038	return false;
1039	}
1040	return true;
1041	}
1042
1043	static bool isLowHalfOnly(ArrayRef<int> Mask) {
1044	int L = Mask.size();
1045	assert(L % `2` == `0`);
1046	// Check if the second half of the mask is all-undef.
1047	return llvm::all_of(Range: Mask.drop_front(N: L / `2`), P: [](int M) { return M < `0`; });
1048	}
1049
1050	static SmallVector<unsigned, `4`> getInputSegmentList(ShuffleMask SM,
1051	unsigned SegLen) {
1052	assert(isPowerOf2_32(SegLen));
1053	SmallVector<unsigned, `4`> SegList;
1054	if (SM.MaxSrc == -`1`)
1055	return SegList;
1056
1057	unsigned Shift = Log2_32(Value: SegLen);
1058	BitVector Segs(alignTo(Value: SM.MaxSrc + `1`, Align: SegLen) >> Shift);
1059
1060	for (int M : SM.Mask) {
1061	if (M >= `0`)
1062	Segs.set(M >> Shift);
1063	}
1064
1065	llvm::append_range(C&: SegList, R: Segs.set_bits());
1066	return SegList;
1067	}
1068
1069	static SmallVector<unsigned, `4`> getOutputSegmentMap(ShuffleMask SM,
1070	unsigned SegLen) {
1071	// Calculate the layout of the output segments in terms of the input
1072	// segments.
1073	// For example [1,3,1,0] means that the output consists of 4 output
1074	// segments, where the first output segment has only elements of the
1075	// input segment at index 1. The next output segment only has elements
1076	// of the input segment 3, etc.
1077	// If an output segment only has undef elements, the value will be ~0u.
1078	// If an output segment has elements from more than one input segment,
1079	// the corresponding value will be ~1u.
1080	unsigned MaskLen = SM.Mask.size();
1081	assert(MaskLen % SegLen == `0`);
1082	SmallVector<unsigned, `4`> Map(MaskLen / SegLen);
1083
1084	for (int S = `0`, E = Map.size(); S != E; ++S) {
1085	unsigned Idx = ~`0u`;
1086	for (int I = `0`; I != static_cast<int>(SegLen); ++I) {
1087	int M = SM.Mask [S*SegLen + I];
1088	if (M < `0`)
1089	continue;
1090	unsigned G = M / SegLen; // Input segment of this element.
1091	if (Idx == ~`0u`) {
1092	Idx = G;
1093	} else if (Idx != G) {
1094	Idx = ~`1u`;
1095	break;
1096	}
1097	}
1098	Map [S] = Idx;
1099	}
1100
1101	return Map;
1102	}
1103
1104	static void packSegmentMask(ArrayRef<int> Mask, ArrayRef<unsigned> OutSegMap,
1105	unsigned SegLen, MutableArrayRef<int> PackedMask) {
1106	SmallVector<unsigned, `4`> InvMap;
1107	for (int I = OutSegMap.size() - `1`; I >= `0`; --I) {
1108	unsigned S = OutSegMap [I];
1109	assert(S != ~`0u` && "Unexpected undef");
1110	assert(S != ~`1u` && "Unexpected multi");
1111	if (InvMap.size() <= S)
1112	InvMap.resize(N: S+`1`);
1113	InvMap [S] = I;
1114	}
1115
1116	unsigned Shift = Log2_32(Value: SegLen);
1117	for (int I = `0`, E = Mask.size(); I != E; ++I) {
1118	int M = Mask [I];
1119	if (M >= `0`) {
1120	int OutIdx = InvMap [M >> Shift];
1121	M = (M & (SegLen-`1`)) + SegLen*OutIdx;
1122	}
1123	PackedMask [I] = M;
1124	}
1125	}
1126
1127	bool HvxSelector::selectVectorConstants(SDNode *N) {
1128	// Constant vectors are generated as loads from constant pools or as
1129	// splats of a constant value. Since they are generated during the
1130	// selection process, the main selection algorithm is not aware of them.
1131	// Select them directly here.
1132	SmallVector<SDNode*,`4`> Nodes;
1133	SetVector<SDNode*> WorkQ;
1134
1135	// The DAG can change (due to CSE) during selection, so cache all the
1136	// unselected nodes first to avoid traversing a mutating DAG.
1137	WorkQ.insert(X: N);
1138	for (unsigned i = `0`; i != WorkQ.size(); ++i) {
1139	SDNode *W = WorkQ [i];
1140	if (!W->isMachineOpcode() && W->getOpcode() == HexagonISD::ISEL)
1141	Nodes.push_back(Elt: W);
1142	for (unsigned j = `0`, f = W->getNumOperands(); j != f; ++j)
1143	WorkQ.insert(X: W->getOperand(Num: j).getNode());
1144	}
1145
1146	for (SDNode *L : Nodes)
1147	select(ISelN: L);
1148
1149	return !Nodes.empty();
1150	}
1151
1152	void HvxSelector::materialize(const ResultStack &Results) {
1153	DEBUG_WITH_TYPE("isel", {
1154	dbgs() << "Materializing\n";
1155	Results.print(dbgs(), DAG);
1156	});
1157	if (Results.empty())
1158	return;
1159	const SDLoc &dl(Results.InpNode);
1160	std::vector<SDValue> Output;
1161
1162	for (unsigned I = `0`, E = Results.size(); I != E; ++I) {
1163	const NodeTemplate &Node = Results [I];
1164	std::vector<SDValue> Ops;
1165	for (const OpRef &R : Node.Ops) {
1166	assert(R.isValid());
1167	if (R.isValue()) {
1168	Ops.push_back(x: R.OpV);
1169	continue;
1170	}
1171	if (R.OpN & OpRef::Undef) {
1172	MVT::SimpleValueType SVT = MVT::SimpleValueType(R.OpN & OpRef::Index);
1173	Ops.push_back(x: ISel.selectUndef(dl, ResTy: MVT (SVT)));
1174	continue;
1175	}
1176	// R is an index of a result.
1177	unsigned Part = R.OpN & OpRef::Whole;
1178	int Idx = SignExtend32(X: R.OpN & OpRef::Index, B: OpRef::IndexBits);
1179	if (Idx < `0`)
1180	Idx += I;
1181	assert(Idx >= `0` && unsigned(Idx) < Output.size());
1182	SDValue Op = Output [Idx];
1183	MVT OpTy = Op.getValueType().getSimpleVT();
1184	if (Part != OpRef::Whole) {
1185	assert(Part == OpRef::LoHalf \|\| Part == OpRef::HiHalf);
1186	MVT HalfTy = MVT::getVectorVT(VT: OpTy.getVectorElementType(),
1187	NumElements: OpTy.getVectorNumElements()/`2`);
1188	unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
1189	: Hexagon::vsub_hi;
1190	Op = DAG.getTargetExtractSubreg(SRIdx: Sub, DL: dl, VT: HalfTy, Operand: Op);
1191	}
1192	Ops.push_back(x: Op);
1193	} // for (Node : Results)
1194
1195	assert(Node.Ty != MVT::Other);
1196	SDNode *ResN = (Node.Opc == TargetOpcode::COPY)
1197	? Ops.front().getNode()
1198	: DAG.getMachineNode(Opcode: Node.Opc, dl, VT: Node.Ty, Ops);
1199	Output.push_back(x: SDValue (ResN, `0`));
1200	}
1201
1202	SDNode *OutN = Output.back().getNode();
1203	SDNode *InpN = Results.InpNode;
1204	DEBUG_WITH_TYPE("isel", {
1205	dbgs() << "Generated node:\n";
1206	OutN->dumpr(&DAG);
1207	});
1208
1209	ISel.ReplaceNode(F: InpN, T: OutN);
1210	selectVectorConstants(N: OutN);
1211	DAG.RemoveDeadNodes();
1212	}
1213
1214	OpRef HvxSelector::concats(OpRef Lo, OpRef Hi, ResultStack &Results) {
1215	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1216	const SDLoc &dl(Results.InpNode);
1217	Results.push(Opc: TargetOpcode::REG_SEQUENCE, Ty: getPairVT(ElemTy: MVT::i8), Ops: {
1218	getConst32(Val: Hexagon::HvxWRRegClassID, dl),
1219	Lo, getConst32(Val: Hexagon::vsub_lo, dl),
1220	Hi, getConst32(Val: Hexagon::vsub_hi, dl),
1221	});
1222	return OpRef::res(N: Results.top());
1223	}
1224
1225	OpRef HvxSelector::funnels(OpRef Va, OpRef Vb, int Amount,
1226	ResultStack &Results) {
1227	// Do a funnel shift towards the low end (shift right) by Amount bytes.
1228	// If Amount < 0, treat it as shift left, i.e. do a shift right by
1229	// Amount + HwLen.
1230	auto VecLen = static_cast<int>(HwLen);
1231
1232	if (Amount == `0`)
1233	return Va;
1234	if (Amount == VecLen)
1235	return Vb;
1236
1237	MVT Ty = getSingleVT(ElemTy: MVT::i8);
1238	const SDLoc &dl(Results.InpNode);
1239
1240	if (Amount < `0`)
1241	Amount += VecLen;
1242	if (Amount > VecLen) {
1243	Amount -= VecLen;
1244	std::swap(a&: Va, b&: Vb);
1245	}
1246
1247	if (isUInt<`3`>(x: Amount)) {
1248	SDValue A = getConst32(Val: Amount, dl);
1249	Results.push(Opc: Hexagon::V6_valignbi, Ty, Ops: {Vb, Va, A});
1250	} else if (isUInt<`3`>(x: VecLen - Amount)) {
1251	SDValue A = getConst32(Val: VecLen - Amount, dl);
1252	Results.push(Opc: Hexagon::V6_vlalignbi, Ty, Ops: {Vb, Va, A});
1253	} else {
1254	SDValue A = getConst32(Val: Amount, dl);
1255	Results.push(Opc: Hexagon::A2_tfrsi, Ty, Ops: {A});
1256	Results.push(Opc: Hexagon::V6_valignb, Ty, Ops: {Vb, Va, OpRef::res(N: -`1`)});
1257	}
1258	return OpRef::res(N: Results.top());
1259	}
1260
1261	// Va, Vb are single vectors. If SM only uses two vector halves from Va/Vb,
1262	// pack these halves into a single vector, and remap SM into NewMask to use
1263	// the new vector instead.
1264	OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
1265	ResultStack &Results, MutableArrayRef<int> NewMask,
1266	unsigned Options) {
1267	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1268	if (!Va.isValid() \|\| !Vb.isValid())
1269	return OpRef::fail();
1270
1271	if (Vb.isUndef()) {
1272	llvm::copy(Range&: SM.Mask, Out: NewMask.begin());
1273	return Va;
1274	}
1275	if (Va.isUndef()) {
1276	llvm::copy(Range&: SM.Mask, Out: NewMask.begin());
1277	ShuffleVectorSDNode::commuteMask(Mask: NewMask);
1278	return Vb;
1279	}
1280
1281	MVT Ty = getSingleVT(ElemTy: MVT::i8);
1282	MVT PairTy = getPairVT(ElemTy: MVT::i8);
1283	OpRef Inp[`2`] = {Va, Vb};
1284	unsigned VecLen = SM.Mask.size();
1285
1286	auto valign = [this](OpRef Lo, OpRef Hi, unsigned Amt, MVT Ty,
1287	ResultStack &Results) {
1288	if (Amt == `0`)
1289	return Lo;
1290	const SDLoc &dl(Results.InpNode);
1291	if (isUInt<`3`>(x: Amt) \|\| isUInt<`3`>(x: HwLen - Amt)) {
1292	bool IsRight = isUInt<`3`>(x: Amt); // Right align.
1293	SDValue S = getConst32(Val: IsRight ? Amt : HwLen - Amt, dl);
1294	unsigned Opc = IsRight ? Hexagon::V6_valignbi : Hexagon::V6_vlalignbi;
1295	Results.push(Opc, Ty, Ops: {Hi, Lo, S});
1296	return OpRef::res(N: Results.top());
1297	}
1298	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: Amt, dl)});
1299	OpRef A = OpRef::res(N: Results.top());
1300	Results.push(Opc: Hexagon::V6_valignb, Ty, Ops: {Hi, Lo, A});
1301	return OpRef::res(N: Results.top());
1302	};
1303
1304	// Segment is a vector half.
1305	unsigned SegLen = HwLen / `2`;
1306
1307	// Check if we can shuffle vector halves around to get the used elements
1308	// into a single vector.
1309	shuffles::MaskT MaskH(SM.Mask);
1310	SmallVector<unsigned, `4`> SegList = getInputSegmentList(SM: SM.Mask, SegLen);
1311	unsigned SegCount = SegList.size();
1312	SmallVector<unsigned, `4`> SegMap = getOutputSegmentMap(SM: SM.Mask, SegLen);
1313
1314	if (SegList.empty())
1315	return OpRef::undef(Ty);
1316
1317	// NOTE:
1318	// In the following part of the function, where the segments are rearranged,
1319	// the shuffle mask SM can be of any length that is a multiple of a vector
1320	// (i.e. a multiple of 2SegLen), and non-zero.*
1321	// The output segment map is computed, and it may have any even number of
1322	// entries, but the rearrangement of input segments will be done based only
1323	// on the first two (non-undef) entries in the segment map.
1324	// For example, if the output map is 3, 1, 1, 3 (it can have at most two
1325	// distinct entries!), the segments 1 and 3 of Va/Vb will be packaged into
1326	// a single vector V = 3:1. The output mask will then be updated to use
1327	// seg(0,V), seg(1,V), seg(1,V), seg(0,V).
1328	//
1329	// Picking the segments based on the output map is an optimization. For
1330	// correctness it is only necessary that Seg0 and Seg1 are the two input
1331	// segments that are used in the output.
1332
1333	unsigned Seg0 = ~`0u`, Seg1 = ~`0u`;
1334	for (unsigned X : SegMap) {
1335	if (X == ~`0u`)
1336	continue;
1337	if (Seg0 == ~`0u`)
1338	Seg0 = X;
1339	else if (Seg1 != ~`0u`)
1340	break;
1341	if (X == ~`1u` \|\| X != Seg0)
1342	Seg1 = X;
1343	}
1344
1345	if (SegCount == `1`) {
1346	unsigned SrcOp = SegList [`0`] / `2`;
1347	for (int I = `0`; I != static_cast<int>(VecLen); ++I) {
1348	int M = SM.Mask [I];
1349	if (M >= `0`) {
1350	M -= SrcOp * HwLen;
1351	assert(M >= `0`);
1352	}
1353	NewMask [I] = M;
1354	}
1355	return Inp[SrcOp];
1356	}
1357
1358	if (SegCount == `2`) {
1359	// Seg0 should not be undef here: this would imply a SegList
1360	// with <= 1 elements, which was checked earlier.
1361	assert(Seg0 != ~`0u`);
1362
1363	// If Seg0 or Seg1 are "multi-defined", pick them from the input
1364	// segment list instead.
1365	if (Seg0 == ~`1u` \|\| Seg1 == ~`1u`) {
1366	if (Seg0 == Seg1) {
1367	Seg0 = SegList [`0`];
1368	Seg1 = SegList [`1`];
1369	} else if (Seg0 == ~`1u`) {
1370	Seg0 = SegList [`0`] != Seg1 ? SegList [`0`] : SegList [`1`];
1371	} else {
1372	assert(Seg1 == ~`1u`);
1373	Seg1 = SegList [`0`] != Seg0 ? SegList [`0`] : SegList [`1`];
1374	}
1375	}
1376	assert(Seg0 != ~`1u` && Seg1 != ~`1u`);
1377
1378	assert(Seg0 != Seg1 && "Expecting different segments");
1379	const SDLoc &dl(Results.InpNode);
1380	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: SegLen, dl)});
1381	OpRef HL = OpRef::res(N: Results.top());
1382
1383	// Va = AB, Vb = CD
1384
1385	if (Seg0 / `2` == Seg1 / `2`) {
1386	// Same input vector.
1387	Va = Inp[Seg0 / `2`];
1388	if (Seg0 > Seg1) {
1389	// Swap halves.
1390	Results.push(Opc: Hexagon::V6_vror, Ty, Ops: {Inp[Seg0 / `2`], HL});
1391	Va = OpRef::res(N: Results.top());
1392	}
1393	packSegmentMask(Mask: SM.Mask, OutSegMap: {Seg0, Seg1}, SegLen, PackedMask: MaskH);
1394	} else if (Seg0 % `2` == Seg1 % `2`) {
1395	// Picking AC, BD, CA, or DB.
1396	// vshuff(CD,AB,HL) -> BD:AC
1397	// vshuff(AB,CD,HL) -> DB:CA
1398	auto Vs = (Seg0 == `0` \|\| Seg0 == `1`) ? std::make_pair(x&: Vb, y&: Va) // AC or BD
1399	: std::make_pair(x&: Va, y&: Vb); // CA or DB
1400	Results.push(Opc: Hexagon::V6_vshuffvdd, Ty: PairTy, Ops: {Vs.first, Vs.second, HL});
1401	OpRef P = OpRef::res(N: Results.top());
1402	Va = (Seg0 == `0` \|\| Seg0 == `2`) ? OpRef::lo(R: P) : OpRef::hi(R: P);
1403	packSegmentMask(Mask: SM.Mask, OutSegMap: {Seg0, Seg1}, SegLen, PackedMask: MaskH);
1404	} else {
1405	// Picking AD, BC, CB, or DA.
1406	if ((Seg0 == `0` && Seg1 == `3`) \|\| (Seg0 == `2` && Seg1 == `1`)) {
1407	// AD or BC: this can be done using vmux.
1408	// Q = V6_pred_scalar2 SegLen
1409	// V = V6_vmux Q, (Va, Vb) or (Vb, Va)
1410	Results.push(Opc: Hexagon::V6_pred_scalar2, Ty: getBoolVT(), Ops: {HL});
1411	OpRef Qt = OpRef::res(N: Results.top());
1412	auto Vs = (Seg0 == `0`) ? std::make_pair(x&: Va, y&: Vb) // AD
1413	: std::make_pair(x&: Vb, y&: Va); // CB
1414	Results.push(Opc: Hexagon::V6_vmux, Ty, Ops: {Qt, Vs.first, Vs.second});
1415	Va = OpRef::res(N: Results.top());
1416	packSegmentMask(Mask: SM.Mask, OutSegMap: {Seg0, Seg1}, SegLen, PackedMask: MaskH);
1417	} else {
1418	// BC or DA: this could be done via valign by SegLen.
1419	// Do nothing here, because valign (if possible) will be generated
1420	// later on (make sure the Seg0 values are as expected).
1421	assert(Seg0 == `1` \|\| Seg0 == `3`);
1422	}
1423	}
1424	}
1425
1426	// Check if the arguments can be packed by valign(Va,Vb) or valign(Vb,Va).
1427	// FIXME: maybe remove this?
1428	ShuffleMask SMH(MaskH);
1429	assert(SMH.Mask.size() == VecLen);
1430	shuffles::MaskT MaskA(SMH.Mask);
1431
1432	if (SMH.MaxSrc - SMH.MinSrc >= static_cast<int>(HwLen)) {
1433	// valign(Lo=Va,Hi=Vb) won't work. Try swapping Va/Vb.
1434	shuffles::MaskT Swapped(SMH.Mask);
1435	ShuffleVectorSDNode::commuteMask(Mask: Swapped);
1436	ShuffleMask SW(Swapped);
1437	if (SW.MaxSrc - SW.MinSrc < static_cast<int>(HwLen)) {
1438	MaskA.assign(in_start: SW.Mask.begin(), in_end: SW.Mask.end());
1439	std::swap(a&: Va, b&: Vb);
1440	}
1441	}
1442	ShuffleMask SMA(MaskA);
1443	assert(SMA.Mask.size() == VecLen);
1444
1445	if (SMA.MaxSrc - SMA.MinSrc < static_cast<int>(HwLen)) {
1446	int ShiftR = SMA.MinSrc;
1447	if (ShiftR >= static_cast<int>(HwLen)) {
1448	Va = Vb;
1449	Vb = OpRef::undef(Ty);
1450	ShiftR -= HwLen;
1451	}
1452	OpRef RetVal = valign (Va, Vb, ShiftR, Ty, Results);
1453
1454	for (int I = `0`; I != static_cast<int>(VecLen); ++I) {
1455	int M = SMA.Mask [I];
1456	if (M != -`1`)
1457	M -= SMA.MinSrc;
1458	NewMask [I] = M;
1459	}
1460	return RetVal;
1461	}
1462
1463	// By here, packing by segment (half-vector) shuffling, and vector alignment
1464	// failed. Try vmux.
1465	// Note: since this is using the original mask, Va and Vb must not have been
1466	// modified.
1467
1468	if (Options & PackMux) {
1469	// If elements picked from Va and Vb have all different (source) indexes
1470	// (relative to the start of the argument), do a mux, and update the mask.
1471	BitVector Picked(HwLen);
1472	SmallVector<uint8_t,`128`> MuxBytes(HwLen);
1473	bool CanMux = true;
1474	for (int I = `0`; I != static_cast<int>(VecLen); ++I) {
1475	int M = SM.Mask [I];
1476	if (M == -`1`)
1477	continue;
1478	if (M >= static_cast<int>(HwLen))
1479	M -= HwLen;
1480	else
1481	MuxBytes [M] = `0xFF`;
1482	if (Picked [M]) {
1483	CanMux = false;
1484	break;
1485	}
1486	NewMask [I] = M;
1487	}
1488	if (CanMux)
1489	return vmuxs(Bytes: MuxBytes, Va, Vb, Results);
1490	}
1491	return OpRef::fail();
1492	}
1493
1494	// Va, Vb are vector pairs. If SM only uses two single vectors from Va/Vb,
1495	// pack these vectors into a pair, and remap SM into NewMask to use the
1496	// new pair instead.
1497	OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
1498	ResultStack &Results, MutableArrayRef<int> NewMask) {
1499	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1500	SmallVector<unsigned, `4`> SegList = getInputSegmentList(SM: SM.Mask, SegLen: HwLen);
1501	if (SegList.empty())
1502	return OpRef::undef(Ty: getPairVT(ElemTy: MVT::i8));
1503
1504	// If more than two halves are used, bail.
1505	// TODO: be more aggressive here?
1506	unsigned SegCount = SegList.size();
1507	if (SegCount > `2`)
1508	return OpRef::fail();
1509
1510	MVT HalfTy = getSingleVT(ElemTy: MVT::i8);
1511
1512	OpRef Inp[`2`] = { Va, Vb };
1513	OpRef Out[`2`] = { OpRef::undef(Ty: HalfTy), OpRef::undef(Ty: HalfTy) };
1514
1515	// Really make sure we have at most 2 vectors used in the mask.
1516	assert(SegCount <= `2`);
1517
1518	for (int I = `0`, E = SegList.size(); I != E; ++I) {
1519	unsigned S = SegList [I];
1520	OpRef Op = Inp[S / `2`];
1521	Out[I] = (S & `1`) ? OpRef::hi(R: Op) : OpRef::lo(R: Op);
1522	}
1523
1524	// NOTE: Using SegList as the packing map here (not SegMap). This works,
1525	// because we're not concerned here about the order of the segments (i.e.
1526	// single vectors) in the output pair. Changing the order of vectors is
1527	// free (as opposed to changing the order of vector halves as in packs),
1528	// and so there is no extra cost added in case the order needs to be
1529	// changed later.
1530	packSegmentMask(Mask: SM.Mask, OutSegMap: SegList, SegLen: HwLen, PackedMask: NewMask);
1531	return concats(Lo: Out[`0`], Hi: Out[`1`], Results);
1532	}
1533
1534	OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
1535	ResultStack &Results) {
1536	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1537	MVT ByteTy = getSingleVT(ElemTy: MVT::i8);
1538	MVT BoolTy = MVT::getVectorVT(VT: MVT::i1, NumElements: HwLen);
1539	const SDLoc &dl(Results.InpNode);
1540	SDValue B = getVectorConstant(Data: Bytes, dl);
1541	Results.push(Opc: Hexagon::V6_vd0, Ty: ByteTy, Ops: {});
1542	Results.push(Opc: Hexagon::V6_veqb, Ty: BoolTy, Ops: {OpRef (B), OpRef::res(N: -`1`)});
1543	Results.push(Opc: Hexagon::V6_vmux, Ty: ByteTy, Ops: {OpRef::res(N: -`1`), Vb, Va});
1544	return OpRef::res(N: Results.top());
1545	}
1546
1547	OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
1548	ResultStack &Results) {
1549	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1550	size_t S = Bytes.size() / `2`;
1551	OpRef L = vmuxs(Bytes: Bytes.take_front(N: S), Va: OpRef::lo(R: Va), Vb: OpRef::lo(R: Vb), Results);
1552	OpRef H = vmuxs(Bytes: Bytes.drop_front(N: S), Va: OpRef::hi(R: Va), Vb: OpRef::hi(R: Vb), Results);
1553	return concats(Lo: L, Hi: H, Results);
1554	}
1555
1556	OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1557	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1558	unsigned VecLen = SM.Mask.size();
1559	assert(HwLen == VecLen);
1560	(void)VecLen;
1561	assert(all_of(SM.Mask, [this](int M) { return M == -`1` \|\| M < int(HwLen); }));
1562
1563	if (isIdentity(Mask: SM.Mask))
1564	return Va;
1565	if (isUndef(Mask: SM.Mask))
1566	return OpRef::undef(Ty: getSingleVT(ElemTy: MVT::i8));
1567
1568	// First, check for rotations.
1569	if (auto Dist = rotationDistance(SM, WrapAt: VecLen)) {
1570	OpRef Rotate = funnels(Va, Vb: Va, Amount: *Dist, Results);
1571	if (Rotate.isValid())
1572	return Rotate;
1573	}
1574	unsigned HalfLen = HwLen / `2`;
1575	assert(isPowerOf2_32(HalfLen));
1576
1577	// Handle special case where the output is the same half of the input
1578	// repeated twice, i.e. if Va = AB, then handle the output of AA or BB.
1579	std::pair<int, unsigned> Strip1 = findStrip(A: SM.Mask, Inc: `1`, MaxLen: HalfLen);
1580	if ((Strip1.first & ~HalfLen) == `0` && Strip1.second == HalfLen) {
1581	std::pair<int, unsigned> Strip2 =
1582	findStrip(A: SM.Mask.drop_front(N: HalfLen), Inc: `1`, MaxLen: HalfLen);
1583	if (Strip1 == Strip2) {
1584	const SDLoc &dl(Results.InpNode);
1585	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: HalfLen, dl)});
1586	Results.push(Opc: Hexagon::V6_vshuffvdd, Ty: getPairVT(ElemTy: MVT::i8),
1587	Ops: {Va, Va, OpRef::res(N: Results.top())});
1588	OpRef S = OpRef::res(N: Results.top());
1589	return (Strip1.first == `0`) ? OpRef::lo(R: S) : OpRef::hi(R: S);
1590	}
1591	}
1592
1593	OpRef P = perfect(SM, Va, Results);
1594	if (P.isValid())
1595	return P;
1596	return butterfly(SM, Va, Results);
1597	}
1598
1599	OpRef HvxSelector::shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb,
1600	ResultStack &Results) {
1601	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1602	if (isUndef(Mask: SM.Mask))
1603	return OpRef::undef(Ty: getSingleVT(ElemTy: MVT::i8));
1604
1605	OpRef C = contracting(SM, Va, Vb, Results);
1606	if (C.isValid())
1607	return C;
1608
1609	int VecLen = SM.Mask.size();
1610	shuffles::MaskT PackedMask(VecLen);
1611	OpRef P = packs(SM, Va, Vb, Results, NewMask: PackedMask);
1612	if (P.isValid())
1613	return shuffs1(SM: ShuffleMask (PackedMask), Va: P, Results);
1614
1615	// TODO: Before we split the mask, try perfect shuffle on concatenated
1616	// operands.
1617
1618	shuffles::MaskT MaskL(VecLen), MaskR(VecLen);
1619	splitMask(Mask: SM.Mask, MaskL, MaskR);
1620
1621	OpRef L = shuffs1(SM: ShuffleMask (MaskL), Va, Results);
1622	OpRef R = shuffs1(SM: ShuffleMask (MaskR), Va: Vb, Results);
1623	if (!L.isValid() \|\| !R.isValid())
1624	return OpRef::fail();
1625
1626	SmallVector<uint8_t, `128`> Bytes(VecLen);
1627	for (int I = `0`; I != VecLen; ++I) {
1628	if (MaskL [I] != -`1`)
1629	Bytes [I] = `0xFF`;
1630	}
1631	return vmuxs(Bytes, Va: L, Vb: R, Results);
1632	}
1633
1634	OpRef HvxSelector::shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1635	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1636	int VecLen = SM.Mask.size();
1637
1638	if (isIdentity(Mask: SM.Mask))
1639	return Va;
1640	if (isUndef(Mask: SM.Mask))
1641	return OpRef::undef(Ty: getPairVT(ElemTy: MVT::i8));
1642
1643	shuffles::MaskT PackedMask(VecLen);
1644	OpRef P = packs(SM, Va: OpRef::lo(R: Va), Vb: OpRef::hi(R: Va), Results, NewMask: PackedMask);
1645	if (P.isValid()) {
1646	ShuffleMask PM(PackedMask);
1647	OpRef E = expanding(SM: PM, Va: P, Results);
1648	if (E.isValid())
1649	return E;
1650
1651	OpRef L = shuffs1(SM: PM.lo(), Va: P, Results);
1652	OpRef H = shuffs1(SM: PM.hi(), Va: P, Results);
1653	if (L.isValid() && H.isValid())
1654	return concats(Lo: L, Hi: H, Results);
1655	}
1656
1657	if (!isLowHalfOnly(Mask: SM.Mask)) {
1658	// Doing a perfect shuffle on a low-half mask (i.e. where the upper half
1659	// is all-undef) may produce a perfect shuffle that generates legitimate
1660	// upper half. This isn't wrong, but if the perfect shuffle was possible,
1661	// then there is a good chance that a shorter (contracting) code may be
1662	// used as well (e.g. V6_vshuffeb, etc).
1663	OpRef R = perfect(SM, Va, Results);
1664	if (R.isValid())
1665	return R;
1666	// TODO commute the mask and try the opposite order of the halves.
1667	}
1668
1669	OpRef L = shuffs2(SM: SM.lo(), Va: OpRef::lo(R: Va), Vb: OpRef::hi(R: Va), Results);
1670	OpRef H = shuffs2(SM: SM.hi(), Va: OpRef::lo(R: Va), Vb: OpRef::hi(R: Va), Results);
1671	if (L.isValid() && H.isValid())
1672	return concats(Lo: L, Hi: H, Results);
1673
1674	return OpRef::fail();
1675	}
1676
1677	OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb,
1678	ResultStack &Results) {
1679	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1680	if (isUndef(Mask: SM.Mask))
1681	return OpRef::undef(Ty: getPairVT(ElemTy: MVT::i8));
1682
1683	int VecLen = SM.Mask.size();
1684	SmallVector<int,`256`> PackedMask(VecLen);
1685	OpRef P = packp(SM, Va, Vb, Results, NewMask: PackedMask);
1686	if (P.isValid())
1687	return shuffp1(SM: ShuffleMask (PackedMask), Va: P, Results);
1688
1689	SmallVector<int,`256`> MaskL(VecLen), MaskR(VecLen);
1690	splitMask(Mask: SM.Mask, MaskL, MaskR);
1691
1692	OpRef L = shuffp1(SM: ShuffleMask (MaskL), Va, Results);
1693	OpRef R = shuffp1(SM: ShuffleMask (MaskR), Va: Vb, Results);
1694	if (!L.isValid() \|\| !R.isValid())
1695	return OpRef::fail();
1696
1697	// Mux the results.
1698	SmallVector<uint8_t,`256`> Bytes(VecLen);
1699	for (int I = `0`; I != VecLen; ++I) {
1700	if (MaskL [I] != -`1`)
1701	Bytes [I] = `0xFF`;
1702	}
1703	return vmuxp(Bytes, Va: L, Vb: R, Results);
1704	}
1705
1706	namespace {
1707	struct Deleter : public SelectionDAG::DAGNodeDeletedListener {
1708	template <typename T>
1709	Deleter(SelectionDAG &D, T &C)
1710	: SelectionDAG::DAGNodeDeletedListener(D, [&C] (SDNode N, SDNode E) {
1711	C.erase(N);
1712	}) {}
1713	};
1714
1715	template <typename T>
1716	struct NullifyingVector : public T {
1717	DenseMap<SDNode, SDNode*> Refs;
1718	NullifyingVector(T &&V) : T(V) {
1719	for (unsigned i = `0`, e = T::size(); i != e; ++i) {
1720	SDNode &N = T::operator*[](i);
1721	Refs [N] = &N;
1722	}
1723	}
1724	void erase(SDNode *N) {
1725	auto F = Refs.find(Val: N);
1726	if (F != Refs.end())
1727	F ->second = nullptr*;
1728	}
1729	};
1730	}
1731
1732	void HvxSelector::select(SDNode *ISelN) {
1733	// What's important here is to select the right set of nodes. The main
1734	// selection algorithm loops over nodes in a topological order, i.e. users
1735	// are visited before their operands.
1736	//
1737	// It is an error to have an unselected node with a selected operand, and
1738	// there is an assertion in the main selector code to enforce that.
1739	//
1740	// Such a situation could occur if we selected a node, which is both a
1741	// subnode of ISelN, and a subnode of an unrelated (and yet unselected)
1742	// node in the DAG.
1743	assert(ISelN->getOpcode() == HexagonISD::ISEL);
1744	SDNode *N0 = ISelN->getOperand(Num: `0`).getNode();
1745
1746	// There could have been nodes created (i.e. inserted into the DAG)
1747	// that are now dead. Remove them, in case they use any of the nodes
1748	// to select (and make them look shared).
1749	DAG.RemoveDeadNodes();
1750
1751	SetVector<SDNode *> SubNodes;
1752
1753	if (!N0->isMachineOpcode()) {
1754	// Don't want to select N0 if it's shared with another node, except if
1755	// it's shared with other ISELs.
1756	auto IsISelN = [](SDNode T) { return* T->getOpcode() == HexagonISD::ISEL; };
1757	if (llvm::all_of(Range: N0->users(), P: IsISelN))
1758	SubNodes.insert(X: N0);
1759	}
1760	if (SubNodes.empty()) {
1761	ISel.ReplaceNode(F: ISelN, T: N0);
1762	return;
1763	}
1764
1765	// Need to manually select the nodes that are dominated by the ISEL. Other
1766	// nodes are reachable from the rest of the DAG, and so will be selected
1767	// by the DAG selection routine.
1768	SetVector<SDNode*> Dom, NonDom;
1769	Dom.insert(X: N0);
1770
1771	auto IsDomRec = [&Dom, &NonDom] (SDNode T, auto* Rec) -> bool {
1772	if (Dom.count(key: T))
1773	return true;
1774	if (T->use_empty() \|\| NonDom.count(key: T))
1775	return false;
1776	for (SDNode *U : T->users()) {
1777	// If T is reachable from a known non-dominated node, then T itself
1778	// is non-dominated.
1779	if (!Rec(U, Rec)) {
1780	NonDom.insert(X: T);
1781	return false;
1782	}
1783	}
1784	Dom.insert(X: T);
1785	return true;
1786	};
1787
1788	auto IsDom = [&IsDomRec] (SDNode T) { return* IsDomRec (T, IsDomRec); };
1789
1790	// Add the rest of nodes dominated by ISEL to SubNodes.
1791	for (unsigned I = `0`; I != SubNodes.size(); ++I) {
1792	for (SDValue Op : SubNodes [I]->ops()) {
1793	SDNode *O = Op.getNode();
1794	if (IsDom (O))
1795	SubNodes.insert(X: O);
1796	}
1797	}
1798
1799	// Do a topological sort of nodes from Dom.
1800	SetVector<SDNode*> TmpQ;
1801
1802	std::map<SDNode , unsigned*> OpCount;
1803	for (SDNode *T : Dom) {
1804	unsigned NumDomOps = llvm::count_if(Range: T->ops(), P: [&Dom](const SDUse &U) {
1805	return Dom.count(key: U.getNode());
1806	});
1807
1808	OpCount.insert(x: {T, NumDomOps});
1809	if (NumDomOps == `0`)
1810	TmpQ.insert(X: T);
1811	}
1812
1813	for (unsigned I = `0`; I != TmpQ.size(); ++I) {
1814	SDNode *S = TmpQ [I];
1815	for (SDNode *U : S->users()) {
1816	if (U == ISelN)
1817	continue;
1818	auto F = OpCount.find(x: U);
1819	assert(F != OpCount.end());
1820	if (F ->second > `0` && !--F ->second)
1821	TmpQ.insert(X: F ->first);
1822	}
1823	}
1824
1825	// Remove the marker.
1826	ISel.ReplaceNode(F: ISelN, T: N0);
1827
1828	assert(SubNodes.size() == TmpQ.size());
1829	NullifyingVector<decltype(TmpQ)::vector_type> Queue(TmpQ.takeVector());
1830
1831	Deleter DUQ(DAG, Queue);
1832	for (SDNode *S : reverse(C&: Queue)) {
1833	if (S == nullptr)
1834	continue;
1835	DEBUG_WITH_TYPE("isel", {dbgs() << "HVX selecting: "; S->dump(&DAG);});
1836	ISel.Select(N: S);
1837	}
1838	}
1839
1840	bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
1841	MVT ResTy, SDValue Va, SDValue Vb,
1842	SDNode *N) {
1843	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1844	MVT ElemTy = ResTy.getVectorElementType();
1845	assert(ElemTy == MVT::i8);
1846	unsigned VecLen = Mask.size();
1847	bool HavePairs = (`2`*HwLen == VecLen);
1848	MVT SingleTy = getSingleVT(ElemTy: MVT::i8);
1849
1850	// The prior attempts to handle this shuffle may have left a bunch of
1851	// dead nodes in the DAG (such as constants). These nodes will be added
1852	// at the end of DAG's node list, which at that point had already been
1853	// sorted topologically. In the main selection loop, the node list is
1854	// traversed backwards from the root node, which means that any new
1855	// nodes (from the end of the list) will not be visited.
1856	// Scalarization will replace the shuffle node with the scalarized
1857	// expression, and if that expression reused any if the leftoever (dead)
1858	// nodes, these nodes would not be selected (since the "local" selection
1859	// only visits nodes that are not in AllNodes).
1860	// To avoid this issue, remove all dead nodes from the DAG now.
1861	// DAG.RemoveDeadNodes();
1862
1863	SmallVector<SDValue,`128`> Ops;
1864	LLVMContext &Ctx = *DAG.getContext();
1865	MVT LegalTy = Lower.getTypeToTransformTo(Context&: Ctx, VT: ElemTy).getSimpleVT();
1866	for (int I : Mask) {
1867	if (I < `0`) {
1868	Ops.push_back(Elt: ISel.selectUndef(dl, ResTy: LegalTy));
1869	continue;
1870	}
1871	SDValue Vec;
1872	unsigned M = I;
1873	if (M < VecLen) {
1874	Vec = Va;
1875	} else {
1876	Vec = Vb;
1877	M -= VecLen;
1878	}
1879	if (HavePairs) {
1880	if (M < HwLen) {
1881	Vec = DAG.getTargetExtractSubreg(SRIdx: Hexagon::vsub_lo, DL: dl, VT: SingleTy, Operand: Vec);
1882	} else {
1883	Vec = DAG.getTargetExtractSubreg(SRIdx: Hexagon::vsub_hi, DL: dl, VT: SingleTy, Operand: Vec);
1884	M -= HwLen;
1885	}
1886	}
1887	SDValue Idx = DAG.getConstant(Val: M, DL: dl, VT: MVT::i32);
1888	SDValue Ex = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: LegalTy, Ops: {Vec, Idx});
1889	SDValue L = Lower.LowerOperation(Op: Ex, DAG);
1890	assert(L.getNode());
1891	Ops.push_back(Elt: L);
1892	}
1893
1894	SDValue LV;
1895	if (`2`*HwLen == VecLen) {
1896	SDValue B0 = DAG.getBuildVector(VT: SingleTy, DL: dl, Ops: {Ops.data(), HwLen});
1897	SDValue L0 = Lower.LowerOperation(Op: B0, DAG);
1898	SDValue B1 = DAG.getBuildVector(VT: SingleTy, DL: dl, Ops: {Ops.data()+HwLen, HwLen});
1899	SDValue L1 = Lower.LowerOperation(Op: B1, DAG);
1900	// XXX CONCAT_VECTORS is legal for HVX vectors. Legalizing (lowering)
1901	// functions may expect to be called only for illegal operations, so
1902	// make sure that they are not called for legal ones. Develop a better
1903	// mechanism for dealing with this.
1904	LV = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: ResTy, Ops: {L0, L1});
1905	} else {
1906	SDValue BV = DAG.getBuildVector(VT: ResTy, DL: dl, Ops);
1907	LV = Lower.LowerOperation(Op: BV, DAG);
1908	}
1909
1910	assert(!N->use_empty());
1911	SDValue IS = DAG.getNode(Opcode: HexagonISD::ISEL, DL: dl, VT: ResTy, Operand: LV);
1912	ISel.ReplaceNode(F: N, T: IS.getNode());
1913	select(ISelN: IS.getNode());
1914	DAG.RemoveDeadNodes();
1915	return true;
1916	}
1917
1918	SmallVector<uint32_t, `8`> HvxSelector::getPerfectCompletions(ShuffleMask SM,
1919	unsigned Width) {
1920	auto possibilities = [](ArrayRef<uint8_t> Bs, unsigned Width) -> uint32_t {
1921	unsigned Impossible = ~(`1u` << Width) + `1`;
1922	for (unsigned I = `0`, E = Bs.size(); I != E; ++I) {
1923	uint8_t B = Bs [I];
1924	if (B == `0xff`)
1925	continue;
1926	if (~Impossible == `0`)
1927	break;
1928	for (unsigned Log = `0`; Log != Width; ++Log) {
1929	if (Impossible & (`1u` << Log))
1930	continue;
1931	unsigned Expected = (I >> Log) % `2`;
1932	if (B != Expected)
1933	Impossible \|= (`1u` << Log);
1934	}
1935	}
1936	return ~Impossible;
1937	};
1938
1939	SmallVector<uint32_t, `8`> Worklist(Width);
1940
1941	for (unsigned BitIdx = `0`; BitIdx != Width; ++BitIdx) {
1942	SmallVector<uint8_t> BitValues(SM.Mask.size());
1943	for (int i = `0`, e = SM.Mask.size(); i != e; ++i) {
1944	int M = SM.Mask [i];
1945	if (M < `0`)
1946	BitValues [i] = `0xff`;
1947	else
1948	BitValues [i] = (M & (`1u` << BitIdx)) != `0`;
1949	}
1950	Worklist [BitIdx] = possibilities (BitValues, Width);
1951	}
1952
1953	// If there is a word P in Worklist that matches multiple possibilities,
1954	// then if any other word Q matches any of the possibilities matched by P,
1955	// then Q matches all the possibilities matched by P. In fact, P == Q.
1956	// In other words, for each words P, Q, the sets of possibilities matched
1957	// by P and Q are either equal or disjoint (no partial overlap).
1958	//
1959	// Illustration: For 4-bit values there are 4 complete sequences:
1960	// a: 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
1961	// b: 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
1962	// c: 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1963	// d: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
1964	//
1965	// Words containing unknown bits that match two of the complete
1966	// sequences:
1967	// ab: 0 u u 1 0 u u 1 0 u u 1 0 u u 1
1968	// ac: 0 u 0 u u 1 u 1 0 u 0 u u 1 u 1
1969	// ad: 0 u 0 u 0 u 0 u u 1 u 1 u 1 u 1
1970	// bc: 0 0 u u u u 1 1 0 0 u u u u 1 1
1971	// bd: 0 0 u u 0 0 u u u u 1 1 u u 1 1
1972	// cd: 0 0 0 0 u u u u u u u u 1 1 1 1
1973	//
1974	// Proof of the claim above:
1975	// Let P be a word that matches s0 and s1. For that to happen, all known
1976	// bits in P must match s0 and s1 exactly.
1977	// Assume there is Q that matches s1. Note that since P and Q came from
1978	// the same shuffle mask, the positions of unknown bits in P and Q match
1979	// exactly, which makes the indices of known bits be exactly the same
1980	// between P and Q. Since P matches s0 and s1, the known bits of P much
1981	// match both s0 and s1. Also, since Q matches s1, the known bits in Q
1982	// are exactly the same as in s1, which means that they are exactly the
1983	// same as in P. This implies that P == Q.
1984
1985	// There can be a situation where there are more entries with the same
1986	// bits set than there are set bits (e.g. value 9 occurring more than 2
1987	// times). In such cases it will be impossible to complete this to a
1988	// perfect shuffle.
1989	SmallVector<uint32_t, `8`> Sorted(Worklist);
1990	llvm::sort(Start: Sorted.begin(), End: Sorted.end());
1991
1992	for (unsigned I = `0`, E = Sorted.size(); I != E;) {
1993	unsigned P = Sorted [I], Count = `1`;
1994	while (++I != E && P == Sorted [I])
1995	++Count;
1996	if ((unsigned)llvm::popcount(Value: P) < Count) {
1997	// Reset all occurrences of P, if there are more occurrences of P
1998	// than there are bits in P.
1999	llvm::replace(Range&: Worklist, OldValue: P, NewValue: `0U`);
2000	}
2001	}
2002
2003	return Worklist;
2004	}
2005
2006	SmallVector<uint32_t, `8`>
2007	HvxSelector::completeToPerfect(ArrayRef<uint32_t> Completions, unsigned Width) {
2008	// Pick a completion if there are multiple possibilities. For now just
2009	// select any valid completion.
2010	SmallVector<uint32_t, `8`> Comps(Completions);
2011
2012	for (unsigned I = `0`; I != Width; ++I) {
2013	uint32_t P = Comps [I];
2014	assert(P != `0`);
2015	if (isPowerOf2_32(Value: P))
2016	continue;
2017	// T = least significant bit of P.
2018	uint32_t T = P ^ ((P - `1`) & P);
2019	// Clear T in all remaining words matching P.
2020	for (unsigned J = I + `1`; J != Width; ++J) {
2021	if (Comps [J] == P)
2022	Comps [J] ^= T;
2023	}
2024	Comps [I] = T;
2025	}
2026
2027	#ifndef NDEBUG
2028	// Check that we have generated a valid completion.
2029	uint32_t OrAll = `0`;
2030	for (uint32_t C : Comps) {
2031	assert(isPowerOf2_32(C));
2032	OrAll \|= C;
2033	}
2034	assert(OrAll == (`1u` << Width) -`1`);
2035	#endif
2036
2037	return Comps;
2038	}
2039
2040	std::optional<int> HvxSelector::rotationDistance(ShuffleMask SM,
2041	unsigned WrapAt) {
2042	std::optional<int> Dist;
2043	for (int I = `0`, E = SM.Mask.size(); I != E; ++I) {
2044	int M = SM.Mask [I];
2045	if (M < `0`)
2046	continue;
2047	if (Dist) {
2048	if ((I + Dist) % static_cast<int*>(WrapAt) != M)
2049	return std::nullopt;
2050	} else {
2051	// Integer a%b operator assumes rounding towards zero by /, so it
2052	// "misbehaves" when a crosses 0 (the remainder also changes sign).
2053	// Add WrapAt in an attempt to keep I+Dist non-negative.
2054	Dist = M - I;
2055	if (Dist < `0`)
2056	Dist = *Dist + WrapAt;
2057	}
2058	}
2059	return Dist;
2060	}
2061
2062	OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
2063	ResultStack &Results) {
2064	DEBUG_WITH_TYPE("isel", { dbgs() << __func__ << `'\n'`; });
2065	if (!Va.isValid() \|\| !Vb.isValid())
2066	return OpRef::fail();
2067
2068	// Contracting shuffles, i.e. instructions that always discard some bytes
2069	// from the operand vectors.
2070	//
2071	// Funnel shifts
2072	// V6_vshuff{e,o}b
2073	// V6_vshuf{e,o}h
2074	// V6_vdealb4w
2075	// V6_vpack{e,o}{b,h}
2076
2077	int VecLen = SM.Mask.size();
2078
2079	// First, check for funnel shifts.
2080	if (auto Dist = rotationDistance(SM, WrapAt: `2` * VecLen)) {
2081	OpRef Funnel = funnels(Va, Vb, Amount: *Dist, Results);
2082	if (Funnel.isValid())
2083	return Funnel;
2084	}
2085
2086	MVT SingleTy = getSingleVT(ElemTy: MVT::i8);
2087	MVT PairTy = getPairVT(ElemTy: MVT::i8);
2088
2089	auto same = [](ArrayRef<int> Mask1, ArrayRef<int> Mask2) -> bool {
2090	return Mask1 == Mask2;
2091	};
2092
2093	using PackConfig = std::pair<unsigned, bool>;
2094	PackConfig Packs[] = {
2095	{`1`, false}, // byte, even
2096	{`1`, true}, // byte, odd
2097	{`2`, false}, // half, even
2098	{`2`, true}, // half, odd
2099	};
2100
2101	{ // Check vpack
2102	unsigned Opcodes[] = {
2103	Hexagon::V6_vpackeb,
2104	Hexagon::V6_vpackob,
2105	Hexagon::V6_vpackeh,
2106	Hexagon::V6_vpackoh,
2107	};
2108	for (int i = `0`, e = std::size(Opcodes); i != e; ++i) {
2109	auto [Size, Odd] = Packs[i];
2110	if (same (SM.Mask, shuffles::mask(S: shuffles::vpack, Length: HwLen, args: Size, args: Odd))) {
2111	Results.push(Opc: Opcodes[i], Ty: SingleTy, Ops: {Vb, Va});
2112	return OpRef::res(N: Results.top());
2113	}
2114	}
2115	}
2116
2117	{ // Check vshuff
2118	unsigned Opcodes[] = {
2119	Hexagon::V6_vshuffeb,
2120	Hexagon::V6_vshuffob,
2121	Hexagon::V6_vshufeh,
2122	Hexagon::V6_vshufoh,
2123	};
2124	for (int i = `0`, e = std::size(Opcodes); i != e; ++i) {
2125	auto [Size, Odd] = Packs[i];
2126	if (same (SM.Mask, shuffles::mask(S: shuffles::vshuff, Length: HwLen, args: Size, args: Odd))) {
2127	Results.push(Opc: Opcodes[i], Ty: SingleTy, Ops: {Vb, Va});
2128	return OpRef::res(N: Results.top());
2129	}
2130	}
2131	}
2132
2133	{ // Check vdeal
2134	// There is no "V6_vdealeb", etc, but the supposed behavior of vdealeb
2135	// is equivalent to "(V6_vpackeb (V6_vdealvdd Vu, Vv, -2))". Other such
2136	// variants of "deal" can be done similarly.
2137	unsigned Opcodes[] = {
2138	Hexagon::V6_vpackeb,
2139	Hexagon::V6_vpackob,
2140	Hexagon::V6_vpackeh,
2141	Hexagon::V6_vpackoh,
2142	};
2143	const SDLoc &dl(Results.InpNode);
2144
2145	for (int i = `0`, e = std::size(Opcodes); i != e; ++i) {
2146	auto [Size, Odd] = Packs[i];
2147	if (same (SM.Mask, shuffles::mask(S: shuffles::vdeal, Length: HwLen, args: Size, args: Odd))) {
2148	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: -`2` * Size, dl)});
2149	Results.push(Opc: Hexagon::V6_vdealvdd, Ty: PairTy, Ops: {Vb, Va, OpRef::res(N: -`1`)});
2150	auto vdeal = OpRef::res(N: Results.top());
2151	Results.push(Opc: Opcodes[i], Ty: SingleTy,
2152	Ops: {OpRef::hi(R: vdeal), OpRef::lo(R: vdeal)});
2153	return OpRef::res(N: Results.top());
2154	}
2155	}
2156	}
2157
2158	if (same (SM.Mask, shuffles::mask(S: shuffles::vdealb4w, Length: HwLen))) {
2159	Results.push(Opc: Hexagon::V6_vdealb4w, Ty: SingleTy, Ops: {Vb, Va});
2160	return OpRef::res(N: Results.top());
2161	}
2162
2163	return OpRef::fail();
2164	}
2165
2166	OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
2167	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
2168	// Expanding shuffles (using all elements and inserting into larger vector):
2169	//
2170	// V6_vunpacku{b,h} []*
2171	//
2172	// [] Only if the upper elements (filled with 0s) are "don't care" in Mask.*
2173	//
2174	// Note: V6_vunpacko{b,h} are or-ing the high byte/half in the result, so
2175	// they are not shuffles.
2176	//
2177	// The argument is a single vector.
2178
2179	int VecLen = SM.Mask.size();
2180	assert(`2`HwLen == unsigned*(VecLen) && "Expecting vector-pair type");
2181
2182	std::pair<int,unsigned> Strip = findStrip(A: SM.Mask, Inc: `1`, MaxLen: VecLen);
2183
2184	// The patterns for the unpacks, in terms of the starting offsets of the
2185	// consecutive strips (L = length of the strip, N = VecLen):
2186	//
2187	// vunpacku: 0, -1, L, -1, 2L, -1 ...
2188
2189	if (Strip.first != `0`)
2190	return OpRef::fail();
2191
2192	// The vunpackus only handle byte and half-word.
2193	if (Strip.second != `1` && Strip.second != `2`)
2194	return OpRef::fail();
2195
2196	int N = VecLen;
2197	int L = Strip.second;
2198
2199	// First, check the non-ignored strips.
2200	for (int I = `2`L; I < N; I += `2`L) {
2201	auto S = findStrip(A: SM.Mask.drop_front(N: I), Inc: `1`, MaxLen: N-I);
2202	if (S.second != unsigned(L))
2203	return OpRef::fail();
2204	if (`2`*S.first != I)
2205	return OpRef::fail();
2206	}
2207	// Check the -1s.
2208	for (int I = L; I < N; I += `2`*L) {
2209	auto S = findStrip(A: SM.Mask.drop_front(N: I), Inc: `0`, MaxLen: N-I);
2210	if (S.first != -`1` \|\| S.second != unsigned(L))
2211	return OpRef::fail();
2212	}
2213
2214	unsigned Opc = Strip.second == `1` ? Hexagon::V6_vunpackub
2215	: Hexagon::V6_vunpackuh;
2216	Results.push(Opc, Ty: getPairVT(ElemTy: MVT::i8), Ops: {Va});
2217	return OpRef::res(N: Results.top());
2218	}
2219
2220	OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
2221	DEBUG_WITH_TYPE("isel", { dbgs() << __func__ << `'\n'`; });
2222	// V6_vdeal{b,h}
2223	// V6_vshuff{b,h}
2224
2225	// V6_vshufoe{b,h} those are equivalent to vshuffvdd(..,{1,2})
2226	// V6_vshuffvdd (V6_vshuff)
2227	// V6_dealvdd (V6_vdeal)
2228
2229	int VecLen = SM.Mask.size();
2230	assert(isPowerOf2_32(VecLen) && Log2_32(VecLen) <= `8`);
2231	unsigned LogLen = Log2_32(Value: VecLen);
2232	unsigned HwLog = Log2_32(Value: HwLen);
2233	// The result length must be the same as the length of a single vector,
2234	// or a vector pair.
2235	assert(LogLen == HwLog \|\| LogLen == HwLog + `1`);
2236	bool HavePairs = LogLen == HwLog + `1`;
2237
2238	SmallVector<unsigned, `8`> Perm(LogLen);
2239
2240	// Check if this could be a perfect shuffle, or a combination of perfect
2241	// shuffles.
2242	//
2243	// Consider this permutation (using hex digits to make the ASCII diagrams
2244	// easier to read):
2245	// { 0, 8, 1, 9, 2, A, 3, B, 4, C, 5, D, 6, E, 7, F }.
2246	// This is a "deal" operation: divide the input into two halves, and
2247	// create the output by picking elements by alternating between these two
2248	// halves:
2249	// 0 1 2 3 4 5 6 7 --> 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F []*
2250	// 8 9 A B C D E F
2251	//
2252	// Aside from a few special explicit cases (V6_vdealb, etc.), HVX provides
2253	// a somwehat different mechanism that could be used to perform shuffle/
2254	// deal operations: a 2x2 transpose.
2255	// Consider the halves of inputs again, they can be interpreted as a 2x8
2256	// matrix. A 2x8 matrix can be looked at four 2x2 matrices concatenated
2257	// together. Now, when considering 2 elements at a time, it will be a 2x4
2258	// matrix (with elements 01, 23, 45, etc.), or two 2x2 matrices:
2259	// 01 23 45 67
2260	// 89 AB CD EF
2261	// With groups of 4, this will become a single 2x2 matrix, and so on.
2262	//
2263	// The 2x2 transpose instruction works by transposing each of the 2x2
2264	// matrices (or "sub-matrices"), given a specific group size. For example,
2265	// if the group size is 1 (i.e. each element is its own group), there
2266	// will be four transposes of the four 2x2 matrices that form the 2x8.
2267	// For example, with the inputs as above, the result will be:
2268	// 0 8 2 A 4 C 6 E
2269	// 1 9 3 B 5 D 7 F
2270	// Now, this result can be transposed again, but with the group size of 2:
2271	// 08 19 4C 5D
2272	// 2A 3B 6E 7F
2273	// If we then transpose that result, but with the group size of 4, we get:
2274	// 0819 2A3B
2275	// 4C5D 6E7F
2276	// If we concatenate these two rows, it will be
2277	// 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F
2278	// which is the same as the "deal" [] above.*
2279	//
2280	// In general, a "deal" of individual elements is a series of 2x2 transposes,
2281	// with changing group size. HVX has two instructions:
2282	// Vdd = V6_vdealvdd Vu, Vv, Rt
2283	// Vdd = V6_shufvdd Vu, Vv, Rt
2284	// that perform exactly that. The register Rt controls which transposes are
2285	// going to happen: a bit at position n (counting from 0) indicates that a
2286	// transpose with a group size of 2^n will take place. If multiple bits are
2287	// set, multiple transposes will happen: vdealvdd will perform them starting
2288	// with the largest group size, vshuffvdd will do them in the reverse order.
2289	//
2290	// The main observation is that each 2x2 transpose corresponds to swapping
2291	// columns of bits in the binary representation of the values.
2292	//
2293	// The numbers {3,2,1,0} and the log2 of the number of contiguous 1 bits
2294	// in a given column. The denote the columns that will be swapped.*
2295	// The transpose with the group size 2^n corresponds to swapping columns
2296	// 3 (the highest log) and log2(n):
2297	//
2298	// 3 2 1 0 0 2 1 3 0 2 3 1
2299	// * * * * * *
2300	// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2301	// 1 0 0 0 1 8 1 0 0 0 8 1 0 0 0 8 1 0 0 0
2302	// 2 0 0 1 0 2 0 0 1 0 1 0 0 0 1 1 0 0 0 1
2303	// 3 0 0 1 1 A 1 0 1 0 9 1 0 0 1 9 1 0 0 1
2304	// 4 0 1 0 0 4 0 1 0 0 4 0 1 0 0 2 0 0 1 0
2305	// 5 0 1 0 1 C 1 1 0 0 C 1 1 0 0 A 1 0 1 0
2306	// 6 0 1 1 0 6 0 1 1 0 5 0 1 0 1 3 0 0 1 1
2307	// 7 0 1 1 1 E 1 1 1 0 D 1 1 0 1 B 1 0 1 1
2308	// 8 1 0 0 0 1 0 0 0 1 2 0 0 1 0 4 0 1 0 0
2309	// 9 1 0 0 1 9 1 0 0 1 A 1 0 1 0 C 1 1 0 0
2310	// A 1 0 1 0 3 0 0 1 1 3 0 0 1 1 5 0 1 0 1
2311	// B 1 0 1 1 B 1 0 1 1 B 1 0 1 1 D 1 1 0 1
2312	// C 1 1 0 0 5 0 1 0 1 6 0 1 1 0 6 0 1 1 0
2313	// D 1 1 0 1 D 1 1 0 1 E 1 1 1 0 E 1 1 1 0
2314	// E 1 1 1 0 7 0 1 1 1 7 0 1 1 1 7 0 1 1 1
2315	// F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 F 1 1 1 1
2316
2317	// There is one special case that is not a perfect shuffle, but can be
2318	// turned into one easily: when the shuffle operates on a vector pair,
2319	// but the two vectors in the pair are swapped. The code that identifies
2320	// perfect shuffles will reject it, unless the order is reversed.
2321	shuffles::MaskT MaskStorage(SM.Mask);
2322	bool InvertedPair = false;
2323	if (HavePairs && SM.Mask [`0`] >= int(HwLen)) {
2324	for (int i = `0`, e = SM.Mask.size(); i != e; ++i) {
2325	int M = SM.Mask [i];
2326	MaskStorage [i] = M >= int(HwLen) ? M - HwLen : M + HwLen;
2327	}
2328	InvertedPair = true;
2329	SM = ShuffleMask (MaskStorage);
2330	}
2331
2332	auto Comps = getPerfectCompletions(SM, Width: LogLen);
2333	if (llvm::is_contained(Range&: Comps, Element: `0`))
2334	return OpRef::fail();
2335
2336	auto Pick = completeToPerfect(Completions: Comps, Width: LogLen);
2337	for (unsigned I = `0`; I != LogLen; ++I)
2338	Perm [I] = Log2_32(Value: Pick [I]);
2339
2340	// Once we have Perm, represent it as cycles. Denote the maximum log2
2341	// (equal to log2(VecLen)-1) as M. The cycle containing M can then be
2342	// written as (M a1 a2 a3 ... an). That cycle can be broken up into
2343	// simple swaps as (M a1)(M a2)(M a3)...(M an), with the composition
2344	// order being from left to right. Any (contiguous) segment where the
2345	// values ai, ai+1...aj are either all increasing or all decreasing,
2346	// can be implemented via a single vshuffvdd/vdealvdd respectively.
2347	//
2348	// If there is a cycle (a1 a2 ... an) that does not involve M, it can
2349	// be written as (M an)(a1 a2 ... an)(M a1). The first two cycles can
2350	// then be folded to get (M a1 a2 ... an)(M a1), and the above procedure
2351	// can be used to generate a sequence of vshuffvdd/vdealvdd.
2352	//
2353	// Example:
2354	// Assume M = 4 and consider a permutation (0 1)(2 3). It can be written
2355	// as (4 0 1)(4 0) composed with (4 2 3)(4 2), or simply
2356	// (4 0 1)(4 0)(4 2 3)(4 2).
2357	// It can then be expanded into swaps as
2358	// (4 0)(4 1)(4 0)(4 2)(4 3)(4 2),
2359	// and broken up into "increasing" segments as
2360	// [(4 0)(4 1)] [(4 0)(4 2)(4 3)] [(4 2)].
2361	// This is equivalent to
2362	// (4 0 1)(4 0 2 3)(4 2),
2363	// which can be implemented as 3 vshufvdd instructions.
2364
2365	using CycleType = SmallVector<unsigned, `8`>;
2366	std::set<CycleType> Cycles;
2367	std::set<unsigned> All;
2368
2369	for (unsigned I : Perm)
2370	All.insert(x: I);
2371
2372	// If the cycle contains LogLen-1, move it to the front of the cycle.
2373	// Otherwise, return the cycle unchanged.
2374	auto canonicalize = [LogLen](const CycleType &C) -> CycleType {
2375	unsigned LogPos, N = C.size();
2376	for (LogPos = `0`; LogPos != N; ++LogPos)
2377	if (C [LogPos] == LogLen - `1`)
2378	break;
2379	if (LogPos == N)
2380	return C;
2381
2382	CycleType NewC(C.begin() + LogPos, C.end());
2383	NewC.append(in_start: C.begin(), in_end: C.begin() + LogPos);
2384	return NewC;
2385	};
2386
2387	auto pfs = [](const std::set<CycleType> &Cs, unsigned Len) {
2388	// Ordering: shuff: 5 0 1 2 3 4, deal: 5 4 3 2 1 0 (for Log=6),
2389	// for bytes zero is included, for halfwords is not.
2390	if (Cs.size() != `1`)
2391	return `0u`;
2392	const CycleType &C = *Cs.begin();
2393	if (C [`0`] != Len - `1`)
2394	return `0u`;
2395	int D = Len - C.size();
2396	if (D != `0` && D != `1`)
2397	return `0u`;
2398
2399	bool IsDeal = true, IsShuff = true;
2400	for (unsigned I = `1`; I != Len - D; ++I) {
2401	if (C [I] != Len - `1` - I)
2402	IsDeal = false;
2403	if (C [I] != I - (`1` - D)) // I-1, I
2404	IsShuff = false;
2405	}
2406	// At most one, IsDeal or IsShuff, can be non-zero.
2407	assert(!(IsDeal \|\| IsShuff) \|\| IsDeal != IsShuff);
2408	static unsigned Deals[] = {Hexagon::V6_vdealb, Hexagon::V6_vdealh};
2409	static unsigned Shufs[] = {Hexagon::V6_vshuffb, Hexagon::V6_vshuffh};
2410	return IsDeal ? Deals[D] : (IsShuff ? Shufs[D] : `0`);
2411	};
2412
2413	while (!All.empty()) {
2414	unsigned A = *All.begin();
2415	All.erase(x: A);
2416	CycleType C;
2417	C.push_back(Elt: A);
2418	for (unsigned B = Perm [A]; B != A; B = Perm [B]) {
2419	C.push_back(Elt: B);
2420	All.erase(x: B);
2421	}
2422	if (C.size() <= `1`)
2423	continue;
2424	Cycles.insert(x: canonicalize (C));
2425	}
2426
2427	MVT SingleTy = getSingleVT(ElemTy: MVT::i8);
2428	MVT PairTy = getPairVT(ElemTy: MVT::i8);
2429
2430	// Recognize patterns for V6_vdeal{b,h} and V6_vshuff{b,h}.
2431	if (unsigned(VecLen) == HwLen) {
2432	if (unsigned SingleOpc = pfs (Cycles, LogLen)) {
2433	Results.push(Opc: SingleOpc, Ty: SingleTy, Ops: {Va});
2434	return OpRef::res(N: Results.top());
2435	}
2436	}
2437
2438	// From the cycles, construct the sequence of values that will
2439	// then form the control values for vdealvdd/vshuffvdd, i.e.
2440	// (M a1 a2)(M a3 a4 a5)... -> a1 a2 a3 a4 a5
2441	// This essentially strips the M value from the cycles where
2442	// it's present, and performs the insertion of M (then stripping)
2443	// for cycles without M (as described in an earlier comment).
2444	SmallVector<unsigned, `8`> SwapElems;
2445	// When the input is extended (i.e. single vector becomes a pair),
2446	// this is done by using an "undef" vector as the second input.
2447	// However, then we get
2448	// input 1: GOODBITS
2449	// input 2: ........
2450	// but we need
2451	// input 1: ....BITS
2452	// input 2: ....GOOD
2453	// Then at the end, this needs to be undone. To accomplish this,
2454	// artificially add "LogLen-1" at both ends of the sequence.
2455	if (!HavePairs)
2456	SwapElems.push_back(Elt: LogLen - `1`);
2457	for (const CycleType &C : Cycles) {
2458	// Do the transformation: (a1..an) -> (M a1..an)(M a1).
2459	unsigned First = (C [`0`] == LogLen - `1`) ? `1` : `0`;
2460	SwapElems.append(in_start: C.begin() + First, in_end: C.end());
2461	if (First == `0`)
2462	SwapElems.push_back(Elt: C [`0`]);
2463	}
2464	if (!HavePairs)
2465	SwapElems.push_back(Elt: LogLen - `1`);
2466
2467	const SDLoc &dl(Results.InpNode);
2468	OpRef Arg = HavePairs ? Va : concats(Lo: Va, Hi: OpRef::undef(Ty: SingleTy), Results);
2469	if (InvertedPair)
2470	Arg = concats(Lo: OpRef::hi(R: Arg), Hi: OpRef::lo(R: Arg), Results);
2471
2472	for (unsigned I = `0`, E = SwapElems.size(); I != E;) {
2473	bool IsInc = I == E - `1` \|\| SwapElems [I] < SwapElems [I + `1`];
2474	unsigned S = (`1u` << SwapElems [I]);
2475	if (I < E - `1`) {
2476	while (++I < E - `1` && IsInc == (SwapElems [I] < SwapElems [I + `1`]))
2477	S \|= `1u` << SwapElems [I];
2478	// The above loop will not add a bit for the final SwapElems[I+1],
2479	// so add it here.
2480	S \|= `1u` << SwapElems [I];
2481	}
2482	++I;
2483
2484	NodeTemplate Res;
2485	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: S, dl)});
2486	Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd;
2487	Res.Ty = PairTy;
2488	Res.Ops = {OpRef::hi(R: Arg), OpRef::lo(R: Arg), OpRef::res(N: -`1`)};
2489	Results.push(Res);
2490	Arg = OpRef::res(N: Results.top());
2491	}
2492
2493	return HavePairs ? Arg : OpRef::lo(R: Arg);
2494	}
2495
2496	OpRef HvxSelector::butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results) {
2497	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
2498	// Butterfly shuffles.
2499	//
2500	// V6_vdelta
2501	// V6_vrdelta
2502	// V6_vror
2503
2504	// The assumption here is that all elements picked by Mask are in the
2505	// first operand to the vector_shuffle. This assumption is enforced
2506	// by the caller.
2507
2508	MVT ResTy = getSingleVT(ElemTy: MVT::i8);
2509	PermNetwork::Controls FC, RC;
2510	const SDLoc &dl(Results.InpNode);
2511	int VecLen = SM.Mask.size();
2512
2513	for (int M : SM.Mask) {
2514	if (M != -`1` && M >= VecLen)
2515	return OpRef::fail();
2516	}
2517
2518	// Try the deltas/benes for both single vectors and vector pairs.
2519	ForwardDeltaNetwork FN(SM.Mask);
2520	if (FN.run(V&: FC)) {
2521	SDValue Ctl = getVectorConstant(Data: FC, dl);
2522	Results.push(Opc: Hexagon::V6_vdelta, Ty: ResTy, Ops: {Va, OpRef (Ctl)});
2523	return OpRef::res(N: Results.top());
2524	}
2525
2526	// Try reverse delta.
2527	ReverseDeltaNetwork RN(SM.Mask);
2528	if (RN.run(V&: RC)) {
2529	SDValue Ctl = getVectorConstant(Data: RC, dl);
2530	Results.push(Opc: Hexagon::V6_vrdelta, Ty: ResTy, Ops: {Va, OpRef (Ctl)});
2531	return OpRef::res(N: Results.top());
2532	}
2533
2534	// Do Benes.
2535	BenesNetwork BN(SM.Mask);
2536	if (BN.run(F&: FC, R&: RC)) {
2537	SDValue CtlF = getVectorConstant(Data: FC, dl);
2538	SDValue CtlR = getVectorConstant(Data: RC, dl);
2539	Results.push(Opc: Hexagon::V6_vdelta, Ty: ResTy, Ops: {Va, OpRef (CtlF)});
2540	Results.push(Opc: Hexagon::V6_vrdelta, Ty: ResTy,
2541	Ops: {OpRef::res(N: -`1`), OpRef (CtlR)});
2542	return OpRef::res(N: Results.top());
2543	}
2544
2545	return OpRef::fail();
2546	}
2547
2548	SDValue HvxSelector::getConst32(int Val, const SDLoc &dl) {
2549	return DAG.getTargetConstant(Val, DL: dl, VT: MVT::i32);
2550	}
2551
2552	SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
2553	const SDLoc &dl) {
2554	SmallVector<SDValue, `128`> Elems;
2555	for (uint8_t C : Data)
2556	Elems.push_back(Elt: DAG.getConstant(Val: C, DL: dl, VT: MVT::i8));
2557	MVT VecTy = MVT::getVectorVT(VT: MVT::i8, NumElements: Data.size());
2558	SDValue BV = DAG.getBuildVector(VT: VecTy, DL: dl, Ops: Elems);
2559	SDValue LV = Lower.LowerOperation(Op: BV, DAG);
2560	DAG.RemoveDeadNode(N: BV.getNode());
2561	return DAG.getNode(Opcode: HexagonISD::ISEL, DL: dl, VT: VecTy, Operand: LV);
2562	}
2563
2564	void HvxSelector::selectExtractSubvector(SDNode *N) {
2565	SDValue Inp = N->getOperand(Num: `0`);
2566	MVT ResTy = N->getValueType(ResNo: `0`).getSimpleVT();
2567	unsigned Idx = N->getConstantOperandVal(Num: `1`);
2568
2569	[[maybe_unused]] MVT InpTy = Inp.getValueType().getSimpleVT();
2570	[[maybe_unused]] unsigned ResLen = ResTy.getVectorNumElements();
2571	assert(InpTy.getVectorElementType() == ResTy.getVectorElementType());
2572	assert(`2` * ResLen == InpTy.getVectorNumElements());
2573	assert(Idx == `0` \|\| Idx == ResLen);
2574
2575	unsigned SubReg = Idx == `0` ? Hexagon::vsub_lo : Hexagon::vsub_hi;
2576	SDValue Ext = DAG.getTargetExtractSubreg(SRIdx: SubReg, DL: SDLoc (N), VT: ResTy, Operand: Inp);
2577
2578	ISel.ReplaceNode(F: N, T: Ext.getNode());
2579	}
2580
2581	void HvxSelector::selectShuffle(SDNode *N) {
2582	DEBUG_WITH_TYPE("isel", {
2583	dbgs() << "Starting " << __func__ << " on node:\n";
2584	N->dump(&DAG);
2585	});
2586	MVT ResTy = N->getValueType(ResNo: `0`).getSimpleVT();
2587	// Assume that vector shuffles operate on vectors of bytes.
2588	assert(ResTy.isVector() && ResTy.getVectorElementType() == MVT::i8);
2589
2590	auto *SN = cast<ShuffleVectorSDNode>(Val: N);
2591	std::vector<int> Mask(SN->getMask().begin(), SN->getMask().end());
2592	// This shouldn't really be necessary. Is it?
2593	for (int &Idx : Mask)
2594	if (Idx != -`1` && Idx < `0`)
2595	Idx = -`1`;
2596
2597	unsigned VecLen = Mask.size();
2598	bool HavePairs = (`2`*HwLen == VecLen);
2599	assert(ResTy.getSizeInBits() / `8` == VecLen);
2600
2601	// Vd = vector_shuffle Va, Vb, Mask
2602	//
2603
2604	bool UseLeft = false, UseRight = false;
2605	for (unsigned I = `0`; I != VecLen; ++I) {
2606	if (Mask [I] == -`1`)
2607	continue;
2608	unsigned Idx = Mask [I];
2609	assert(Idx < `2`*VecLen);
2610	if (Idx < VecLen)
2611	UseLeft = true;
2612	else
2613	UseRight = true;
2614	}
2615
2616	DEBUG_WITH_TYPE("isel", {
2617	dbgs() << "VecLen=" << VecLen << " HwLen=" << HwLen << " UseLeft="
2618	<< UseLeft << " UseRight=" << UseRight << " HavePairs="
2619	<< HavePairs << `'\n'`;
2620	});
2621	// If the mask is all -1's, generate "undef".
2622	if (!UseLeft && !UseRight) {
2623	ISel.ReplaceNode(F: N, T: ISel.selectUndef(dl: SDLoc (SN), ResTy).getNode());
2624	return;
2625	}
2626
2627	SDValue Vec0 = N->getOperand(Num: `0`);
2628	SDValue Vec1 = N->getOperand(Num: `1`);
2629	assert(Vec0.getValueType() == ResTy && Vec1.getValueType() == ResTy);
2630
2631	ResultStack Results(SN);
2632	OpRef Va = OpRef::undef(Ty: ResTy);
2633	OpRef Vb = OpRef::undef(Ty: ResTy);
2634
2635	if (!Vec0.isUndef()) {
2636	Results.push(Opc: TargetOpcode::COPY, Ty: ResTy, Ops: {Vec0});
2637	Va = OpRef::OpRef::res(N: Results.top());
2638	}
2639	if (!Vec1.isUndef()) {
2640	Results.push(Opc: TargetOpcode::COPY, Ty: ResTy, Ops: {Vec1});
2641	Vb = OpRef::res(N: Results.top());
2642	}
2643
2644	OpRef Res = !HavePairs ? shuffs2(SM: ShuffleMask (Mask), Va, Vb, Results)
2645	: shuffp2(SM: ShuffleMask (Mask), Va, Vb, Results);
2646
2647	bool Done = Res.isValid();
2648	if (Done) {
2649	// Make sure that Res is on the stack before materializing.
2650	Results.push(Opc: TargetOpcode::COPY, Ty: ResTy, Ops: {Res});
2651	materialize(Results);
2652	} else {
2653	Done = scalarizeShuffle(Mask, dl: SDLoc (N), ResTy, Va: Vec0, Vb: Vec1, N);
2654	}
2655
2656	if (!Done) {
2657	#ifndef NDEBUG
2658	dbgs() << "Unhandled shuffle:\n";
2659	SN->dumpr(&DAG);
2660	#endif
2661	llvm_unreachable("Failed to select vector shuffle");
2662	}
2663	}
2664
2665	void HvxSelector::selectRor(SDNode *N) {
2666	// If this is a rotation by less than 8, use V6_valignbi.
2667	MVT Ty = N->getValueType(ResNo: `0`).getSimpleVT();
2668	const SDLoc &dl(N);
2669	SDValue VecV = N->getOperand(Num: `0`);
2670	SDValue RotV = N->getOperand(Num: `1`);
2671	SDNode NewN = nullptr*;
2672
2673	if (auto *CN = dyn_cast<ConstantSDNode>(Val: RotV.getNode())) {
2674	unsigned S = CN->getZExtValue() % HST.getVectorLength();
2675	if (S == `0`) {
2676	NewN = VecV.getNode();
2677	} else if (isUInt<`3`>(x: S)) {
2678	NewN = DAG.getMachineNode(Opcode: Hexagon::V6_valignbi, dl, VT: Ty,
2679	Ops: {VecV, VecV, getConst32(Val: S, dl)});
2680	}
2681	}
2682
2683	if (!NewN)
2684	NewN = DAG.getMachineNode(Opcode: Hexagon::V6_vror, dl, VT: Ty, Ops: {VecV, RotV});
2685
2686	ISel.ReplaceNode(F: N, T: NewN);
2687	}
2688
2689	void HvxSelector::selectVAlign(SDNode *N) {
2690	SDValue Vv = N->getOperand(Num: `0`);
2691	SDValue Vu = N->getOperand(Num: `1`);
2692	SDValue Rt = N->getOperand(Num: `2`);
2693	SDNode *NewN = DAG.getMachineNode(Opcode: Hexagon::V6_valignb, dl: SDLoc (N),
2694	VT: N->getValueType(ResNo: `0`), Ops: {Vv, Vu, Rt});
2695	ISel.ReplaceNode(F: N, T: NewN);
2696	DAG.RemoveDeadNode(N);
2697	}
2698
2699	void HexagonDAGToDAGISel::PreprocessHvxISelDAG() {
2700	auto getNodes = [this]() -> std::vector<SDNode *> {
2701	std::vector<SDNode *> T;
2702	T.reserve(n: CurDAG->allnodes_size());
2703	for (SDNode &N : CurDAG->allnodes())
2704	T.push_back(x: &N);
2705	return T;
2706	};
2707
2708	ppHvxShuffleOfShuffle(Nodes: getNodes ());
2709	}
2710
2711	template <> struct std::hash<SDValue> {
2712	std::size_t operator()(SDValue V) const {
2713	return std::hash<const void *>()(V.getNode()) +
2714	std::hash<unsigned>()(V.getResNo());
2715	};
2716	};
2717
2718	void HexagonDAGToDAGISel::ppHvxShuffleOfShuffle(std::vector<SDNode *> &&Nodes) {
2719	// Motivating case:
2720	// t10: v64i32 = ...
2721	// t46: v128i8 = vector_shuffle<...> t44, t45
2722	// t48: v128i8 = vector_shuffle<...> t44, t45
2723	// t42: v128i8 = vector_shuffle<...> t46, t48
2724	// t12: v32i32 = extract_subvector t10, Constant:i32<0>
2725	// t44: v128i8 = bitcast t12
2726	// t15: v32i32 = extract_subvector t10, Constant:i32<32>
2727	// t45: v128i8 = bitcast t15
2728	SelectionDAG &DAG = *CurDAG;
2729	unsigned HwLen = HST->getVectorLength();
2730
2731	struct SubVectorInfo {
2732	SubVectorInfo(SDValue S, unsigned H) : Src (S), HalfIdx(H) {}
2733	SDValue Src;
2734	unsigned HalfIdx;
2735	};
2736
2737	using MapType = std::unordered_map<SDValue, unsigned>;
2738
2739	auto getMaskElt = [&](unsigned Idx, ShuffleVectorSDNode *Shuff0,
2740	ShuffleVectorSDNode *Shuff1,
2741	const MapType &OpMap) -> int {
2742	// Treat Shuff0 and Shuff1 as operands to another vector shuffle, and
2743	// Idx as a (non-undef) element of the top level shuffle's mask, that
2744	// is, index into concat(Shuff0, Shuff1).
2745	// Assuming that Shuff0 and Shuff1 both operate on subvectors of the
2746	// same source vector (as described by OpMap), return the index of
2747	// that source vector corresponding to Idx.
2748	ShuffleVectorSDNode *OpShuff = Idx < HwLen ? Shuff0 : Shuff1;
2749	if (Idx >= HwLen)
2750	Idx -= HwLen;
2751
2752	// Get the mask index that M points at in the corresponding operand.
2753	int MaybeN = OpShuff->getMaskElt(Idx);
2754	if (MaybeN < `0`)
2755	return -`1`;
2756
2757	auto N = static_cast<unsigned>(MaybeN);
2758	unsigned SrcBase = N < HwLen ? OpMap.at(k: OpShuff->getOperand(Num: `0`))
2759	: OpMap.at(k: OpShuff->getOperand(Num: `1`));
2760	if (N >= HwLen)
2761	N -= HwLen;
2762
2763	return N + SrcBase;
2764	};
2765
2766	auto fold3 = [&](SDValue TopShuff, SDValue Inp, MapType &&OpMap) -> SDValue {
2767	// Fold all 3 shuffles into a single one.
2768	auto *This = cast<ShuffleVectorSDNode>(Val&: TopShuff);
2769	auto *S0 = cast<ShuffleVectorSDNode>(Val: TopShuff.getOperand(i: `0`));
2770	auto *S1 = cast<ShuffleVectorSDNode>(Val: TopShuff.getOperand(i: `1`));
2771	ArrayRef<int> TopMask = This->getMask();
2772	// This should be guaranteed by type checks in the caller, and the fact
2773	// that all shuffles should have been promoted to operate on MVT::i8.
2774	assert(TopMask.size() == S0->getMask().size() &&
2775	TopMask.size() == S1->getMask().size());
2776	assert(TopMask.size() == HwLen);
2777
2778	SmallVector<int, `256`> FoldedMask(`2` * HwLen);
2779	for (unsigned I = `0`; I != HwLen; ++I) {
2780	int MaybeM = TopMask [I];
2781	if (MaybeM >= `0`) {
2782	FoldedMask [I] =
2783	getMaskElt (static_cast<unsigned>(MaybeM), S0, S1, OpMap);
2784	} else {
2785	FoldedMask [I] = -`1`;
2786	}
2787	}
2788	// The second half of the result will be all-undef.
2789	std::fill(first: FoldedMask.begin() + HwLen, last: FoldedMask.end(), value: -`1`);
2790
2791	// Return
2792	// FoldedShuffle = (Shuffle Inp, undef, FoldedMask)
2793	// (LoHalf FoldedShuffle)
2794	const SDLoc &dl(TopShuff);
2795	MVT SingleTy = MVT::getVectorVT(VT: MVT::i8, NumElements: HwLen);
2796	MVT PairTy = MVT::getVectorVT(VT: MVT::i8, NumElements: `2` * HwLen);
2797	SDValue FoldedShuff =
2798	DAG.getVectorShuffle(VT: PairTy, dl, N1: DAG.getBitcast(VT: PairTy, V: Inp),
2799	N2: DAG.getUNDEF(VT: PairTy), Mask: FoldedMask);
2800	return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SingleTy, N1: FoldedShuff,
2801	N2: DAG.getConstant(Val: `0`, DL: dl, VT: MVT::i32));
2802	};
2803
2804	auto getSourceInfo = [](SDValue V) -> std::optional<SubVectorInfo> {
2805	while (V.getOpcode() == ISD::BITCAST)
2806	V = V.getOperand(i: `0`);
2807	if (V.getOpcode() != ISD::EXTRACT_SUBVECTOR)
2808	return std::nullopt;
2809	return SubVectorInfo(V.getOperand(i: `0`),
2810	!cast<ConstantSDNode>(Val: V.getOperand(i: `1`))->isZero());
2811	};
2812
2813	for (SDNode *N : Nodes) {
2814	if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
2815	continue;
2816	EVT ResTy = N->getValueType(ResNo: `0`);
2817	if (ResTy.getVectorElementType() != MVT::i8)
2818	continue;
2819	if (ResTy.getVectorNumElements() != HwLen)
2820	continue;
2821
2822	SDValue V0 = N->getOperand(Num: `0`);
2823	SDValue V1 = N->getOperand(Num: `1`);
2824	if (V0.getOpcode() != ISD::VECTOR_SHUFFLE)
2825	continue;
2826	if (V1.getOpcode() != ISD::VECTOR_SHUFFLE)
2827	continue;
2828	if (V0.getValueType() != ResTy \|\| V1.getValueType() != ResTy)
2829	continue;
2830
2831	// Check if all operands of the two operand shuffles are extract_subvectors
2832	// from the same vector pair.
2833	auto V0A = getSourceInfo (V0.getOperand(i: `0`));
2834	if (!V0A.has_value())
2835	continue;
2836	auto V0B = getSourceInfo (V0.getOperand(i: `1`));
2837	if (!V0B.has_value() \|\| V0B ->Src != V0A ->Src)
2838	continue;
2839	auto V1A = getSourceInfo (V1.getOperand(i: `0`));
2840	if (!V1A.has_value() \|\| V1A ->Src != V0A ->Src)
2841	continue;
2842	auto V1B = getSourceInfo (V1.getOperand(i: `1`));
2843	if (!V1B.has_value() \|\| V1B ->Src != V0A ->Src)
2844	continue;
2845
2846	// The source must be a pair. This should be guaranteed here,
2847	// but check just in case.
2848	assert(V0A->Src.getValueType().getSizeInBits() == `16` * HwLen);
2849
2850	MapType OpMap = {
2851	{V0.getOperand(i: `0`), V0A ->HalfIdx * HwLen},
2852	{V0.getOperand(i: `1`), V0B ->HalfIdx * HwLen},
2853	{V1.getOperand(i: `0`), V1A ->HalfIdx * HwLen},
2854	{V1.getOperand(i: `1`), V1B ->HalfIdx * HwLen},
2855	};
2856	SDValue NewS = fold3 (SDValue (N, `0`), V0A ->Src, std::move(OpMap));
2857	ReplaceNode(F: N, T: NewS.getNode());
2858	}
2859	}
2860
2861	void HexagonDAGToDAGISel::SelectHvxExtractSubvector(SDNode *N) {
2862	HvxSelector (*this, *CurDAG).selectExtractSubvector(N);
2863	}
2864
2865	void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) {
2866	HvxSelector (*this, *CurDAG).selectShuffle(N);
2867	}
2868
2869	void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) {
2870	HvxSelector (*this, *CurDAG).selectRor(N);
2871	}
2872
2873	void HexagonDAGToDAGISel::SelectHvxVAlign(SDNode *N) {
2874	HvxSelector (*this, *CurDAG).selectVAlign(N);
2875	}
2876
2877	void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
2878	const SDLoc &dl(N);
2879	SDValue Chain = N->getOperand(Num: `0`);
2880	SDValue Address = N->getOperand(Num: `2`);
2881	SDValue Predicate = N->getOperand(Num: `3`);
2882	SDValue Base = N->getOperand(Num: `4`);
2883	SDValue Modifier = N->getOperand(Num: `5`);
2884	SDValue Offset = N->getOperand(Num: `6`);
2885	SDValue ImmOperand = CurDAG->getTargetConstant(Val: `0`, DL: dl, VT: MVT::i32);
2886
2887	unsigned Opcode;
2888	unsigned IntNo = N->getConstantOperandVal(Num: `1`);
2889	switch (IntNo) {
2890	default:
2891	llvm_unreachable("Unexpected HVX gather intrinsic.");
2892	case Intrinsic::hexagon_V6_vgathermhq:
2893	case Intrinsic::hexagon_V6_vgathermhq_128B:
2894	Opcode = Hexagon::V6_vgathermhq_pseudo;
2895	break;
2896	case Intrinsic::hexagon_V6_vgathermwq:
2897	case Intrinsic::hexagon_V6_vgathermwq_128B:
2898	Opcode = Hexagon::V6_vgathermwq_pseudo;
2899	break;
2900	case Intrinsic::hexagon_V6_vgathermhwq:
2901	case Intrinsic::hexagon_V6_vgathermhwq_128B:
2902	Opcode = Hexagon::V6_vgathermhwq_pseudo;
2903	break;
2904	}
2905
2906	SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
2907	SDValue Ops[] = { Address, ImmOperand,
2908	Predicate, Base, Modifier, Offset, Chain };
2909	SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
2910
2911	MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(Val: N)->getMemOperand();
2912	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Result), NewMemRefs: {MemOp});
2913
2914	ReplaceNode(F: N, T: Result);
2915	}
2916
2917	void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
2918	const SDLoc &dl(N);
2919	SDValue Chain = N->getOperand(Num: `0`);
2920	SDValue Address = N->getOperand(Num: `2`);
2921	SDValue Base = N->getOperand(Num: `3`);
2922	SDValue Modifier = N->getOperand(Num: `4`);
2923	SDValue Offset = N->getOperand(Num: `5`);
2924	SDValue ImmOperand = CurDAG->getTargetConstant(Val: `0`, DL: dl, VT: MVT::i32);
2925
2926	unsigned Opcode;
2927	unsigned IntNo = N->getConstantOperandVal(Num: `1`);
2928	switch (IntNo) {
2929	default:
2930	llvm_unreachable("Unexpected HVX gather intrinsic.");
2931	case Intrinsic::hexagon_V6_vgathermh:
2932	case Intrinsic::hexagon_V6_vgathermh_128B:
2933	Opcode = Hexagon::V6_vgathermh_pseudo;
2934	break;
2935	case Intrinsic::hexagon_V6_vgathermw:
2936	case Intrinsic::hexagon_V6_vgathermw_128B:
2937	Opcode = Hexagon::V6_vgathermw_pseudo;
2938	break;
2939	case Intrinsic::hexagon_V6_vgathermhw:
2940	case Intrinsic::hexagon_V6_vgathermhw_128B:
2941	Opcode = Hexagon::V6_vgathermhw_pseudo;
2942	break;
2943	}
2944
2945	SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
2946	SDValue Ops[] = { Address, ImmOperand, Base, Modifier, Offset, Chain };
2947	SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
2948
2949	MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(Val: N)->getMemOperand();
2950	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Result), NewMemRefs: {MemOp});
2951
2952	ReplaceNode(F: N, T: Result);
2953	}
2954
2955	void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
2956	unsigned IID = N->getConstantOperandVal(Num: `0`);
2957	SDNode *Result;
2958	switch (IID) {
2959	case Intrinsic::hexagon_V6_vaddcarry: {
2960	std::array<SDValue, `3`> Ops = {
2961	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2962	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v16i32, VT2: MVT::v64i1);
2963	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vaddcarry, dl: SDLoc (N), VTs, Ops);
2964	break;
2965	}
2966	case Intrinsic::hexagon_V6_vaddcarry_128B: {
2967	std::array<SDValue, `3`> Ops = {
2968	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2969	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v32i32, VT2: MVT::v128i1);
2970	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vaddcarry, dl: SDLoc (N), VTs, Ops);
2971	break;
2972	}
2973	case Intrinsic::hexagon_V6_vsubcarry: {
2974	std::array<SDValue, `3`> Ops = {
2975	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2976	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v16i32, VT2: MVT::v64i1);
2977	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vsubcarry, dl: SDLoc (N), VTs, Ops);
2978	break;
2979	}
2980	case Intrinsic::hexagon_V6_vsubcarry_128B: {
2981	std::array<SDValue, `3`> Ops = {
2982	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2983	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v32i32, VT2: MVT::v128i1);
2984	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vsubcarry, dl: SDLoc (N), VTs, Ops);
2985	break;
2986	}
2987	default:
2988	llvm_unreachable("Unexpected HVX dual output intrinsic.");
2989	}
2990	ReplaceUses(F: N, T: Result);
2991	ReplaceUses(F: SDValue (N, `0`), T: SDValue (Result, `0`));
2992	ReplaceUses(F: SDValue (N, `1`), T: SDValue (Result, `1`));
2993	CurDAG->RemoveDeadNode(N);
2994	}
2995

Browse the source code of llvm_projects/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp