HexagonISelDAGToDAGHVX.cpp source code [llvm_projects/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp]

1	//===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "Hexagon.h"
10	#include "HexagonISelDAGToDAG.h"
11	#include "HexagonISelLowering.h"
12	#include "HexagonTargetMachine.h"
13	#include "llvm/ADT/BitVector.h"
14	#include "llvm/ADT/SetVector.h"
15	#include "llvm/CodeGen/MachineInstrBuilder.h"
16	#include "llvm/CodeGen/SelectionDAGISel.h"
17	#include "llvm/IR/Intrinsics.h"
18	#include "llvm/IR/IntrinsicsHexagon.h"
19	#include "llvm/Support/CommandLine.h"
20	#include "llvm/Support/Debug.h"
21	#include "llvm/Support/MathExtras.h"
22
23	#include <algorithm>
24	#include <cmath>
25	#include <deque>
26	#include <functional>
27	#include <map>
28	#include <optional>
29	#include <set>
30	#include <unordered_map>
31	#include <utility>
32	#include <vector>
33
34	#define DEBUG_TYPE "hexagon-isel"
35	using namespace llvm;
36
37	namespace {
38
39	// --------------------------------------------------------------------
40	// Implementation of permutation networks.
41
42	// Implementation of the node routing through butterfly networks:
43	// - Forward delta.
44	// - Reverse delta.
45	// - Benes.
46	//
47	//
48	// Forward delta network consists of log(N) steps, where N is the number
49	// of inputs. In each step, an input can stay in place, or it can get
50	// routed to another position[1]. The step after that consists of two
51	// networks, each half in size in terms of the number of nodes. In those
52	// terms, in the given step, an input can go to either the upper or the
53	// lower network in the next step.
54	//
55	// [1] Hexagon's vdelta/vrdelta allow an element to be routed to both
56	// positions as long as there is no conflict.
57
58	// Here's a delta network for 8 inputs, only the switching routes are
59	// shown:
60	//
61	// Steps:
62	// \|- 1 ---------------\|- 2 -----\|- 3 -\|
63	//
64	// Inp[0] * * * Out[0]*
65	// \ / \ / \ /
66	// \ / \ / X
67	// \ / \ / / \
68	// Inp[1] \ / * X * * Out[1]*
69	// \ \ / / \ / \ /
70	// \ \ / / X X
71	// \ \ / / / \ / \
72	// Inp[2] \ \ / / * X * * Out[2]*
73	// \ \ X / / / \ \ /
74	// \ \ / \ / / / \ X
75	// \ X X / / \ / \
76	// Inp[3] \ / \ / \ / * * * Out[3]*
77	// \ X X X /
78	// \ / \ / \ / \ /
79	// X X X X
80	// / \ / \ / \ / \
81	// / X X X \
82	// Inp[4] / \ / \ / \ * * * Out[4]*
83	// / X X \ \ / \ /
84	// / / \ / \ \ \ / X
85	// / / X \ \ \ / / \
86	// Inp[5] / / \ \ * X * * Out[5]*
87	// / / \ \ \ / \ /
88	// / / \ \ X X
89	// / / \ \ / \ / \
90	// Inp[6] / \ * X * * Out[6]*
91	// / \ / \ \ /
92	// / \ / \ X
93	// / \ / \ / \
94	// Inp[7] * * * Out[7]*
95	//
96	//
97	// Reverse delta network is same as delta network, with the steps in
98	// the opposite order.
99	//
100	//
101	// Benes network is a forward delta network immediately followed by
102	// a reverse delta network.
103
104	enum class ColorKind { None, Red, Black };
105
106	// Graph coloring utility used to partition nodes into two groups:
107	// they will correspond to nodes routed to the upper and lower networks.
108	struct Coloring {
109	using Node = int;
110	using MapType = std::map<Node, ColorKind>;
111	static constexpr Node Ignore = Node(-`1`);
112
113	Coloring(ArrayRef<Node> Ord) : Order (Ord) {
114	build();
115	if (!color())
116	Colors.clear();
117	}
118
119	const MapType &colors() const {
120	return Colors;
121	}
122
123	ColorKind other(ColorKind Color) {
124	if (Color == ColorKind::None)
125	return ColorKind::Red;
126	return Color == ColorKind::Red ? ColorKind::Black : ColorKind::Red;
127	}
128
129	LLVM_DUMP_METHOD void dump() const;
130
131	private:
132	ArrayRef<Node> Order;
133	MapType Colors;
134	std::set<Node> Needed;
135
136	using NodeSet = std::set<Node>;
137	std::map<Node,NodeSet> Edges;
138
139	Node conj(Node Pos) {
140	Node Num = Order.size();
141	return (Pos < Num/`2`) ? Pos + Num/`2` : Pos - Num/`2`;
142	}
143
144	ColorKind getColor(Node N) {
145	auto F = Colors.find(x: N);
146	return F != Colors.end() ? F ->second : ColorKind::None;
147	}
148
149	std::pair<bool, ColorKind> getUniqueColor(const NodeSet &Nodes);
150
151	void build();
152	bool color();
153	};
154	} // namespace
155
156	std::pair<bool, ColorKind> Coloring::getUniqueColor(const NodeSet &Nodes) {
157	auto Color = ColorKind::None;
158	for (Node N : Nodes) {
159	ColorKind ColorN = getColor(N);
160	if (ColorN == ColorKind::None)
161	continue;
162	if (Color == ColorKind::None)
163	Color = ColorN;
164	else if (Color != ColorKind::None && Color != ColorN)
165	return { false, ColorKind::None };
166	}
167	return { true, Color };
168	}
169
170	void Coloring::build() {
171	// Add Order[P] and Order[conj(P)] to Edges.
172	for (unsigned P = `0`; P != Order.size(); ++P) {
173	Node I = Order [P];
174	if (I != Ignore) {
175	Needed.insert(x: I);
176	Node PC = Order [conj(Pos: P)];
177	if (PC != Ignore && PC != I)
178	Edges [I].insert(x: PC);
179	}
180	}
181	// Add I and conj(I) to Edges.
182	for (unsigned I = `0`; I != Order.size(); ++I) {
183	if (!Needed.count(x: I))
184	continue;
185	Node C = conj(Pos: I);
186	// This will create an entry in the edge table, even if I is not
187	// connected to any other node. This is necessary, because it still
188	// needs to be colored.
189	NodeSet &Is = Edges [I];
190	if (Needed.count(x: C))
191	Is.insert(x: C);
192	}
193	}
194
195	bool Coloring::color() {
196	SetVector<Node> FirstQ;
197	auto Enqueue = [this,&FirstQ] (Node N) {
198	SetVector<Node> Q;
199	Q.insert(X: N);
200	for (unsigned I = `0`; I != Q.size(); ++I) {
201	NodeSet &Ns = Edges [Q [I]];
202	Q.insert(Start: Ns.begin(), End: Ns.end());
203	}
204	FirstQ.insert(Start: Q.begin(), End: Q.end());
205	};
206	for (Node N : Needed)
207	Enqueue (N);
208
209	for (Node N : FirstQ) {
210	if (Colors.count(x: N))
211	continue;
212	NodeSet &Ns = Edges [N];
213	auto P = getUniqueColor(Nodes: Ns);
214	if (!P.first)
215	return false;
216	Colors [N] = other(Color: P.second);
217	}
218
219	// First, color nodes that don't have any dups.
220	for (auto E : Edges) {
221	Node N = E.first;
222	if (!Needed.count(x: conj(Pos: N)) \|\| Colors.count(x: N))
223	continue;
224	auto P = getUniqueColor(Nodes: E.second);
225	if (!P.first)
226	return false;
227	Colors [N] = other(Color: P.second);
228	}
229
230	// Now, nodes that are still uncolored. Since the graph can be modified
231	// in this step, create a work queue.
232	std::vector<Node> WorkQ;
233	for (auto E : Edges) {
234	Node N = E.first;
235	if (!Colors.count(x: N))
236	WorkQ.push_back(x: N);
237	}
238
239	for (Node N : WorkQ) {
240	NodeSet &Ns = Edges [N];
241	auto P = getUniqueColor(Nodes: Ns);
242	if (P.first) {
243	Colors [N] = other(Color: P.second);
244	continue;
245	}
246
247	// Coloring failed. Split this node.
248	Node C = conj(Pos: N);
249	ColorKind ColorN = other(Color: ColorKind::None);
250	ColorKind ColorC = other(Color: ColorN);
251	NodeSet &Cs = Edges [C];
252	NodeSet CopyNs = Ns;
253	for (Node M : CopyNs) {
254	ColorKind ColorM = getColor(N: M);
255	if (ColorM == ColorC) {
256	// Connect M with C, disconnect M from N.
257	Cs.insert(x: M);
258	Edges [M].insert(x: C);
259	Ns.erase(x: M);
260	Edges [M].erase(x: N);
261	}
262	}
263	Colors [N] = ColorN;
264	Colors [C] = ColorC;
265	}
266
267	// Explicitly assign "None" to all uncolored nodes.
268	for (unsigned I = `0`; I != Order.size(); ++I)
269	if (Colors.count(x: I) == `0`)
270	Colors [I] = ColorKind::None;
271
272	return true;
273	}
274
275	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
276	void Coloring::dump() const {
277	dbgs() << "{ Order: {";
278	for (Node P : Order) {
279	if (P != Ignore)
280	dbgs() << `' '` << P;
281	else
282	dbgs() << " -";
283	}
284	dbgs() << " }\n";
285	dbgs() << " Needed: {";
286	for (Node N : Needed)
287	dbgs() << `' '` << N;
288	dbgs() << " }\n";
289
290	dbgs() << " Edges: {\n";
291	for (auto E : Edges) {
292	dbgs() << " " << E.first << " -> {";
293	for (auto N : E.second)
294	dbgs() << `' '` << N;
295	dbgs() << " }\n";
296	}
297	dbgs() << " }\n";
298
299	auto ColorKindToName = [](ColorKind C) {
300	switch (C) {
301	case ColorKind::None:
302	return "None";
303	case ColorKind::Red:
304	return "Red";
305	case ColorKind::Black:
306	return "Black";
307	}
308	llvm_unreachable("all ColorKinds should be handled by the switch above");
309	};
310
311	dbgs() << " Colors: {\n";
312	for (auto C : Colors)
313	dbgs() << " " << C.first << " -> " << ColorKindToName(C.second) << "\n";
314	dbgs() << " }\n}\n";
315	}
316	#endif
317
318	namespace {
319	// Base class of for reordering networks. They don't strictly need to be
320	// permutations, as outputs with repeated occurrences of an input element
321	// are allowed.
322	struct PermNetwork {
323	using Controls = std::vector<uint8_t>;
324	using ElemType = int;
325	static constexpr ElemType Ignore = ElemType(-`1`);
326
327	enum : uint8_t {
328	None,
329	Pass,
330	Switch
331	};
332	enum : uint8_t {
333	Forward,
334	Reverse
335	};
336
337	PermNetwork(ArrayRef<ElemType> Ord, unsigned Mult = `1`) {
338	Order.assign(first: Ord.data(), last: Ord.data()+Ord.size());
339	Log = `0`;
340
341	unsigned S = Order.size();
342	while (S >>= `1`)
343	++Log;
344
345	Table.resize(new_size: Order.size());
346	for (RowType &Row : Table)
347	Row.resize(new_size: Mult*Log, x: None);
348	}
349
350	void getControls(Controls &V, unsigned StartAt, uint8_t Dir) const {
351	unsigned Size = Order.size();
352	V.resize(new_size: Size);
353	for (unsigned I = `0`; I != Size; ++I) {
354	unsigned W = `0`;
355	for (unsigned L = `0`; L != Log; ++L) {
356	unsigned C = ctl(Pos: I, Step: StartAt+L) == Switch;
357	if (Dir == Forward)
358	W \|= C << (Log-`1`-L);
359	else
360	W \|= C << L;
361	}
362	assert(isUInt<`8`>(W));
363	V [I] = uint8_t(W);
364	}
365	}
366
367	uint8_t ctl(ElemType Pos, unsigned Step) const {
368	return Table [Pos][Step];
369	}
370	unsigned size() const {
371	return Order.size();
372	}
373	unsigned steps() const {
374	return Log;
375	}
376
377	protected:
378	unsigned Log;
379	std::vector<ElemType> Order;
380	using RowType = std::vector<uint8_t>;
381	std::vector<RowType> Table;
382	};
383
384	struct ForwardDeltaNetwork : public PermNetwork {
385	ForwardDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork (Ord) {}
386
387	bool run(Controls &V) {
388	if (!route(P: Order.data(), T: Table.data(), Size: size(), Step: `0`))
389	return false;
390	getControls(V, StartAt: `0`, Dir: Forward);
391	return true;
392	}
393
394	private:
395	bool route(ElemType P, RowType T, unsigned Size, unsigned Step);
396	};
397
398	struct ReverseDeltaNetwork : public PermNetwork {
399	ReverseDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork (Ord) {}
400
401	bool run(Controls &V) {
402	if (!route(P: Order.data(), T: Table.data(), Size: size(), Step: `0`))
403	return false;
404	getControls(V, StartAt: `0`, Dir: Reverse);
405	return true;
406	}
407
408	private:
409	bool route(ElemType P, RowType T, unsigned Size, unsigned Step);
410	};
411
412	struct BenesNetwork : public PermNetwork {
413	BenesNetwork(ArrayRef<ElemType> Ord) : PermNetwork (Ord, `2`) {}
414
415	bool run(Controls &F, Controls &R) {
416	if (!route(P: Order.data(), T: Table.data(), Size: size(), Step: `0`))
417	return false;
418
419	getControls(V&: F, StartAt: `0`, Dir: Forward);
420	getControls(V&: R, StartAt: Log, Dir: Reverse);
421	return true;
422	}
423
424	private:
425	bool route(ElemType P, RowType T, unsigned Size, unsigned Step);
426	};
427	} // namespace
428
429	bool ForwardDeltaNetwork::route(ElemType P, RowType T, unsigned Size,
430	unsigned Step) {
431	bool UseUp = false, UseDown = false;
432	ElemType Num = Size;
433
434	// Cannot use coloring here, because coloring is used to determine
435	// the "big" switch, i.e. the one that changes halves, and in a forward
436	// network, a color can be simultaneously routed to both halves in the
437	// step we're working on.
438	for (ElemType J = `0`; J != Num; ++J) {
439	ElemType I = P[J];
440	// I is the position in the input,
441	// J is the position in the output.
442	if (I == Ignore)
443	continue;
444	uint8_t S;
445	if (I < Num/`2`)
446	S = (J < Num/`2`) ? Pass : Switch;
447	else
448	S = (J < Num/`2`) ? Switch : Pass;
449
450	// U is the element in the table that needs to be updated.
451	ElemType U = (S == Pass) ? I : (I < Num/`2` ? I+Num/`2` : I-Num/`2`);
452	if (U < Num/`2`)
453	UseUp = true;
454	else
455	UseDown = true;
456	if (T[U][Step] != S && T[U][Step] != None)
457	return false;
458	T[U][Step] = S;
459	}
460
461	for (ElemType J = `0`; J != Num; ++J)
462	if (P[J] != Ignore && P[J] >= Num/`2`)
463	P[J] -= Num/`2`;
464
465	if (Step+`1` < Log) {
466	if (UseUp && !route(P, T, Size: Size/`2`, Step: Step+`1`))
467	return false;
468	if (UseDown && !route(P: P+Size/`2`, T: T+Size/`2`, Size: Size/`2`, Step: Step+`1`))
469	return false;
470	}
471	return true;
472	}
473
474	bool ReverseDeltaNetwork::route(ElemType P, RowType T, unsigned Size,
475	unsigned Step) {
476	unsigned Pets = Log-`1` - Step;
477	bool UseUp = false, UseDown = false;
478	ElemType Num = Size;
479
480	// In this step half-switching occurs, so coloring can be used.
481	Coloring G({P,Size});
482	const Coloring::MapType &M = G.colors();
483	if (M.empty())
484	return false;
485
486	ColorKind ColorUp = ColorKind::None;
487	for (ElemType J = `0`; J != Num; ++J) {
488	ElemType I = P[J];
489	// I is the position in the input,
490	// J is the position in the output.
491	if (I == Ignore)
492	continue;
493	ColorKind C = M.at(k: I);
494	if (C == ColorKind::None)
495	continue;
496	// During "Step", inputs cannot switch halves, so if the "up" color
497	// is still unknown, make sure that it is selected in such a way that
498	// "I" will stay in the same half.
499	bool InpUp = I < Num/`2`;
500	if (ColorUp == ColorKind::None)
501	ColorUp = InpUp ? C : G.other(Color: C);
502	if ((C == ColorUp) != InpUp) {
503	// If I should go to a different half than where is it now, give up.
504	return false;
505	}
506
507	uint8_t S;
508	if (InpUp) {
509	S = (J < Num/`2`) ? Pass : Switch;
510	UseUp = true;
511	} else {
512	S = (J < Num/`2`) ? Switch : Pass;
513	UseDown = true;
514	}
515	T[J][Pets] = S;
516	}
517
518	// Reorder the working permutation according to the computed switch table
519	// for the last step (i.e. Pets).
520	for (ElemType J = `0`, E = Size / `2`; J != E; ++J) {
521	ElemType PJ = P[J]; // Current values of P[J]
522	ElemType PC = P[J+Size/`2`]; // and P[conj(J)]
523	ElemType QJ = PJ; // New values of P[J]
524	ElemType QC = PC; // and P[conj(J)]
525	if (T[J][Pets] == Switch)
526	QC = PJ;
527	if (T[J+Size/`2`][Pets] == Switch)
528	QJ = PC;
529	P[J] = QJ;
530	P[J+Size/`2`] = QC;
531	}
532
533	for (ElemType J = `0`; J != Num; ++J)
534	if (P[J] != Ignore && P[J] >= Num/`2`)
535	P[J] -= Num/`2`;
536
537	if (Step+`1` < Log) {
538	if (UseUp && !route(P, T, Size: Size/`2`, Step: Step+`1`))
539	return false;
540	if (UseDown && !route(P: P+Size/`2`, T: T+Size/`2`, Size: Size/`2`, Step: Step+`1`))
541	return false;
542	}
543	return true;
544	}
545
546	bool BenesNetwork::route(ElemType P, RowType T, unsigned Size,
547	unsigned Step) {
548	Coloring G({P,Size});
549	const Coloring::MapType &M = G.colors();
550	if (M.empty())
551	return false;
552	ElemType Num = Size;
553
554	unsigned Pets = `2`*Log-`1` - Step;
555	bool UseUp = false, UseDown = false;
556
557	// Both assignments, i.e. Red->Up and Red->Down are valid, but they will
558	// result in different controls. Let's pick the one where the first
559	// control will be "Pass".
560	ColorKind ColorUp = ColorKind::None;
561	for (ElemType J = `0`; J != Num; ++J) {
562	ElemType I = P[J];
563	if (I == Ignore)
564	continue;
565	ColorKind C = M.at(k: I);
566	if (C == ColorKind::None)
567	continue;
568	if (ColorUp == ColorKind::None) {
569	ColorUp = (I < Num / `2`) ? ColorKind::Red : ColorKind::Black;
570	}
571	unsigned CI = (I < Num/`2`) ? I+Num/`2` : I-Num/`2`;
572	if (C == ColorUp) {
573	if (I < Num/`2`)
574	T[I][Step] = Pass;
575	else
576	T[CI][Step] = Switch;
577	T[J][Pets] = (J < Num/`2`) ? Pass : Switch;
578	UseUp = true;
579	} else { // Down
580	if (I < Num/`2`)
581	T[CI][Step] = Switch;
582	else
583	T[I][Step] = Pass;
584	T[J][Pets] = (J < Num/`2`) ? Switch : Pass;
585	UseDown = true;
586	}
587	}
588
589	// Reorder the working permutation according to the computed switch table
590	// for the last step (i.e. Pets).
591	for (ElemType J = `0`; J != Num/`2`; ++J) {
592	ElemType PJ = P[J]; // Current values of P[J]
593	ElemType PC = P[J+Num/`2`]; // and P[conj(J)]
594	ElemType QJ = PJ; // New values of P[J]
595	ElemType QC = PC; // and P[conj(J)]
596	if (T[J][Pets] == Switch)
597	QC = PJ;
598	if (T[J+Num/`2`][Pets] == Switch)
599	QJ = PC;
600	P[J] = QJ;
601	P[J+Num/`2`] = QC;
602	}
603
604	for (ElemType J = `0`; J != Num; ++J)
605	if (P[J] != Ignore && P[J] >= Num/`2`)
606	P[J] -= Num/`2`;
607
608	if (Step+`1` < Log) {
609	if (UseUp && !route(P, T, Size: Size/`2`, Step: Step+`1`))
610	return false;
611	if (UseDown && !route(P: P+Size/`2`, T: T+Size/`2`, Size: Size/`2`, Step: Step+`1`))
612	return false;
613	}
614	return true;
615	}
616
617	// --------------------------------------------------------------------
618	// Support for building selection results (output instructions that are
619	// parts of the final selection).
620
621	namespace {
622	struct OpRef {
623	OpRef(SDValue V) : OpV (V) {}
624	bool isValue() const { return OpV.getNode() != nullptr; }
625	bool isValid() const { return isValue() \|\| !(OpN & Invalid); }
626	bool isUndef() const { return OpN & Undef; }
627	static OpRef res(int N) { return OpRef (Whole \| (N & Index)); }
628	static OpRef fail() { return OpRef (Invalid); }
629
630	static OpRef lo(const OpRef &R) {
631	assert(!R.isValue());
632	return OpRef (R.OpN & (Undef \| Index \| LoHalf));
633	}
634	static OpRef hi(const OpRef &R) {
635	assert(!R.isValue());
636	return OpRef (R.OpN & (Undef \| Index \| HiHalf));
637	}
638	static OpRef undef(MVT Ty) { return OpRef (Undef \| Ty.SimpleTy); }
639
640	// Direct value.
641	SDValue OpV = SDValue ();
642
643	// Reference to the operand of the input node:
644	// If the 31st bit is 1, it's undef, otherwise, bits 28..0 are the
645	// operand index:
646	// If bit 30 is set, it's the high half of the operand.
647	// If bit 29 is set, it's the low half of the operand.
648	unsigned OpN = `0`;
649
650	enum : unsigned {
651	Invalid = `0x10000000`,
652	LoHalf = `0x20000000`,
653	HiHalf = `0x40000000`,
654	Whole = LoHalf \| HiHalf,
655	Undef = `0x80000000`,
656	Index = `0x0FFFFFFF`, // Mask of the index value.
657	IndexBits = `28`,
658	};
659
660	LLVM_DUMP_METHOD
661	void print(raw_ostream &OS, const SelectionDAG &G) const;
662
663	private:
664	OpRef(unsigned N) : OpN(N) {}
665	};
666
667	struct NodeTemplate {
668	NodeTemplate() = default;
669	unsigned Opc = `0`;
670	MVT Ty = MVT::Other;
671	std::vector<OpRef> Ops;
672
673	LLVM_DUMP_METHOD void print(raw_ostream &OS, const SelectionDAG &G) const;
674	};
675
676	struct ResultStack {
677	ResultStack(SDNode *Inp)
678	: InpNode(Inp), InpTy(Inp->getValueType(ResNo: `0`).getSimpleVT()) {}
679	SDNode *InpNode;
680	MVT InpTy;
681	unsigned push(const NodeTemplate &Res) {
682	List.push_back(x: Res);
683	return List.size()-`1`;
684	}
685	unsigned push(unsigned Opc, MVT Ty, std::vector<OpRef> &&Ops) {
686	NodeTemplate Res;
687	Res.Opc = Opc;
688	Res.Ty = Ty;
689	Res.Ops = Ops;
690	return push(Res);
691	}
692	bool empty() const { return List.empty(); }
693	unsigned size() const { return List.size(); }
694	unsigned top() const { return size()-`1`; }
695	const NodeTemplate &operator[](unsigned I) const { return List [I]; }
696	unsigned reset(unsigned NewTop) {
697	List.resize(new_size: NewTop+`1`);
698	return NewTop;
699	}
700
701	using BaseType = std::vector<NodeTemplate>;
702	BaseType::iterator begin() { return List.begin(); }
703	BaseType::iterator end() { return List.end(); }
704	BaseType::const_iterator begin() const { return List.begin(); }
705	BaseType::const_iterator end() const { return List.end(); }
706
707	BaseType List;
708
709	LLVM_DUMP_METHOD
710	void print(raw_ostream &OS, const SelectionDAG &G) const;
711	};
712	} // namespace
713
714	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
715	void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const {
716	if (isValue()) {
717	OpV.getNode()->print(OS, &G);
718	return;
719	}
720	if (OpN & Invalid) {
721	OS << "invalid";
722	return;
723	}
724	if (OpN & Undef) {
725	OS << "undef";
726	return;
727	}
728	if ((OpN & Whole) != Whole) {
729	assert((OpN & Whole) == LoHalf \|\| (OpN & Whole) == HiHalf);
730	if (OpN & LoHalf)
731	OS << "lo ";
732	else
733	OS << "hi ";
734	}
735	OS << `'#'` << SignExtend32(OpN & Index, IndexBits);
736	}
737
738	void NodeTemplate::print(raw_ostream &OS, const SelectionDAG &G) const {
739	const TargetInstrInfo &TII = *G.getSubtarget().getInstrInfo();
740	OS << format("%8s", EVT(Ty).getEVTString().c_str()) << " "
741	<< TII.getName(Opc);
742	bool Comma = false;
743	for (const auto &R : Ops) {
744	if (Comma)
745	OS << `','`;
746	Comma = true;
747	OS << `' '`;
748	R.print(OS, G);
749	}
750	}
751
752	void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const {
753	OS << "Input node:\n";
754	#ifndef NDEBUG
755	InpNode->dumpr(&G);
756	#endif
757	OS << "Result templates:\n";
758	for (unsigned I = `0`, E = List.size(); I != E; ++I) {
759	OS << `'['` << I << "] ";
760	List[I].print(OS, G);
761	OS << `'\n'`;
762	}
763	}
764	#endif
765
766	namespace {
767	struct ShuffleMask {
768	ShuffleMask(ArrayRef<int> M) : Mask (M) {
769	for (int M : Mask) {
770	if (M == -`1`)
771	continue;
772	MinSrc = (MinSrc == -`1`) ? M : std::min(a: MinSrc, b: M);
773	MaxSrc = (MaxSrc == -`1`) ? M : std::max(a: MaxSrc, b: M);
774	}
775	}
776
777	ArrayRef<int> Mask;
778	int MinSrc = -`1`, MaxSrc = -`1`;
779
780	ShuffleMask lo() const {
781	size_t H = Mask.size()/`2`;
782	return ShuffleMask (Mask.take_front(N: H));
783	}
784	ShuffleMask hi() const {
785	size_t H = Mask.size()/`2`;
786	return ShuffleMask (Mask.take_back(N: H));
787	}
788
789	void print(raw_ostream &OS) const {
790	OS << "MinSrc:" << MinSrc << ", MaxSrc:" << MaxSrc << " {";
791	for (int M : Mask)
792	OS << `' '` << M;
793	OS << " }";
794	}
795	};
796
797	LLVM_ATTRIBUTE_UNUSED
798	raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &SM) {
799	SM.print(OS);
800	return OS;
801	}
802	} // namespace
803
804	namespace shuffles {
805	using MaskT = SmallVector<int, `128`>;
806	// Vdd = vshuffvdd(Vu, Vv, Rt)
807	// Vdd = vdealvdd(Vu, Vv, Rt)
808	// Vd = vpack(Vu, Vv, Size, TakeOdd)
809	// Vd = vshuff(Vu, Vv, Size, TakeOdd)
810	// Vd = vdeal(Vu, Vv, Size, TakeOdd)
811	// Vd = vdealb4w(Vu, Vv)
812
813	ArrayRef<int> lo(ArrayRef<int> Vuu) { return Vuu.take_front(N: Vuu.size() / `2`); }
814	ArrayRef<int> hi(ArrayRef<int> Vuu) { return Vuu.take_back(N: Vuu.size() / `2`); }
815
816	MaskT vshuffvdd(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Rt) {
817	int Len = Vu.size();
818	MaskT Vdd(`2` * Len);
819	std::copy(Vv.begin(), Vv.end(), Vdd.begin());
820	std::copy(Vu.begin(), Vu.end(), Vdd.begin() + Len);
821
822	auto Vd0 = MutableArrayRef<int>(Vdd).take_front(N: Len);
823	auto Vd1 = MutableArrayRef<int>(Vdd).take_back(N: Len);
824
825	for (int Offset = `1`; Offset < Len; Offset *= `2`) {
826	if ((Rt & Offset) == `0`)
827	continue;
828	for (int i = `0`; i != Len; ++i) {
829	if ((i & Offset) == `0`)
830	std::swap(a&: Vd1 [i], b&: Vd0 [i + Offset]);
831	}
832	}
833	return Vdd;
834	}
835
836	MaskT vdealvdd(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Rt) {
837	int Len = Vu.size();
838	MaskT Vdd(`2` * Len);
839	std::copy(Vv.begin(), Vv.end(), Vdd.begin());
840	std::copy(Vu.begin(), Vu.end(), Vdd.begin() + Len);
841
842	auto Vd0 = MutableArrayRef<int>(Vdd).take_front(N: Len);
843	auto Vd1 = MutableArrayRef<int>(Vdd).take_back(N: Len);
844
845	for (int Offset = Len / `2`; Offset > `0`; Offset /= `2`) {
846	if ((Rt & Offset) == `0`)
847	continue;
848	for (int i = `0`; i != Len; ++i) {
849	if ((i & Offset) == `0`)
850	std::swap(a&: Vd1 [i], b&: Vd0 [i + Offset]);
851	}
852	}
853	return Vdd;
854	}
855
856	MaskT vpack(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Size, bool TakeOdd) {
857	int Len = Vu.size();
858	MaskT Vd(Len);
859	auto Odd = static_cast<int>(TakeOdd);
860	for (int i = `0`, e = Len / (`2` * Size); i != e; ++i) {
861	for (int b = `0`; b != static_cast<int>(Size); ++b) {
862	// clang-format off
863	Vd [i * Size + b] = Vv [(`2` * i + Odd) * Size + b];
864	Vd [i * Size + b + Len / `2`] = Vu [(`2` * i + Odd) * Size + b];
865	// clang-format on
866	}
867	}
868	return Vd;
869	}
870
871	MaskT vshuff(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Size, bool TakeOdd) {
872	int Len = Vu.size();
873	MaskT Vd(Len);
874	auto Odd = static_cast<int>(TakeOdd);
875	for (int i = `0`, e = Len / (`2` * Size); i != e; ++i) {
876	for (int b = `0`; b != static_cast<int>(Size); ++b) {
877	Vd [(`2` * i + `0`) * Size + b] = Vv [(`2` * i + Odd) * Size + b];
878	Vd [(`2` * i + `1`) * Size + b] = Vu [(`2` * i + Odd) * Size + b];
879	}
880	}
881	return Vd;
882	}
883
884	MaskT vdeal(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Size, bool TakeOdd) {
885	int Len = Vu.size();
886	MaskT T = vdealvdd(Vu, Vv, Rt: Len - `2` * Size);
887	return vpack(Vu: hi(Vuu: T), Vv: lo(Vuu: T), Size, TakeOdd);
888	}
889
890	MaskT vdealb4w(ArrayRef<int> Vu, ArrayRef<int> Vv) {
891	int Len = Vu.size();
892	MaskT Vd(Len);
893	for (int i = `0`, e = Len / `4`; i != e; ++i) {
894	Vd [`0` * (Len / `4`) + i] = Vv [`4` * i + `0`];
895	Vd [`1` * (Len / `4`) + i] = Vv [`4` * i + `2`];
896	Vd [`2` * (Len / `4`) + i] = Vu [`4` * i + `0`];
897	Vd [`3` * (Len / `4`) + i] = Vu [`4` * i + `2`];
898	}
899	return Vd;
900	}
901
902	template <typename ShuffFunc, typename... OptArgs>
903	auto mask(ShuffFunc S, unsigned Length, OptArgs... args) -> MaskT {
904	MaskT Vu(Length), Vv(Length);
905	std::iota(first: Vu.begin(), last: Vu.end(), value: Length); // High
906	std::iota(first: Vv.begin(), last: Vv.end(), value: `0`); // Low
907	return S(Vu, Vv, args...);
908	}
909
910	} // namespace shuffles
911
912	// --------------------------------------------------------------------
913	// The HvxSelector class.
914
915	static const HexagonTargetLowering &getHexagonLowering(SelectionDAG &G) {
916	return static_cast<const HexagonTargetLowering&>(G.getTargetLoweringInfo());
917	}
918	static const HexagonSubtarget &getHexagonSubtarget(SelectionDAG &G) {
919	return G.getSubtarget<HexagonSubtarget>();
920	}
921
922	namespace llvm {
923	struct HvxSelector {
924	const HexagonTargetLowering &Lower;
925	HexagonDAGToDAGISel &ISel;
926	SelectionDAG &DAG;
927	const HexagonSubtarget &HST;
928	const unsigned HwLen;
929
930	HvxSelector(HexagonDAGToDAGISel &HS, SelectionDAG &G)
931	: Lower(getHexagonLowering(G)), ISel(HS), DAG(G),
932	HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {}
933
934	MVT getSingleVT(MVT ElemTy) const {
935	assert(ElemTy != MVT::i1 && "Use getBoolVT for predicates");
936	unsigned NumElems = HwLen / (ElemTy.getSizeInBits() / `8`);
937	return MVT::getVectorVT(VT: ElemTy, NumElements: NumElems);
938	}
939
940	MVT getPairVT(MVT ElemTy) const {
941	assert(ElemTy != MVT::i1); // Suspicious: there are no predicate pairs.
942	unsigned NumElems = (`2` * HwLen) / (ElemTy.getSizeInBits() / `8`);
943	return MVT::getVectorVT(VT: ElemTy, NumElements: NumElems);
944	}
945
946	MVT getBoolVT() const {
947	// Return HwLen x i1.
948	return MVT::getVectorVT(VT: MVT::i1, NumElements: HwLen);
949	}
950
951	void selectExtractSubvector(SDNode *N);
952	void selectShuffle(SDNode *N);
953	void selectRor(SDNode *N);
954	void selectVAlign(SDNode *N);
955
956	static SmallVector<uint32_t, `8`> getPerfectCompletions(ShuffleMask SM,
957	unsigned Width);
958	static SmallVector<uint32_t, `8`> completeToPerfect(
959	ArrayRef<uint32_t> Completions, unsigned Width);
960	static std::optional<int> rotationDistance(ShuffleMask SM, unsigned WrapAt);
961
962	private:
963	void select(SDNode *ISelN);
964	void materialize(const ResultStack &Results);
965
966	SDValue getConst32(int Val, const SDLoc &dl);
967	SDValue getVectorConstant(ArrayRef<uint8_t> Data, const SDLoc &dl);
968
969	enum : unsigned {
970	None,
971	PackMux,
972	};
973	OpRef concats(OpRef Va, OpRef Vb, ResultStack &Results);
974	OpRef funnels(OpRef Va, OpRef Vb, int Amount, ResultStack &Results);
975
976	OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
977	MutableArrayRef<int> NewMask, unsigned Options = None);
978	OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
979	MutableArrayRef<int> NewMask);
980	OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
981	ResultStack &Results);
982	OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
983	ResultStack &Results);
984
985	OpRef shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results);
986	OpRef shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
987	OpRef shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results);
988	OpRef shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
989
990	OpRef butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results);
991	OpRef contracting(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
992	OpRef expanding(ShuffleMask SM, OpRef Va, ResultStack &Results);
993	OpRef perfect(ShuffleMask SM, OpRef Va, ResultStack &Results);
994
995	bool selectVectorConstants(SDNode *N);
996	bool scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl, MVT ResTy,
997	SDValue Va, SDValue Vb, SDNode *N);
998	};
999	} // namespace llvm
1000
1001	static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,
1002	MutableArrayRef<int> MaskR) {
1003	unsigned VecLen = Mask.size();
1004	assert(MaskL.size() == VecLen && MaskR.size() == VecLen);
1005	for (unsigned I = `0`; I != VecLen; ++I) {
1006	int M = Mask [I];
1007	if (M < `0`) {
1008	MaskL [I] = MaskR [I] = -`1`;
1009	} else if (unsigned(M) < VecLen) {
1010	MaskL [I] = M;
1011	MaskR [I] = -`1`;
1012	} else {
1013	MaskL [I] = -`1`;
1014	MaskR [I] = M-VecLen;
1015	}
1016	}
1017	}
1018
1019	static std::pair<int,unsigned> findStrip(ArrayRef<int> A, int Inc,
1020	unsigned MaxLen) {
1021	assert(A.size() > `0` && A.size() >= MaxLen);
1022	int F = A [`0`];
1023	int E = F;
1024	for (unsigned I = `1`; I != MaxLen; ++I) {
1025	if (A [I] - E != Inc)
1026	return { F, I };
1027	E = A [I];
1028	}
1029	return { F, MaxLen };
1030	}
1031
1032	static bool isUndef(ArrayRef<int> Mask) {
1033	for (int Idx : Mask)
1034	if (Idx != -`1`)
1035	return false;
1036	return true;
1037	}
1038
1039	static bool isIdentity(ArrayRef<int> Mask) {
1040	for (int I = `0`, E = Mask.size(); I != E; ++I) {
1041	int M = Mask [I];
1042	if (M >= `0` && M != I)
1043	return false;
1044	}
1045	return true;
1046	}
1047
1048	static bool isLowHalfOnly(ArrayRef<int> Mask) {
1049	int L = Mask.size();
1050	assert(L % `2` == `0`);
1051	// Check if the second half of the mask is all-undef.
1052	return llvm::all_of(Range: Mask.drop_front(N: L / `2`), P: [](int M) { return M < `0`; });
1053	}
1054
1055	static SmallVector<unsigned, `4`> getInputSegmentList(ShuffleMask SM,
1056	unsigned SegLen) {
1057	assert(isPowerOf2_32(SegLen));
1058	SmallVector<unsigned, `4`> SegList;
1059	if (SM.MaxSrc == -`1`)
1060	return SegList;
1061
1062	unsigned Shift = Log2_32(Value: SegLen);
1063	BitVector Segs(alignTo(Value: SM.MaxSrc + `1`, Align: SegLen) >> Shift);
1064
1065	for (int M : SM.Mask) {
1066	if (M >= `0`)
1067	Segs.set(M >> Shift);
1068	}
1069
1070	for (unsigned B : Segs.set_bits())
1071	SegList.push_back(Elt: B);
1072	return SegList;
1073	}
1074
1075	static SmallVector<unsigned, `4`> getOutputSegmentMap(ShuffleMask SM,
1076	unsigned SegLen) {
1077	// Calculate the layout of the output segments in terms of the input
1078	// segments.
1079	// For example [1,3,1,0] means that the output consists of 4 output
1080	// segments, where the first output segment has only elements of the
1081	// input segment at index 1. The next output segment only has elements
1082	// of the input segment 3, etc.
1083	// If an output segment only has undef elements, the value will be ~0u.
1084	// If an output segment has elements from more than one input segment,
1085	// the corresponding value will be ~1u.
1086	unsigned MaskLen = SM.Mask.size();
1087	assert(MaskLen % SegLen == `0`);
1088	SmallVector<unsigned, `4`> Map(MaskLen / SegLen);
1089
1090	for (int S = `0`, E = Map.size(); S != E; ++S) {
1091	unsigned Idx = ~`0u`;
1092	for (int I = `0`; I != static_cast<int>(SegLen); ++I) {
1093	int M = SM.Mask [S*SegLen + I];
1094	if (M < `0`)
1095	continue;
1096	unsigned G = M / SegLen; // Input segment of this element.
1097	if (Idx == ~`0u`) {
1098	Idx = G;
1099	} else if (Idx != G) {
1100	Idx = ~`1u`;
1101	break;
1102	}
1103	}
1104	Map [S] = Idx;
1105	}
1106
1107	return Map;
1108	}
1109
1110	static void packSegmentMask(ArrayRef<int> Mask, ArrayRef<unsigned> OutSegMap,
1111	unsigned SegLen, MutableArrayRef<int> PackedMask) {
1112	SmallVector<unsigned, `4`> InvMap;
1113	for (int I = OutSegMap.size() - `1`; I >= `0`; --I) {
1114	unsigned S = OutSegMap [I];
1115	assert(S != ~`0u` && "Unexpected undef");
1116	assert(S != ~`1u` && "Unexpected multi");
1117	if (InvMap.size() <= S)
1118	InvMap.resize(N: S+`1`);
1119	InvMap [S] = I;
1120	}
1121
1122	unsigned Shift = Log2_32(Value: SegLen);
1123	for (int I = `0`, E = Mask.size(); I != E; ++I) {
1124	int M = Mask [I];
1125	if (M >= `0`) {
1126	int OutIdx = InvMap [M >> Shift];
1127	M = (M & (SegLen-`1`)) + SegLen*OutIdx;
1128	}
1129	PackedMask [I] = M;
1130	}
1131	}
1132
1133	bool HvxSelector::selectVectorConstants(SDNode *N) {
1134	// Constant vectors are generated as loads from constant pools or as
1135	// splats of a constant value. Since they are generated during the
1136	// selection process, the main selection algorithm is not aware of them.
1137	// Select them directly here.
1138	SmallVector<SDNode*,`4`> Nodes;
1139	SetVector<SDNode*> WorkQ;
1140
1141	// The DAG can change (due to CSE) during selection, so cache all the
1142	// unselected nodes first to avoid traversing a mutating DAG.
1143	WorkQ.insert(X: N);
1144	for (unsigned i = `0`; i != WorkQ.size(); ++i) {
1145	SDNode *W = WorkQ [i];
1146	if (!W->isMachineOpcode() && W->getOpcode() == HexagonISD::ISEL)
1147	Nodes.push_back(Elt: W);
1148	for (unsigned j = `0`, f = W->getNumOperands(); j != f; ++j)
1149	WorkQ.insert(X: W->getOperand(Num: j).getNode());
1150	}
1151
1152	for (SDNode *L : Nodes)
1153	select(ISelN: L);
1154
1155	return !Nodes.empty();
1156	}
1157
1158	void HvxSelector::materialize(const ResultStack &Results) {
1159	DEBUG_WITH_TYPE("isel", {
1160	dbgs() << "Materializing\n";
1161	Results.print(dbgs(), DAG);
1162	});
1163	if (Results.empty())
1164	return;
1165	const SDLoc &dl(Results.InpNode);
1166	std::vector<SDValue> Output;
1167
1168	for (unsigned I = `0`, E = Results.size(); I != E; ++I) {
1169	const NodeTemplate &Node = Results [I];
1170	std::vector<SDValue> Ops;
1171	for (const OpRef &R : Node.Ops) {
1172	assert(R.isValid());
1173	if (R.isValue()) {
1174	Ops.push_back(x: R.OpV);
1175	continue;
1176	}
1177	if (R.OpN & OpRef::Undef) {
1178	MVT::SimpleValueType SVT = MVT::SimpleValueType(R.OpN & OpRef::Index);
1179	Ops.push_back(x: ISel.selectUndef(dl, ResTy: MVT (SVT)));
1180	continue;
1181	}
1182	// R is an index of a result.
1183	unsigned Part = R.OpN & OpRef::Whole;
1184	int Idx = SignExtend32(X: R.OpN & OpRef::Index, B: OpRef::IndexBits);
1185	if (Idx < `0`)
1186	Idx += I;
1187	assert(Idx >= `0` && unsigned(Idx) < Output.size());
1188	SDValue Op = Output [Idx];
1189	MVT OpTy = Op.getValueType().getSimpleVT();
1190	if (Part != OpRef::Whole) {
1191	assert(Part == OpRef::LoHalf \|\| Part == OpRef::HiHalf);
1192	MVT HalfTy = MVT::getVectorVT(VT: OpTy.getVectorElementType(),
1193	NumElements: OpTy.getVectorNumElements()/`2`);
1194	unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
1195	: Hexagon::vsub_hi;
1196	Op = DAG.getTargetExtractSubreg(SRIdx: Sub, DL: dl, VT: HalfTy, Operand: Op);
1197	}
1198	Ops.push_back(x: Op);
1199	} // for (Node : Results)
1200
1201	assert(Node.Ty != MVT::Other);
1202	SDNode *ResN = (Node.Opc == TargetOpcode::COPY)
1203	? Ops.front().getNode()
1204	: DAG.getMachineNode(Opcode: Node.Opc, dl, VT: Node.Ty, Ops);
1205	Output.push_back(x: SDValue (ResN, `0`));
1206	}
1207
1208	SDNode *OutN = Output.back().getNode();
1209	SDNode *InpN = Results.InpNode;
1210	DEBUG_WITH_TYPE("isel", {
1211	dbgs() << "Generated node:\n";
1212	OutN->dumpr(&DAG);
1213	});
1214
1215	ISel.ReplaceNode(F: InpN, T: OutN);
1216	selectVectorConstants(N: OutN);
1217	DAG.RemoveDeadNodes();
1218	}
1219
1220	OpRef HvxSelector::concats(OpRef Lo, OpRef Hi, ResultStack &Results) {
1221	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1222	const SDLoc &dl(Results.InpNode);
1223	Results.push(Opc: TargetOpcode::REG_SEQUENCE, Ty: getPairVT(ElemTy: MVT::i8), Ops: {
1224	getConst32(Val: Hexagon::HvxWRRegClassID, dl),
1225	Lo, getConst32(Val: Hexagon::vsub_lo, dl),
1226	Hi, getConst32(Val: Hexagon::vsub_hi, dl),
1227	});
1228	return OpRef::res(N: Results.top());
1229	}
1230
1231	OpRef HvxSelector::funnels(OpRef Va, OpRef Vb, int Amount,
1232	ResultStack &Results) {
1233	// Do a funnel shift towards the low end (shift right) by Amount bytes.
1234	// If Amount < 0, treat it as shift left, i.e. do a shift right by
1235	// Amount + HwLen.
1236	auto VecLen = static_cast<int>(HwLen);
1237
1238	if (Amount == `0`)
1239	return Va;
1240	if (Amount == VecLen)
1241	return Vb;
1242
1243	MVT Ty = getSingleVT(ElemTy: MVT::i8);
1244	const SDLoc &dl(Results.InpNode);
1245
1246	if (Amount < `0`)
1247	Amount += VecLen;
1248	if (Amount > VecLen) {
1249	Amount -= VecLen;
1250	std::swap(a&: Va, b&: Vb);
1251	}
1252
1253	if (isUInt<`3`>(x: Amount)) {
1254	SDValue A = getConst32(Val: Amount, dl);
1255	Results.push(Opc: Hexagon::V6_valignbi, Ty, Ops: {Vb, Va, A});
1256	} else if (isUInt<`3`>(x: VecLen - Amount)) {
1257	SDValue A = getConst32(Val: VecLen - Amount, dl);
1258	Results.push(Opc: Hexagon::V6_vlalignbi, Ty, Ops: {Vb, Va, A});
1259	} else {
1260	SDValue A = getConst32(Val: Amount, dl);
1261	Results.push(Opc: Hexagon::A2_tfrsi, Ty, Ops: {A});
1262	Results.push(Opc: Hexagon::V6_valignb, Ty, Ops: {Vb, Va, OpRef::res(N: -`1`)});
1263	}
1264	return OpRef::res(N: Results.top());
1265	}
1266
1267	// Va, Vb are single vectors. If SM only uses two vector halves from Va/Vb,
1268	// pack these halves into a single vector, and remap SM into NewMask to use
1269	// the new vector instead.
1270	OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
1271	ResultStack &Results, MutableArrayRef<int> NewMask,
1272	unsigned Options) {
1273	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1274	if (!Va.isValid() \|\| !Vb.isValid())
1275	return OpRef::fail();
1276
1277	if (Vb.isUndef()) {
1278	std::copy(SM.Mask.begin(), SM.Mask.end(), NewMask.begin());
1279	return Va;
1280	}
1281	if (Va.isUndef()) {
1282	std::copy(SM.Mask.begin(), SM.Mask.end(), NewMask.begin());
1283	ShuffleVectorSDNode::commuteMask(Mask: NewMask);
1284	return Vb;
1285	}
1286
1287	MVT Ty = getSingleVT(ElemTy: MVT::i8);
1288	MVT PairTy = getPairVT(ElemTy: MVT::i8);
1289	OpRef Inp[`2`] = {Va, Vb};
1290	unsigned VecLen = SM.Mask.size();
1291
1292	auto valign = [this](OpRef Lo, OpRef Hi, unsigned Amt, MVT Ty,
1293	ResultStack &Results) {
1294	if (Amt == `0`)
1295	return Lo;
1296	const SDLoc &dl(Results.InpNode);
1297	if (isUInt<`3`>(x: Amt) \|\| isUInt<`3`>(x: HwLen - Amt)) {
1298	bool IsRight = isUInt<`3`>(x: Amt); // Right align.
1299	SDValue S = getConst32(Val: IsRight ? Amt : HwLen - Amt, dl);
1300	unsigned Opc = IsRight ? Hexagon::V6_valignbi : Hexagon::V6_vlalignbi;
1301	Results.push(Opc, Ty, Ops: {Hi, Lo, S});
1302	return OpRef::res(N: Results.top());
1303	}
1304	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: Amt, dl)});
1305	OpRef A = OpRef::res(N: Results.top());
1306	Results.push(Opc: Hexagon::V6_valignb, Ty, Ops: {Hi, Lo, A});
1307	return OpRef::res(N: Results.top());
1308	};
1309
1310	// Segment is a vector half.
1311	unsigned SegLen = HwLen / `2`;
1312
1313	// Check if we can shuffle vector halves around to get the used elements
1314	// into a single vector.
1315	shuffles::MaskT MaskH(SM.Mask);
1316	SmallVector<unsigned, `4`> SegList = getInputSegmentList(SM: SM.Mask, SegLen);
1317	unsigned SegCount = SegList.size();
1318	SmallVector<unsigned, `4`> SegMap = getOutputSegmentMap(SM: SM.Mask, SegLen);
1319
1320	if (SegList.empty())
1321	return OpRef::undef(Ty);
1322
1323	// NOTE:
1324	// In the following part of the function, where the segments are rearranged,
1325	// the shuffle mask SM can be of any length that is a multiple of a vector
1326	// (i.e. a multiple of 2SegLen), and non-zero.*
1327	// The output segment map is computed, and it may have any even number of
1328	// entries, but the rearrangement of input segments will be done based only
1329	// on the first two (non-undef) entries in the segment map.
1330	// For example, if the output map is 3, 1, 1, 3 (it can have at most two
1331	// distinct entries!), the segments 1 and 3 of Va/Vb will be packaged into
1332	// a single vector V = 3:1. The output mask will then be updated to use
1333	// seg(0,V), seg(1,V), seg(1,V), seg(0,V).
1334	//
1335	// Picking the segments based on the output map is an optimization. For
1336	// correctness it is only necessary that Seg0 and Seg1 are the two input
1337	// segments that are used in the output.
1338
1339	unsigned Seg0 = ~`0u`, Seg1 = ~`0u`;
1340	for (unsigned X : SegMap) {
1341	if (X == ~`0u`)
1342	continue;
1343	if (Seg0 == ~`0u`)
1344	Seg0 = X;
1345	else if (Seg1 != ~`0u`)
1346	break;
1347	if (X == ~`1u` \|\| X != Seg0)
1348	Seg1 = X;
1349	}
1350
1351	if (SegCount == `1`) {
1352	unsigned SrcOp = SegList [`0`] / `2`;
1353	for (int I = `0`; I != static_cast<int>(VecLen); ++I) {
1354	int M = SM.Mask [I];
1355	if (M >= `0`) {
1356	M -= SrcOp * HwLen;
1357	assert(M >= `0`);
1358	}
1359	NewMask [I] = M;
1360	}
1361	return Inp[SrcOp];
1362	}
1363
1364	if (SegCount == `2`) {
1365	// Seg0 should not be undef here: this would imply a SegList
1366	// with <= 1 elements, which was checked earlier.
1367	assert(Seg0 != ~`0u`);
1368
1369	// If Seg0 or Seg1 are "multi-defined", pick them from the input
1370	// segment list instead.
1371	if (Seg0 == ~`1u` \|\| Seg1 == ~`1u`) {
1372	if (Seg0 == Seg1) {
1373	Seg0 = SegList [`0`];
1374	Seg1 = SegList [`1`];
1375	} else if (Seg0 == ~`1u`) {
1376	Seg0 = SegList [`0`] != Seg1 ? SegList [`0`] : SegList [`1`];
1377	} else {
1378	assert(Seg1 == ~`1u`);
1379	Seg1 = SegList [`0`] != Seg0 ? SegList [`0`] : SegList [`1`];
1380	}
1381	}
1382	assert(Seg0 != ~`1u` && Seg1 != ~`1u`);
1383
1384	assert(Seg0 != Seg1 && "Expecting different segments");
1385	const SDLoc &dl(Results.InpNode);
1386	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: SegLen, dl)});
1387	OpRef HL = OpRef::res(N: Results.top());
1388
1389	// Va = AB, Vb = CD
1390
1391	if (Seg0 / `2` == Seg1 / `2`) {
1392	// Same input vector.
1393	Va = Inp[Seg0 / `2`];
1394	if (Seg0 > Seg1) {
1395	// Swap halves.
1396	Results.push(Opc: Hexagon::V6_vror, Ty, Ops: {Inp[Seg0 / `2`], HL});
1397	Va = OpRef::res(N: Results.top());
1398	}
1399	packSegmentMask(Mask: SM.Mask, OutSegMap: {Seg0, Seg1}, SegLen, PackedMask: MaskH);
1400	} else if (Seg0 % `2` == Seg1 % `2`) {
1401	// Picking AC, BD, CA, or DB.
1402	// vshuff(CD,AB,HL) -> BD:AC
1403	// vshuff(AB,CD,HL) -> DB:CA
1404	auto Vs = (Seg0 == `0` \|\| Seg0 == `1`) ? std::make_pair(x&: Vb, y&: Va) // AC or BD
1405	: std::make_pair(x&: Va, y&: Vb); // CA or DB
1406	Results.push(Opc: Hexagon::V6_vshuffvdd, Ty: PairTy, Ops: {Vs.first, Vs.second, HL});
1407	OpRef P = OpRef::res(N: Results.top());
1408	Va = (Seg0 == `0` \|\| Seg0 == `2`) ? OpRef::lo(R: P) : OpRef::hi(R: P);
1409	packSegmentMask(Mask: SM.Mask, OutSegMap: {Seg0, Seg1}, SegLen, PackedMask: MaskH);
1410	} else {
1411	// Picking AD, BC, CB, or DA.
1412	if ((Seg0 == `0` && Seg1 == `3`) \|\| (Seg0 == `2` && Seg1 == `1`)) {
1413	// AD or BC: this can be done using vmux.
1414	// Q = V6_pred_scalar2 SegLen
1415	// V = V6_vmux Q, (Va, Vb) or (Vb, Va)
1416	Results.push(Opc: Hexagon::V6_pred_scalar2, Ty: getBoolVT(), Ops: {HL});
1417	OpRef Qt = OpRef::res(N: Results.top());
1418	auto Vs = (Seg0 == `0`) ? std::make_pair(x&: Va, y&: Vb) // AD
1419	: std::make_pair(x&: Vb, y&: Va); // CB
1420	Results.push(Opc: Hexagon::V6_vmux, Ty, Ops: {Qt, Vs.first, Vs.second});
1421	Va = OpRef::res(N: Results.top());
1422	packSegmentMask(Mask: SM.Mask, OutSegMap: {Seg0, Seg1}, SegLen, PackedMask: MaskH);
1423	} else {
1424	// BC or DA: this could be done via valign by SegLen.
1425	// Do nothing here, because valign (if possible) will be generated
1426	// later on (make sure the Seg0 values are as expected).
1427	assert(Seg0 == `1` \|\| Seg0 == `3`);
1428	}
1429	}
1430	}
1431
1432	// Check if the arguments can be packed by valign(Va,Vb) or valign(Vb,Va).
1433	// FIXME: maybe remove this?
1434	ShuffleMask SMH(MaskH);
1435	assert(SMH.Mask.size() == VecLen);
1436	shuffles::MaskT MaskA(SMH.Mask);
1437
1438	if (SMH.MaxSrc - SMH.MinSrc >= static_cast<int>(HwLen)) {
1439	// valign(Lo=Va,Hi=Vb) won't work. Try swapping Va/Vb.
1440	shuffles::MaskT Swapped(SMH.Mask);
1441	ShuffleVectorSDNode::commuteMask(Mask: Swapped);
1442	ShuffleMask SW(Swapped);
1443	if (SW.MaxSrc - SW.MinSrc < static_cast<int>(HwLen)) {
1444	MaskA.assign(in_start: SW.Mask.begin(), in_end: SW.Mask.end());
1445	std::swap(a&: Va, b&: Vb);
1446	}
1447	}
1448	ShuffleMask SMA(MaskA);
1449	assert(SMA.Mask.size() == VecLen);
1450
1451	if (SMA.MaxSrc - SMA.MinSrc < static_cast<int>(HwLen)) {
1452	int ShiftR = SMA.MinSrc;
1453	if (ShiftR >= static_cast<int>(HwLen)) {
1454	Va = Vb;
1455	Vb = OpRef::undef(Ty);
1456	ShiftR -= HwLen;
1457	}
1458	OpRef RetVal = valign (Va, Vb, ShiftR, Ty, Results);
1459
1460	for (int I = `0`; I != static_cast<int>(VecLen); ++I) {
1461	int M = SMA.Mask [I];
1462	if (M != -`1`)
1463	M -= SMA.MinSrc;
1464	NewMask [I] = M;
1465	}
1466	return RetVal;
1467	}
1468
1469	// By here, packing by segment (half-vector) shuffling, and vector alignment
1470	// failed. Try vmux.
1471	// Note: since this is using the original mask, Va and Vb must not have been
1472	// modified.
1473
1474	if (Options & PackMux) {
1475	// If elements picked from Va and Vb have all different (source) indexes
1476	// (relative to the start of the argument), do a mux, and update the mask.
1477	BitVector Picked(HwLen);
1478	SmallVector<uint8_t,`128`> MuxBytes(HwLen);
1479	bool CanMux = true;
1480	for (int I = `0`; I != static_cast<int>(VecLen); ++I) {
1481	int M = SM.Mask [I];
1482	if (M == -`1`)
1483	continue;
1484	if (M >= static_cast<int>(HwLen))
1485	M -= HwLen;
1486	else
1487	MuxBytes [M] = `0xFF`;
1488	if (Picked [M]) {
1489	CanMux = false;
1490	break;
1491	}
1492	NewMask [I] = M;
1493	}
1494	if (CanMux)
1495	return vmuxs(Bytes: MuxBytes, Va, Vb, Results);
1496	}
1497	return OpRef::fail();
1498	}
1499
1500	// Va, Vb are vector pairs. If SM only uses two single vectors from Va/Vb,
1501	// pack these vectors into a pair, and remap SM into NewMask to use the
1502	// new pair instead.
1503	OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
1504	ResultStack &Results, MutableArrayRef<int> NewMask) {
1505	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1506	SmallVector<unsigned, `4`> SegList = getInputSegmentList(SM: SM.Mask, SegLen: HwLen);
1507	if (SegList.empty())
1508	return OpRef::undef(Ty: getPairVT(ElemTy: MVT::i8));
1509
1510	// If more than two halves are used, bail.
1511	// TODO: be more aggressive here?
1512	unsigned SegCount = SegList.size();
1513	if (SegCount > `2`)
1514	return OpRef::fail();
1515
1516	MVT HalfTy = getSingleVT(ElemTy: MVT::i8);
1517
1518	OpRef Inp[`2`] = { Va, Vb };
1519	OpRef Out[`2`] = { OpRef::undef(Ty: HalfTy), OpRef::undef(Ty: HalfTy) };
1520
1521	// Really make sure we have at most 2 vectors used in the mask.
1522	assert(SegCount <= `2`);
1523
1524	for (int I = `0`, E = SegList.size(); I != E; ++I) {
1525	unsigned S = SegList [I];
1526	OpRef Op = Inp[S / `2`];
1527	Out[I] = (S & `1`) ? OpRef::hi(R: Op) : OpRef::lo(R: Op);
1528	}
1529
1530	// NOTE: Using SegList as the packing map here (not SegMap). This works,
1531	// because we're not concerned here about the order of the segments (i.e.
1532	// single vectors) in the output pair. Changing the order of vectors is
1533	// free (as opposed to changing the order of vector halves as in packs),
1534	// and so there is no extra cost added in case the order needs to be
1535	// changed later.
1536	packSegmentMask(Mask: SM.Mask, OutSegMap: SegList, SegLen: HwLen, PackedMask: NewMask);
1537	return concats(Lo: Out[`0`], Hi: Out[`1`], Results);
1538	}
1539
1540	OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
1541	ResultStack &Results) {
1542	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1543	MVT ByteTy = getSingleVT(ElemTy: MVT::i8);
1544	MVT BoolTy = MVT::getVectorVT(VT: MVT::i1, NumElements: HwLen);
1545	const SDLoc &dl(Results.InpNode);
1546	SDValue B = getVectorConstant(Data: Bytes, dl);
1547	Results.push(Opc: Hexagon::V6_vd0, Ty: ByteTy, Ops: {});
1548	Results.push(Opc: Hexagon::V6_veqb, Ty: BoolTy, Ops: {OpRef (B), OpRef::res(N: -`1`)});
1549	Results.push(Opc: Hexagon::V6_vmux, Ty: ByteTy, Ops: {OpRef::res(N: -`1`), Vb, Va});
1550	return OpRef::res(N: Results.top());
1551	}
1552
1553	OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
1554	ResultStack &Results) {
1555	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1556	size_t S = Bytes.size() / `2`;
1557	OpRef L = vmuxs(Bytes: Bytes.take_front(N: S), Va: OpRef::lo(R: Va), Vb: OpRef::lo(R: Vb), Results);
1558	OpRef H = vmuxs(Bytes: Bytes.drop_front(N: S), Va: OpRef::hi(R: Va), Vb: OpRef::hi(R: Vb), Results);
1559	return concats(Lo: L, Hi: H, Results);
1560	}
1561
1562	OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1563	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1564	unsigned VecLen = SM.Mask.size();
1565	assert(HwLen == VecLen);
1566	(void)VecLen;
1567	assert(all_of(SM.Mask, [this](int M) { return M == -`1` \|\| M < int(HwLen); }));
1568
1569	if (isIdentity(Mask: SM.Mask))
1570	return Va;
1571	if (isUndef(Mask: SM.Mask))
1572	return OpRef::undef(Ty: getSingleVT(ElemTy: MVT::i8));
1573
1574	// First, check for rotations.
1575	if (auto Dist = rotationDistance(SM, WrapAt: VecLen)) {
1576	OpRef Rotate = funnels(Va, Vb: Va, Amount: *Dist, Results);
1577	if (Rotate.isValid())
1578	return Rotate;
1579	}
1580	unsigned HalfLen = HwLen / `2`;
1581	assert(isPowerOf2_32(HalfLen));
1582
1583	// Handle special case where the output is the same half of the input
1584	// repeated twice, i.e. if Va = AB, then handle the output of AA or BB.
1585	std::pair<int, unsigned> Strip1 = findStrip(A: SM.Mask, Inc: `1`, MaxLen: HalfLen);
1586	if ((Strip1.first & ~HalfLen) == `0` && Strip1.second == HalfLen) {
1587	std::pair<int, unsigned> Strip2 =
1588	findStrip(A: SM.Mask.drop_front(N: HalfLen), Inc: `1`, MaxLen: HalfLen);
1589	if (Strip1 == Strip2) {
1590	const SDLoc &dl(Results.InpNode);
1591	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: HalfLen, dl)});
1592	Results.push(Opc: Hexagon::V6_vshuffvdd, Ty: getPairVT(ElemTy: MVT::i8),
1593	Ops: {Va, Va, OpRef::res(N: Results.top())});
1594	OpRef S = OpRef::res(N: Results.top());
1595	return (Strip1.first == `0`) ? OpRef::lo(R: S) : OpRef::hi(R: S);
1596	}
1597	}
1598
1599	OpRef P = perfect(SM, Va, Results);
1600	if (P.isValid())
1601	return P;
1602	return butterfly(SM, Va, Results);
1603	}
1604
1605	OpRef HvxSelector::shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb,
1606	ResultStack &Results) {
1607	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1608	if (isUndef(Mask: SM.Mask))
1609	return OpRef::undef(Ty: getSingleVT(ElemTy: MVT::i8));
1610
1611	OpRef C = contracting(SM, Va, Vb, Results);
1612	if (C.isValid())
1613	return C;
1614
1615	int VecLen = SM.Mask.size();
1616	shuffles::MaskT PackedMask(VecLen);
1617	OpRef P = packs(SM, Va, Vb, Results, NewMask: PackedMask);
1618	if (P.isValid())
1619	return shuffs1(SM: ShuffleMask (PackedMask), Va: P, Results);
1620
1621	// TODO: Before we split the mask, try perfect shuffle on concatenated
1622	// operands.
1623
1624	shuffles::MaskT MaskL(VecLen), MaskR(VecLen);
1625	splitMask(Mask: SM.Mask, MaskL, MaskR);
1626
1627	OpRef L = shuffs1(SM: ShuffleMask (MaskL), Va, Results);
1628	OpRef R = shuffs1(SM: ShuffleMask (MaskR), Va: Vb, Results);
1629	if (!L.isValid() \|\| !R.isValid())
1630	return OpRef::fail();
1631
1632	SmallVector<uint8_t, `128`> Bytes(VecLen);
1633	for (int I = `0`; I != VecLen; ++I) {
1634	if (MaskL [I] != -`1`)
1635	Bytes [I] = `0xFF`;
1636	}
1637	return vmuxs(Bytes, Va: L, Vb: R, Results);
1638	}
1639
1640	OpRef HvxSelector::shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1641	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1642	int VecLen = SM.Mask.size();
1643
1644	if (isIdentity(Mask: SM.Mask))
1645	return Va;
1646	if (isUndef(Mask: SM.Mask))
1647	return OpRef::undef(Ty: getPairVT(ElemTy: MVT::i8));
1648
1649	shuffles::MaskT PackedMask(VecLen);
1650	OpRef P = packs(SM, Va: OpRef::lo(R: Va), Vb: OpRef::hi(R: Va), Results, NewMask: PackedMask);
1651	if (P.isValid()) {
1652	ShuffleMask PM(PackedMask);
1653	OpRef E = expanding(SM: PM, Va: P, Results);
1654	if (E.isValid())
1655	return E;
1656
1657	OpRef L = shuffs1(SM: PM.lo(), Va: P, Results);
1658	OpRef H = shuffs1(SM: PM.hi(), Va: P, Results);
1659	if (L.isValid() && H.isValid())
1660	return concats(Lo: L, Hi: H, Results);
1661	}
1662
1663	if (!isLowHalfOnly(Mask: SM.Mask)) {
1664	// Doing a perfect shuffle on a low-half mask (i.e. where the upper half
1665	// is all-undef) may produce a perfect shuffle that generates legitimate
1666	// upper half. This isn't wrong, but if the perfect shuffle was possible,
1667	// then there is a good chance that a shorter (contracting) code may be
1668	// used as well (e.g. V6_vshuffeb, etc).
1669	OpRef R = perfect(SM, Va, Results);
1670	if (R.isValid())
1671	return R;
1672	// TODO commute the mask and try the opposite order of the halves.
1673	}
1674
1675	OpRef L = shuffs2(SM: SM.lo(), Va: OpRef::lo(R: Va), Vb: OpRef::hi(R: Va), Results);
1676	OpRef H = shuffs2(SM: SM.hi(), Va: OpRef::lo(R: Va), Vb: OpRef::hi(R: Va), Results);
1677	if (L.isValid() && H.isValid())
1678	return concats(Lo: L, Hi: H, Results);
1679
1680	return OpRef::fail();
1681	}
1682
1683	OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb,
1684	ResultStack &Results) {
1685	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1686	if (isUndef(Mask: SM.Mask))
1687	return OpRef::undef(Ty: getPairVT(ElemTy: MVT::i8));
1688
1689	int VecLen = SM.Mask.size();
1690	SmallVector<int,`256`> PackedMask(VecLen);
1691	OpRef P = packp(SM, Va, Vb, Results, NewMask: PackedMask);
1692	if (P.isValid())
1693	return shuffp1(SM: ShuffleMask (PackedMask), Va: P, Results);
1694
1695	SmallVector<int,`256`> MaskL(VecLen), MaskR(VecLen);
1696	splitMask(Mask: SM.Mask, MaskL, MaskR);
1697
1698	OpRef L = shuffp1(SM: ShuffleMask (MaskL), Va, Results);
1699	OpRef R = shuffp1(SM: ShuffleMask (MaskR), Va: Vb, Results);
1700	if (!L.isValid() \|\| !R.isValid())
1701	return OpRef::fail();
1702
1703	// Mux the results.
1704	SmallVector<uint8_t,`256`> Bytes(VecLen);
1705	for (int I = `0`; I != VecLen; ++I) {
1706	if (MaskL [I] != -`1`)
1707	Bytes [I] = `0xFF`;
1708	}
1709	return vmuxp(Bytes, Va: L, Vb: R, Results);
1710	}
1711
1712	namespace {
1713	struct Deleter : public SelectionDAG::DAGNodeDeletedListener {
1714	template <typename T>
1715	Deleter(SelectionDAG &D, T &C)
1716	: SelectionDAG::DAGNodeDeletedListener(D, [&C] (SDNode N, SDNode E) {
1717	C.erase(N);
1718	}) {}
1719	};
1720
1721	template <typename T>
1722	struct NullifyingVector : public T {
1723	DenseMap<SDNode, SDNode*> Refs;
1724	NullifyingVector(T &&V) : T(V) {
1725	for (unsigned i = `0`, e = T::size(); i != e; ++i) {
1726	SDNode &N = T::operator*[](i);
1727	Refs [N] = &N;
1728	}
1729	}
1730	void erase(SDNode *N) {
1731	auto F = Refs.find(Val: N);
1732	if (F != Refs.end())
1733	F ->second = nullptr*;
1734	}
1735	};
1736	}
1737
1738	void HvxSelector::select(SDNode *ISelN) {
1739	// What's important here is to select the right set of nodes. The main
1740	// selection algorithm loops over nodes in a topological order, i.e. users
1741	// are visited before their operands.
1742	//
1743	// It is an error to have an unselected node with a selected operand, and
1744	// there is an assertion in the main selector code to enforce that.
1745	//
1746	// Such a situation could occur if we selected a node, which is both a
1747	// subnode of ISelN, and a subnode of an unrelated (and yet unselected)
1748	// node in the DAG.
1749	assert(ISelN->getOpcode() == HexagonISD::ISEL);
1750	SDNode *N0 = ISelN->getOperand(Num: `0`).getNode();
1751
1752	// There could have been nodes created (i.e. inserted into the DAG)
1753	// that are now dead. Remove them, in case they use any of the nodes
1754	// to select (and make them look shared).
1755	DAG.RemoveDeadNodes();
1756
1757	SetVector<SDNode *> SubNodes;
1758
1759	if (!N0->isMachineOpcode()) {
1760	// Don't want to select N0 if it's shared with another node, except if
1761	// it's shared with other ISELs.
1762	auto IsISelN = [](SDNode T) { return* T->getOpcode() == HexagonISD::ISEL; };
1763	if (llvm::all_of(Range: N0->uses(), P: IsISelN))
1764	SubNodes.insert(X: N0);
1765	}
1766	if (SubNodes.empty()) {
1767	ISel.ReplaceNode(F: ISelN, T: N0);
1768	return;
1769	}
1770
1771	// Need to manually select the nodes that are dominated by the ISEL. Other
1772	// nodes are reachable from the rest of the DAG, and so will be selected
1773	// by the DAG selection routine.
1774	SetVector<SDNode*> Dom, NonDom;
1775	Dom.insert(X: N0);
1776
1777	auto IsDomRec = [&Dom, &NonDom] (SDNode T, auto* Rec) -> bool {
1778	if (Dom.count(key: T))
1779	return true;
1780	if (T->use_empty() \|\| NonDom.count(key: T))
1781	return false;
1782	for (SDNode *U : T->uses()) {
1783	// If T is reachable from a known non-dominated node, then T itself
1784	// is non-dominated.
1785	if (!Rec(U, Rec)) {
1786	NonDom.insert(X: T);
1787	return false;
1788	}
1789	}
1790	Dom.insert(X: T);
1791	return true;
1792	};
1793
1794	auto IsDom = [&IsDomRec] (SDNode T) { return* IsDomRec (T, IsDomRec); };
1795
1796	// Add the rest of nodes dominated by ISEL to SubNodes.
1797	for (unsigned I = `0`; I != SubNodes.size(); ++I) {
1798	for (SDValue Op : SubNodes [I]->ops()) {
1799	SDNode *O = Op.getNode();
1800	if (IsDom (O))
1801	SubNodes.insert(X: O);
1802	}
1803	}
1804
1805	// Do a topological sort of nodes from Dom.
1806	SetVector<SDNode*> TmpQ;
1807
1808	std::map<SDNode , unsigned*> OpCount;
1809	for (SDNode *T : Dom) {
1810	unsigned NumDomOps = llvm::count_if(Range: T->ops(), P: [&Dom](const SDUse &U) {
1811	return Dom.count(key: U.getNode());
1812	});
1813
1814	OpCount.insert(x: {T, NumDomOps});
1815	if (NumDomOps == `0`)
1816	TmpQ.insert(X: T);
1817	}
1818
1819	for (unsigned I = `0`; I != TmpQ.size(); ++I) {
1820	SDNode *S = TmpQ [I];
1821	for (SDNode *U : S->uses()) {
1822	if (U == ISelN)
1823	continue;
1824	auto F = OpCount.find(x: U);
1825	assert(F != OpCount.end());
1826	if (F ->second > `0` && !--F ->second)
1827	TmpQ.insert(X: F ->first);
1828	}
1829	}
1830
1831	// Remove the marker.
1832	ISel.ReplaceNode(F: ISelN, T: N0);
1833
1834	assert(SubNodes.size() == TmpQ.size());
1835	NullifyingVector<decltype(TmpQ)::vector_type> Queue(TmpQ.takeVector());
1836
1837	Deleter DUQ(DAG, Queue);
1838	for (SDNode *S : reverse(C&: Queue)) {
1839	if (S == nullptr)
1840	continue;
1841	DEBUG_WITH_TYPE("isel", {dbgs() << "HVX selecting: "; S->dump(&DAG);});
1842	ISel.Select(N: S);
1843	}
1844	}
1845
1846	bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
1847	MVT ResTy, SDValue Va, SDValue Vb,
1848	SDNode *N) {
1849	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
1850	MVT ElemTy = ResTy.getVectorElementType();
1851	assert(ElemTy == MVT::i8);
1852	unsigned VecLen = Mask.size();
1853	bool HavePairs = (`2`*HwLen == VecLen);
1854	MVT SingleTy = getSingleVT(ElemTy: MVT::i8);
1855
1856	// The prior attempts to handle this shuffle may have left a bunch of
1857	// dead nodes in the DAG (such as constants). These nodes will be added
1858	// at the end of DAG's node list, which at that point had already been
1859	// sorted topologically. In the main selection loop, the node list is
1860	// traversed backwards from the root node, which means that any new
1861	// nodes (from the end of the list) will not be visited.
1862	// Scalarization will replace the shuffle node with the scalarized
1863	// expression, and if that expression reused any if the leftoever (dead)
1864	// nodes, these nodes would not be selected (since the "local" selection
1865	// only visits nodes that are not in AllNodes).
1866	// To avoid this issue, remove all dead nodes from the DAG now.
1867	// DAG.RemoveDeadNodes();
1868
1869	SmallVector<SDValue,`128`> Ops;
1870	LLVMContext &Ctx = *DAG.getContext();
1871	MVT LegalTy = Lower.getTypeToTransformTo(Context&: Ctx, VT: ElemTy).getSimpleVT();
1872	for (int I : Mask) {
1873	if (I < `0`) {
1874	Ops.push_back(Elt: ISel.selectUndef(dl, ResTy: LegalTy));
1875	continue;
1876	}
1877	SDValue Vec;
1878	unsigned M = I;
1879	if (M < VecLen) {
1880	Vec = Va;
1881	} else {
1882	Vec = Vb;
1883	M -= VecLen;
1884	}
1885	if (HavePairs) {
1886	if (M < HwLen) {
1887	Vec = DAG.getTargetExtractSubreg(SRIdx: Hexagon::vsub_lo, DL: dl, VT: SingleTy, Operand: Vec);
1888	} else {
1889	Vec = DAG.getTargetExtractSubreg(SRIdx: Hexagon::vsub_hi, DL: dl, VT: SingleTy, Operand: Vec);
1890	M -= HwLen;
1891	}
1892	}
1893	SDValue Idx = DAG.getConstant(Val: M, DL: dl, VT: MVT::i32);
1894	SDValue Ex = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: LegalTy, Ops: {Vec, Idx});
1895	SDValue L = Lower.LowerOperation(Op: Ex, DAG);
1896	assert(L.getNode());
1897	Ops.push_back(Elt: L);
1898	}
1899
1900	SDValue LV;
1901	if (`2`*HwLen == VecLen) {
1902	SDValue B0 = DAG.getBuildVector(VT: SingleTy, DL: dl, Ops: {Ops.data(), HwLen});
1903	SDValue L0 = Lower.LowerOperation(Op: B0, DAG);
1904	SDValue B1 = DAG.getBuildVector(VT: SingleTy, DL: dl, Ops: {Ops.data()+HwLen, HwLen});
1905	SDValue L1 = Lower.LowerOperation(Op: B1, DAG);
1906	// XXX CONCAT_VECTORS is legal for HVX vectors. Legalizing (lowering)
1907	// functions may expect to be called only for illegal operations, so
1908	// make sure that they are not called for legal ones. Develop a better
1909	// mechanism for dealing with this.
1910	LV = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: ResTy, Ops: {L0, L1});
1911	} else {
1912	SDValue BV = DAG.getBuildVector(VT: ResTy, DL: dl, Ops);
1913	LV = Lower.LowerOperation(Op: BV, DAG);
1914	}
1915
1916	assert(!N->use_empty());
1917	SDValue IS = DAG.getNode(Opcode: HexagonISD::ISEL, DL: dl, VT: ResTy, Operand: LV);
1918	ISel.ReplaceNode(F: N, T: IS.getNode());
1919	select(ISelN: IS.getNode());
1920	DAG.RemoveDeadNodes();
1921	return true;
1922	}
1923
1924	SmallVector<uint32_t, `8`> HvxSelector::getPerfectCompletions(ShuffleMask SM,
1925	unsigned Width) {
1926	auto possibilities = [](ArrayRef<uint8_t> Bs, unsigned Width) -> uint32_t {
1927	unsigned Impossible = ~(`1u` << Width) + `1`;
1928	for (unsigned I = `0`, E = Bs.size(); I != E; ++I) {
1929	uint8_t B = Bs [I];
1930	if (B == `0xff`)
1931	continue;
1932	if (~Impossible == `0`)
1933	break;
1934	for (unsigned Log = `0`; Log != Width; ++Log) {
1935	if (Impossible & (`1u` << Log))
1936	continue;
1937	unsigned Expected = (I >> Log) % `2`;
1938	if (B != Expected)
1939	Impossible \|= (`1u` << Log);
1940	}
1941	}
1942	return ~Impossible;
1943	};
1944
1945	SmallVector<uint32_t, `8`> Worklist(Width);
1946
1947	for (unsigned BitIdx = `0`; BitIdx != Width; ++BitIdx) {
1948	SmallVector<uint8_t> BitValues(SM.Mask.size());
1949	for (int i = `0`, e = SM.Mask.size(); i != e; ++i) {
1950	int M = SM.Mask [i];
1951	if (M < `0`)
1952	BitValues [i] = `0xff`;
1953	else
1954	BitValues [i] = (M & (`1u` << BitIdx)) != `0`;
1955	}
1956	Worklist [BitIdx] = possibilities (BitValues, Width);
1957	}
1958
1959	// If there is a word P in Worklist that matches multiple possibilities,
1960	// then if any other word Q matches any of the possibilities matched by P,
1961	// then Q matches all the possibilities matched by P. In fact, P == Q.
1962	// In other words, for each words P, Q, the sets of possibilities matched
1963	// by P and Q are either equal or disjoint (no partial overlap).
1964	//
1965	// Illustration: For 4-bit values there are 4 complete sequences:
1966	// a: 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
1967	// b: 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
1968	// c: 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1969	// d: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
1970	//
1971	// Words containing unknown bits that match two of the complete
1972	// sequences:
1973	// ab: 0 u u 1 0 u u 1 0 u u 1 0 u u 1
1974	// ac: 0 u 0 u u 1 u 1 0 u 0 u u 1 u 1
1975	// ad: 0 u 0 u 0 u 0 u u 1 u 1 u 1 u 1
1976	// bc: 0 0 u u u u 1 1 0 0 u u u u 1 1
1977	// bd: 0 0 u u 0 0 u u u u 1 1 u u 1 1
1978	// cd: 0 0 0 0 u u u u u u u u 1 1 1 1
1979	//
1980	// Proof of the claim above:
1981	// Let P be a word that matches s0 and s1. For that to happen, all known
1982	// bits in P must match s0 and s1 exactly.
1983	// Assume there is Q that matches s1. Note that since P and Q came from
1984	// the same shuffle mask, the positions of unknown bits in P and Q match
1985	// exactly, which makes the indices of known bits be exactly the same
1986	// between P and Q. Since P matches s0 and s1, the known bits of P much
1987	// match both s0 and s1. Also, since Q matches s1, the known bits in Q
1988	// are exactly the same as in s1, which means that they are exactly the
1989	// same as in P. This implies that P == Q.
1990
1991	// There can be a situation where there are more entries with the same
1992	// bits set than there are set bits (e.g. value 9 occuring more than 2
1993	// times). In such cases it will be impossible to complete this to a
1994	// perfect shuffle.
1995	SmallVector<uint32_t, `8`> Sorted(Worklist);
1996	llvm::sort(Start: Sorted.begin(), End: Sorted.end());
1997
1998	for (unsigned I = `0`, E = Sorted.size(); I != E;) {
1999	unsigned P = Sorted [I], Count = `1`;
2000	while (++I != E && P == Sorted [I])
2001	++Count;
2002	if ((unsigned)llvm::popcount(Value: P) < Count) {
2003	// Reset all occurences of P, if there are more occurrences of P
2004	// than there are bits in P.
2005	for (unsigned &Q : Worklist) {
2006	if (Q == P)
2007	Q = `0`;
2008	}
2009	}
2010	}
2011
2012	return Worklist;
2013	}
2014
2015	SmallVector<uint32_t, `8`>
2016	HvxSelector::completeToPerfect(ArrayRef<uint32_t> Completions, unsigned Width) {
2017	// Pick a completion if there are multiple possibilities. For now just
2018	// select any valid completion.
2019	SmallVector<uint32_t, `8`> Comps(Completions);
2020
2021	for (unsigned I = `0`; I != Width; ++I) {
2022	uint32_t P = Comps [I];
2023	assert(P != `0`);
2024	if (isPowerOf2_32(Value: P))
2025	continue;
2026	// T = least significant bit of P.
2027	uint32_t T = P ^ ((P - `1`) & P);
2028	// Clear T in all remaining words matching P.
2029	for (unsigned J = I + `1`; J != Width; ++J) {
2030	if (Comps [J] == P)
2031	Comps [J] ^= T;
2032	}
2033	Comps [I] = T;
2034	}
2035
2036	#ifndef NDEBUG
2037	// Check that we have generated a valid completion.
2038	uint32_t OrAll = `0`;
2039	for (uint32_t C : Comps) {
2040	assert(isPowerOf2_32(C));
2041	OrAll \|= C;
2042	}
2043	assert(OrAll == (`1u` << Width) -`1`);
2044	#endif
2045
2046	return Comps;
2047	}
2048
2049	std::optional<int> HvxSelector::rotationDistance(ShuffleMask SM,
2050	unsigned WrapAt) {
2051	std::optional<int> Dist;
2052	for (int I = `0`, E = SM.Mask.size(); I != E; ++I) {
2053	int M = SM.Mask [I];
2054	if (M < `0`)
2055	continue;
2056	if (Dist) {
2057	if ((I + Dist) % static_cast<int*>(WrapAt) != M)
2058	return std::nullopt;
2059	} else {
2060	// Integer a%b operator assumes rounding towards zero by /, so it
2061	// "misbehaves" when a crosses 0 (the remainder also changes sign).
2062	// Add WrapAt in an attempt to keep I+Dist non-negative.
2063	Dist = M - I;
2064	if (Dist < `0`)
2065	Dist = *Dist + WrapAt;
2066	}
2067	}
2068	return Dist;
2069	}
2070
2071	OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
2072	ResultStack &Results) {
2073	DEBUG_WITH_TYPE("isel", { dbgs() << __func__ << `'\n'`; });
2074	if (!Va.isValid() \|\| !Vb.isValid())
2075	return OpRef::fail();
2076
2077	// Contracting shuffles, i.e. instructions that always discard some bytes
2078	// from the operand vectors.
2079	//
2080	// Funnel shifts
2081	// V6_vshuff{e,o}b
2082	// V6_vshuf{e,o}h
2083	// V6_vdealb4w
2084	// V6_vpack{e,o}{b,h}
2085
2086	int VecLen = SM.Mask.size();
2087
2088	// First, check for funnel shifts.
2089	if (auto Dist = rotationDistance(SM, WrapAt: `2` * VecLen)) {
2090	OpRef Funnel = funnels(Va, Vb, Amount: *Dist, Results);
2091	if (Funnel.isValid())
2092	return Funnel;
2093	}
2094
2095	MVT SingleTy = getSingleVT(ElemTy: MVT::i8);
2096	MVT PairTy = getPairVT(ElemTy: MVT::i8);
2097
2098	auto same = [](ArrayRef<int> Mask1, ArrayRef<int> Mask2) -> bool {
2099	return Mask1 == Mask2;
2100	};
2101
2102	using PackConfig = std::pair<unsigned, bool>;
2103	PackConfig Packs[] = {
2104	{`1`, false}, // byte, even
2105	{`1`, true}, // byte, odd
2106	{`2`, false}, // half, even
2107	{`2`, true}, // half, odd
2108	};
2109
2110	{ // Check vpack
2111	unsigned Opcodes[] = {
2112	Hexagon::V6_vpackeb,
2113	Hexagon::V6_vpackob,
2114	Hexagon::V6_vpackeh,
2115	Hexagon::V6_vpackoh,
2116	};
2117	for (int i = `0`, e = std::size(Opcodes); i != e; ++i) {
2118	auto [Size, Odd] = Packs[i];
2119	if (same (SM.Mask, shuffles::mask(S: shuffles::vpack, Length: HwLen, args: Size, args: Odd))) {
2120	Results.push(Opc: Opcodes[i], Ty: SingleTy, Ops: {Vb, Va});
2121	return OpRef::res(N: Results.top());
2122	}
2123	}
2124	}
2125
2126	{ // Check vshuff
2127	unsigned Opcodes[] = {
2128	Hexagon::V6_vshuffeb,
2129	Hexagon::V6_vshuffob,
2130	Hexagon::V6_vshufeh,
2131	Hexagon::V6_vshufoh,
2132	};
2133	for (int i = `0`, e = std::size(Opcodes); i != e; ++i) {
2134	auto [Size, Odd] = Packs[i];
2135	if (same (SM.Mask, shuffles::mask(S: shuffles::vshuff, Length: HwLen, args: Size, args: Odd))) {
2136	Results.push(Opc: Opcodes[i], Ty: SingleTy, Ops: {Vb, Va});
2137	return OpRef::res(N: Results.top());
2138	}
2139	}
2140	}
2141
2142	{ // Check vdeal
2143	// There is no "V6_vdealeb", etc, but the supposed behavior of vdealeb
2144	// is equivalent to "(V6_vpackeb (V6_vdealvdd Vu, Vv, -2))". Other such
2145	// variants of "deal" can be done similarly.
2146	unsigned Opcodes[] = {
2147	Hexagon::V6_vpackeb,
2148	Hexagon::V6_vpackob,
2149	Hexagon::V6_vpackeh,
2150	Hexagon::V6_vpackoh,
2151	};
2152	const SDLoc &dl(Results.InpNode);
2153
2154	for (int i = `0`, e = std::size(Opcodes); i != e; ++i) {
2155	auto [Size, Odd] = Packs[i];
2156	if (same (SM.Mask, shuffles::mask(S: shuffles::vdeal, Length: HwLen, args: Size, args: Odd))) {
2157	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: -`2` * Size, dl)});
2158	Results.push(Opc: Hexagon::V6_vdealvdd, Ty: PairTy, Ops: {Vb, Va, OpRef::res(N: -`1`)});
2159	auto vdeal = OpRef::res(N: Results.top());
2160	Results.push(Opc: Opcodes[i], Ty: SingleTy,
2161	Ops: {OpRef::hi(R: vdeal), OpRef::lo(R: vdeal)});
2162	return OpRef::res(N: Results.top());
2163	}
2164	}
2165	}
2166
2167	if (same (SM.Mask, shuffles::mask(S: shuffles::vdealb4w, Length: HwLen))) {
2168	Results.push(Opc: Hexagon::V6_vdealb4w, Ty: SingleTy, Ops: {Vb, Va});
2169	return OpRef::res(N: Results.top());
2170	}
2171
2172	return OpRef::fail();
2173	}
2174
2175	OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
2176	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
2177	// Expanding shuffles (using all elements and inserting into larger vector):
2178	//
2179	// V6_vunpacku{b,h} []*
2180	//
2181	// [] Only if the upper elements (filled with 0s) are "don't care" in Mask.*
2182	//
2183	// Note: V6_vunpacko{b,h} are or-ing the high byte/half in the result, so
2184	// they are not shuffles.
2185	//
2186	// The argument is a single vector.
2187
2188	int VecLen = SM.Mask.size();
2189	assert(`2`HwLen == unsigned*(VecLen) && "Expecting vector-pair type");
2190
2191	std::pair<int,unsigned> Strip = findStrip(A: SM.Mask, Inc: `1`, MaxLen: VecLen);
2192
2193	// The patterns for the unpacks, in terms of the starting offsets of the
2194	// consecutive strips (L = length of the strip, N = VecLen):
2195	//
2196	// vunpacku: 0, -1, L, -1, 2L, -1 ...
2197
2198	if (Strip.first != `0`)
2199	return OpRef::fail();
2200
2201	// The vunpackus only handle byte and half-word.
2202	if (Strip.second != `1` && Strip.second != `2`)
2203	return OpRef::fail();
2204
2205	int N = VecLen;
2206	int L = Strip.second;
2207
2208	// First, check the non-ignored strips.
2209	for (int I = `2`L; I < N; I += `2`L) {
2210	auto S = findStrip(A: SM.Mask.drop_front(N: I), Inc: `1`, MaxLen: N-I);
2211	if (S.second != unsigned(L))
2212	return OpRef::fail();
2213	if (`2`*S.first != I)
2214	return OpRef::fail();
2215	}
2216	// Check the -1s.
2217	for (int I = L; I < N; I += `2`*L) {
2218	auto S = findStrip(A: SM.Mask.drop_front(N: I), Inc: `0`, MaxLen: N-I);
2219	if (S.first != -`1` \|\| S.second != unsigned(L))
2220	return OpRef::fail();
2221	}
2222
2223	unsigned Opc = Strip.second == `1` ? Hexagon::V6_vunpackub
2224	: Hexagon::V6_vunpackuh;
2225	Results.push(Opc, Ty: getPairVT(ElemTy: MVT::i8), Ops: {Va});
2226	return OpRef::res(N: Results.top());
2227	}
2228
2229	OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
2230	DEBUG_WITH_TYPE("isel", { dbgs() << __func__ << `'\n'`; });
2231	// V6_vdeal{b,h}
2232	// V6_vshuff{b,h}
2233
2234	// V6_vshufoe{b,h} those are quivalent to vshuffvdd(..,{1,2})
2235	// V6_vshuffvdd (V6_vshuff)
2236	// V6_dealvdd (V6_vdeal)
2237
2238	int VecLen = SM.Mask.size();
2239	assert(isPowerOf2_32(VecLen) && Log2_32(VecLen) <= `8`);
2240	unsigned LogLen = Log2_32(Value: VecLen);
2241	unsigned HwLog = Log2_32(Value: HwLen);
2242	// The result length must be the same as the length of a single vector,
2243	// or a vector pair.
2244	assert(LogLen == HwLog \|\| LogLen == HwLog + `1`);
2245	bool HavePairs = LogLen == HwLog + `1`;
2246
2247	SmallVector<unsigned, `8`> Perm(LogLen);
2248
2249	// Check if this could be a perfect shuffle, or a combination of perfect
2250	// shuffles.
2251	//
2252	// Consider this permutation (using hex digits to make the ASCII diagrams
2253	// easier to read):
2254	// { 0, 8, 1, 9, 2, A, 3, B, 4, C, 5, D, 6, E, 7, F }.
2255	// This is a "deal" operation: divide the input into two halves, and
2256	// create the output by picking elements by alternating between these two
2257	// halves:
2258	// 0 1 2 3 4 5 6 7 --> 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F []*
2259	// 8 9 A B C D E F
2260	//
2261	// Aside from a few special explicit cases (V6_vdealb, etc.), HVX provides
2262	// a somwehat different mechanism that could be used to perform shuffle/
2263	// deal operations: a 2x2 transpose.
2264	// Consider the halves of inputs again, they can be interpreted as a 2x8
2265	// matrix. A 2x8 matrix can be looked at four 2x2 matrices concatenated
2266	// together. Now, when considering 2 elements at a time, it will be a 2x4
2267	// matrix (with elements 01, 23, 45, etc.), or two 2x2 matrices:
2268	// 01 23 45 67
2269	// 89 AB CD EF
2270	// With groups of 4, this will become a single 2x2 matrix, and so on.
2271	//
2272	// The 2x2 transpose instruction works by transposing each of the 2x2
2273	// matrices (or "sub-matrices"), given a specific group size. For example,
2274	// if the group size is 1 (i.e. each element is its own group), there
2275	// will be four transposes of the four 2x2 matrices that form the 2x8.
2276	// For example, with the inputs as above, the result will be:
2277	// 0 8 2 A 4 C 6 E
2278	// 1 9 3 B 5 D 7 F
2279	// Now, this result can be tranposed again, but with the group size of 2:
2280	// 08 19 4C 5D
2281	// 2A 3B 6E 7F
2282	// If we then transpose that result, but with the group size of 4, we get:
2283	// 0819 2A3B
2284	// 4C5D 6E7F
2285	// If we concatenate these two rows, it will be
2286	// 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F
2287	// which is the same as the "deal" [] above.*
2288	//
2289	// In general, a "deal" of individual elements is a series of 2x2 transposes,
2290	// with changing group size. HVX has two instructions:
2291	// Vdd = V6_vdealvdd Vu, Vv, Rt
2292	// Vdd = V6_shufvdd Vu, Vv, Rt
2293	// that perform exactly that. The register Rt controls which transposes are
2294	// going to happen: a bit at position n (counting from 0) indicates that a
2295	// transpose with a group size of 2^n will take place. If multiple bits are
2296	// set, multiple transposes will happen: vdealvdd will perform them starting
2297	// with the largest group size, vshuffvdd will do them in the reverse order.
2298	//
2299	// The main observation is that each 2x2 transpose corresponds to swapping
2300	// columns of bits in the binary representation of the values.
2301	//
2302	// The numbers {3,2,1,0} and the log2 of the number of contiguous 1 bits
2303	// in a given column. The denote the columns that will be swapped.*
2304	// The transpose with the group size 2^n corresponds to swapping columns
2305	// 3 (the highest log) and log2(n):
2306	//
2307	// 3 2 1 0 0 2 1 3 0 2 3 1
2308	// * * * * * *
2309	// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2310	// 1 0 0 0 1 8 1 0 0 0 8 1 0 0 0 8 1 0 0 0
2311	// 2 0 0 1 0 2 0 0 1 0 1 0 0 0 1 1 0 0 0 1
2312	// 3 0 0 1 1 A 1 0 1 0 9 1 0 0 1 9 1 0 0 1
2313	// 4 0 1 0 0 4 0 1 0 0 4 0 1 0 0 2 0 0 1 0
2314	// 5 0 1 0 1 C 1 1 0 0 C 1 1 0 0 A 1 0 1 0
2315	// 6 0 1 1 0 6 0 1 1 0 5 0 1 0 1 3 0 0 1 1
2316	// 7 0 1 1 1 E 1 1 1 0 D 1 1 0 1 B 1 0 1 1
2317	// 8 1 0 0 0 1 0 0 0 1 2 0 0 1 0 4 0 1 0 0
2318	// 9 1 0 0 1 9 1 0 0 1 A 1 0 1 0 C 1 1 0 0
2319	// A 1 0 1 0 3 0 0 1 1 3 0 0 1 1 5 0 1 0 1
2320	// B 1 0 1 1 B 1 0 1 1 B 1 0 1 1 D 1 1 0 1
2321	// C 1 1 0 0 5 0 1 0 1 6 0 1 1 0 6 0 1 1 0
2322	// D 1 1 0 1 D 1 1 0 1 E 1 1 1 0 E 1 1 1 0
2323	// E 1 1 1 0 7 0 1 1 1 7 0 1 1 1 7 0 1 1 1
2324	// F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 F 1 1 1 1
2325
2326	// There is one special case that is not a perfect shuffle, but can be
2327	// turned into one easily: when the shuffle operates on a vector pair,
2328	// but the two vectors in the pair are swapped. The code that identifies
2329	// perfect shuffles will reject it, unless the order is reversed.
2330	shuffles::MaskT MaskStorage(SM.Mask);
2331	bool InvertedPair = false;
2332	if (HavePairs && SM.Mask [`0`] >= int(HwLen)) {
2333	for (int i = `0`, e = SM.Mask.size(); i != e; ++i) {
2334	int M = SM.Mask [i];
2335	MaskStorage [i] = M >= int(HwLen) ? M - HwLen : M + HwLen;
2336	}
2337	InvertedPair = true;
2338	SM = ShuffleMask (MaskStorage);
2339	}
2340
2341	auto Comps = getPerfectCompletions(SM, Width: LogLen);
2342	if (llvm::is_contained(Range&: Comps, Element: `0`))
2343	return OpRef::fail();
2344
2345	auto Pick = completeToPerfect(Completions: Comps, Width: LogLen);
2346	for (unsigned I = `0`; I != LogLen; ++I)
2347	Perm [I] = Log2_32(Value: Pick [I]);
2348
2349	// Once we have Perm, represent it as cycles. Denote the maximum log2
2350	// (equal to log2(VecLen)-1) as M. The cycle containing M can then be
2351	// written as (M a1 a2 a3 ... an). That cycle can be broken up into
2352	// simple swaps as (M a1)(M a2)(M a3)...(M an), with the composition
2353	// order being from left to right. Any (contiguous) segment where the
2354	// values ai, ai+1...aj are either all increasing or all decreasing,
2355	// can be implemented via a single vshuffvdd/vdealvdd respectively.
2356	//
2357	// If there is a cycle (a1 a2 ... an) that does not involve M, it can
2358	// be written as (M an)(a1 a2 ... an)(M a1). The first two cycles can
2359	// then be folded to get (M a1 a2 ... an)(M a1), and the above procedure
2360	// can be used to generate a sequence of vshuffvdd/vdealvdd.
2361	//
2362	// Example:
2363	// Assume M = 4 and consider a permutation (0 1)(2 3). It can be written
2364	// as (4 0 1)(4 0) composed with (4 2 3)(4 2), or simply
2365	// (4 0 1)(4 0)(4 2 3)(4 2).
2366	// It can then be expanded into swaps as
2367	// (4 0)(4 1)(4 0)(4 2)(4 3)(4 2),
2368	// and broken up into "increasing" segments as
2369	// [(4 0)(4 1)] [(4 0)(4 2)(4 3)] [(4 2)].
2370	// This is equivalent to
2371	// (4 0 1)(4 0 2 3)(4 2),
2372	// which can be implemented as 3 vshufvdd instructions.
2373
2374	using CycleType = SmallVector<unsigned, `8`>;
2375	std::set<CycleType> Cycles;
2376	std::set<unsigned> All;
2377
2378	for (unsigned I : Perm)
2379	All.insert(x: I);
2380
2381	// If the cycle contains LogLen-1, move it to the front of the cycle.
2382	// Otherwise, return the cycle unchanged.
2383	auto canonicalize = [LogLen](const CycleType &C) -> CycleType {
2384	unsigned LogPos, N = C.size();
2385	for (LogPos = `0`; LogPos != N; ++LogPos)
2386	if (C [LogPos] == LogLen - `1`)
2387	break;
2388	if (LogPos == N)
2389	return C;
2390
2391	CycleType NewC(C.begin() + LogPos, C.end());
2392	NewC.append(in_start: C.begin(), in_end: C.begin() + LogPos);
2393	return NewC;
2394	};
2395
2396	auto pfs = [](const std::set<CycleType> &Cs, unsigned Len) {
2397	// Ordering: shuff: 5 0 1 2 3 4, deal: 5 4 3 2 1 0 (for Log=6),
2398	// for bytes zero is included, for halfwords is not.
2399	if (Cs.size() != `1`)
2400	return `0u`;
2401	const CycleType &C = *Cs.begin();
2402	if (C [`0`] != Len - `1`)
2403	return `0u`;
2404	int D = Len - C.size();
2405	if (D != `0` && D != `1`)
2406	return `0u`;
2407
2408	bool IsDeal = true, IsShuff = true;
2409	for (unsigned I = `1`; I != Len - D; ++I) {
2410	if (C [I] != Len - `1` - I)
2411	IsDeal = false;
2412	if (C [I] != I - (`1` - D)) // I-1, I
2413	IsShuff = false;
2414	}
2415	// At most one, IsDeal or IsShuff, can be non-zero.
2416	assert(!(IsDeal \|\| IsShuff) \|\| IsDeal != IsShuff);
2417	static unsigned Deals[] = {Hexagon::V6_vdealb, Hexagon::V6_vdealh};
2418	static unsigned Shufs[] = {Hexagon::V6_vshuffb, Hexagon::V6_vshuffh};
2419	return IsDeal ? Deals[D] : (IsShuff ? Shufs[D] : `0`);
2420	};
2421
2422	while (!All.empty()) {
2423	unsigned A = *All.begin();
2424	All.erase(x: A);
2425	CycleType C;
2426	C.push_back(Elt: A);
2427	for (unsigned B = Perm [A]; B != A; B = Perm [B]) {
2428	C.push_back(Elt: B);
2429	All.erase(x: B);
2430	}
2431	if (C.size() <= `1`)
2432	continue;
2433	Cycles.insert(x: canonicalize (C));
2434	}
2435
2436	MVT SingleTy = getSingleVT(ElemTy: MVT::i8);
2437	MVT PairTy = getPairVT(ElemTy: MVT::i8);
2438
2439	// Recognize patterns for V6_vdeal{b,h} and V6_vshuff{b,h}.
2440	if (unsigned(VecLen) == HwLen) {
2441	if (unsigned SingleOpc = pfs (Cycles, LogLen)) {
2442	Results.push(Opc: SingleOpc, Ty: SingleTy, Ops: {Va});
2443	return OpRef::res(N: Results.top());
2444	}
2445	}
2446
2447	// From the cycles, construct the sequence of values that will
2448	// then form the control values for vdealvdd/vshuffvdd, i.e.
2449	// (M a1 a2)(M a3 a4 a5)... -> a1 a2 a3 a4 a5
2450	// This essentially strips the M value from the cycles where
2451	// it's present, and performs the insertion of M (then stripping)
2452	// for cycles without M (as described in an earlier comment).
2453	SmallVector<unsigned, `8`> SwapElems;
2454	// When the input is extended (i.e. single vector becomes a pair),
2455	// this is done by using an "undef" vector as the second input.
2456	// However, then we get
2457	// input 1: GOODBITS
2458	// input 2: ........
2459	// but we need
2460	// input 1: ....BITS
2461	// input 2: ....GOOD
2462	// Then at the end, this needs to be undone. To accomplish this,
2463	// artificially add "LogLen-1" at both ends of the sequence.
2464	if (!HavePairs)
2465	SwapElems.push_back(Elt: LogLen - `1`);
2466	for (const CycleType &C : Cycles) {
2467	// Do the transformation: (a1..an) -> (M a1..an)(M a1).
2468	unsigned First = (C [`0`] == LogLen - `1`) ? `1` : `0`;
2469	SwapElems.append(in_start: C.begin() + First, in_end: C.end());
2470	if (First == `0`)
2471	SwapElems.push_back(Elt: C [`0`]);
2472	}
2473	if (!HavePairs)
2474	SwapElems.push_back(Elt: LogLen - `1`);
2475
2476	const SDLoc &dl(Results.InpNode);
2477	OpRef Arg = HavePairs ? Va : concats(Lo: Va, Hi: OpRef::undef(Ty: SingleTy), Results);
2478	if (InvertedPair)
2479	Arg = concats(Lo: OpRef::hi(R: Arg), Hi: OpRef::lo(R: Arg), Results);
2480
2481	for (unsigned I = `0`, E = SwapElems.size(); I != E;) {
2482	bool IsInc = I == E - `1` \|\| SwapElems [I] < SwapElems [I + `1`];
2483	unsigned S = (`1u` << SwapElems [I]);
2484	if (I < E - `1`) {
2485	while (++I < E - `1` && IsInc == (SwapElems [I] < SwapElems [I + `1`]))
2486	S \|= `1u` << SwapElems [I];
2487	// The above loop will not add a bit for the final SwapElems[I+1],
2488	// so add it here.
2489	S \|= `1u` << SwapElems [I];
2490	}
2491	++I;
2492
2493	NodeTemplate Res;
2494	Results.push(Opc: Hexagon::A2_tfrsi, Ty: MVT::i32, Ops: {getConst32(Val: S, dl)});
2495	Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd;
2496	Res.Ty = PairTy;
2497	Res.Ops = {OpRef::hi(R: Arg), OpRef::lo(R: Arg), OpRef::res(N: -`1`)};
2498	Results.push(Res);
2499	Arg = OpRef::res(N: Results.top());
2500	}
2501
2502	return HavePairs ? Arg : OpRef::lo(R: Arg);
2503	}
2504
2505	OpRef HvxSelector::butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results) {
2506	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << `'\n'`;});
2507	// Butterfly shuffles.
2508	//
2509	// V6_vdelta
2510	// V6_vrdelta
2511	// V6_vror
2512
2513	// The assumption here is that all elements picked by Mask are in the
2514	// first operand to the vector_shuffle. This assumption is enforced
2515	// by the caller.
2516
2517	MVT ResTy = getSingleVT(ElemTy: MVT::i8);
2518	PermNetwork::Controls FC, RC;
2519	const SDLoc &dl(Results.InpNode);
2520	int VecLen = SM.Mask.size();
2521
2522	for (int M : SM.Mask) {
2523	if (M != -`1` && M >= VecLen)
2524	return OpRef::fail();
2525	}
2526
2527	// Try the deltas/benes for both single vectors and vector pairs.
2528	ForwardDeltaNetwork FN(SM.Mask);
2529	if (FN.run(V&: FC)) {
2530	SDValue Ctl = getVectorConstant(Data: FC, dl);
2531	Results.push(Opc: Hexagon::V6_vdelta, Ty: ResTy, Ops: {Va, OpRef (Ctl)});
2532	return OpRef::res(N: Results.top());
2533	}
2534
2535	// Try reverse delta.
2536	ReverseDeltaNetwork RN(SM.Mask);
2537	if (RN.run(V&: RC)) {
2538	SDValue Ctl = getVectorConstant(Data: RC, dl);
2539	Results.push(Opc: Hexagon::V6_vrdelta, Ty: ResTy, Ops: {Va, OpRef (Ctl)});
2540	return OpRef::res(N: Results.top());
2541	}
2542
2543	// Do Benes.
2544	BenesNetwork BN(SM.Mask);
2545	if (BN.run(F&: FC, R&: RC)) {
2546	SDValue CtlF = getVectorConstant(Data: FC, dl);
2547	SDValue CtlR = getVectorConstant(Data: RC, dl);
2548	Results.push(Opc: Hexagon::V6_vdelta, Ty: ResTy, Ops: {Va, OpRef (CtlF)});
2549	Results.push(Opc: Hexagon::V6_vrdelta, Ty: ResTy,
2550	Ops: {OpRef::res(N: -`1`), OpRef (CtlR)});
2551	return OpRef::res(N: Results.top());
2552	}
2553
2554	return OpRef::fail();
2555	}
2556
2557	SDValue HvxSelector::getConst32(int Val, const SDLoc &dl) {
2558	return DAG.getTargetConstant(Val, DL: dl, VT: MVT::i32);
2559	}
2560
2561	SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
2562	const SDLoc &dl) {
2563	SmallVector<SDValue, `128`> Elems;
2564	for (uint8_t C : Data)
2565	Elems.push_back(Elt: DAG.getConstant(Val: C, DL: dl, VT: MVT::i8));
2566	MVT VecTy = MVT::getVectorVT(VT: MVT::i8, NumElements: Data.size());
2567	SDValue BV = DAG.getBuildVector(VT: VecTy, DL: dl, Ops: Elems);
2568	SDValue LV = Lower.LowerOperation(Op: BV, DAG);
2569	DAG.RemoveDeadNode(N: BV.getNode());
2570	return DAG.getNode(Opcode: HexagonISD::ISEL, DL: dl, VT: VecTy, Operand: LV);
2571	}
2572
2573	void HvxSelector::selectExtractSubvector(SDNode *N) {
2574	SDValue Inp = N->getOperand(Num: `0`);
2575	MVT ResTy = N->getValueType(ResNo: `0`).getSimpleVT();
2576	unsigned Idx = N->getConstantOperandVal(Num: `1`);
2577
2578	[[maybe_unused]] MVT InpTy = Inp.getValueType().getSimpleVT();
2579	[[maybe_unused]] unsigned ResLen = ResTy.getVectorNumElements();
2580	assert(InpTy.getVectorElementType() == ResTy.getVectorElementType());
2581	assert(`2` * ResLen == InpTy.getVectorNumElements());
2582	assert(Idx == `0` \|\| Idx == ResLen);
2583
2584	unsigned SubReg = Idx == `0` ? Hexagon::vsub_lo : Hexagon::vsub_hi;
2585	SDValue Ext = DAG.getTargetExtractSubreg(SRIdx: SubReg, DL: SDLoc (N), VT: ResTy, Operand: Inp);
2586
2587	ISel.ReplaceNode(F: N, T: Ext.getNode());
2588	}
2589
2590	void HvxSelector::selectShuffle(SDNode *N) {
2591	DEBUG_WITH_TYPE("isel", {
2592	dbgs() << "Starting " << __func__ << " on node:\n";
2593	N->dump(&DAG);
2594	});
2595	MVT ResTy = N->getValueType(ResNo: `0`).getSimpleVT();
2596	// Assume that vector shuffles operate on vectors of bytes.
2597	assert(ResTy.isVector() && ResTy.getVectorElementType() == MVT::i8);
2598
2599	auto *SN = cast<ShuffleVectorSDNode>(Val: N);
2600	std::vector<int> Mask(SN->getMask().begin(), SN->getMask().end());
2601	// This shouldn't really be necessary. Is it?
2602	for (int &Idx : Mask)
2603	if (Idx != -`1` && Idx < `0`)
2604	Idx = -`1`;
2605
2606	unsigned VecLen = Mask.size();
2607	bool HavePairs = (`2`*HwLen == VecLen);
2608	assert(ResTy.getSizeInBits() / `8` == VecLen);
2609
2610	// Vd = vector_shuffle Va, Vb, Mask
2611	//
2612
2613	bool UseLeft = false, UseRight = false;
2614	for (unsigned I = `0`; I != VecLen; ++I) {
2615	if (Mask [I] == -`1`)
2616	continue;
2617	unsigned Idx = Mask [I];
2618	assert(Idx < `2`*VecLen);
2619	if (Idx < VecLen)
2620	UseLeft = true;
2621	else
2622	UseRight = true;
2623	}
2624
2625	DEBUG_WITH_TYPE("isel", {
2626	dbgs() << "VecLen=" << VecLen << " HwLen=" << HwLen << " UseLeft="
2627	<< UseLeft << " UseRight=" << UseRight << " HavePairs="
2628	<< HavePairs << `'\n'`;
2629	});
2630	// If the mask is all -1's, generate "undef".
2631	if (!UseLeft && !UseRight) {
2632	ISel.ReplaceNode(F: N, T: ISel.selectUndef(dl: SDLoc (SN), ResTy).getNode());
2633	return;
2634	}
2635
2636	SDValue Vec0 = N->getOperand(Num: `0`);
2637	SDValue Vec1 = N->getOperand(Num: `1`);
2638	assert(Vec0.getValueType() == ResTy && Vec1.getValueType() == ResTy);
2639
2640	ResultStack Results(SN);
2641	OpRef Va = OpRef::undef(Ty: ResTy);
2642	OpRef Vb = OpRef::undef(Ty: ResTy);
2643
2644	if (!Vec0.isUndef()) {
2645	Results.push(Opc: TargetOpcode::COPY, Ty: ResTy, Ops: {Vec0});
2646	Va = OpRef::OpRef::res(N: Results.top());
2647	}
2648	if (!Vec1.isUndef()) {
2649	Results.push(Opc: TargetOpcode::COPY, Ty: ResTy, Ops: {Vec1});
2650	Vb = OpRef::res(N: Results.top());
2651	}
2652
2653	OpRef Res = !HavePairs ? shuffs2(SM: ShuffleMask (Mask), Va, Vb, Results)
2654	: shuffp2(SM: ShuffleMask (Mask), Va, Vb, Results);
2655
2656	bool Done = Res.isValid();
2657	if (Done) {
2658	// Make sure that Res is on the stack before materializing.
2659	Results.push(Opc: TargetOpcode::COPY, Ty: ResTy, Ops: {Res});
2660	materialize(Results);
2661	} else {
2662	Done = scalarizeShuffle(Mask, dl: SDLoc (N), ResTy, Va: Vec0, Vb: Vec1, N);
2663	}
2664
2665	if (!Done) {
2666	#ifndef NDEBUG
2667	dbgs() << "Unhandled shuffle:\n";
2668	SN->dumpr(&DAG);
2669	#endif
2670	llvm_unreachable("Failed to select vector shuffle");
2671	}
2672	}
2673
2674	void HvxSelector::selectRor(SDNode *N) {
2675	// If this is a rotation by less than 8, use V6_valignbi.
2676	MVT Ty = N->getValueType(ResNo: `0`).getSimpleVT();
2677	const SDLoc &dl(N);
2678	SDValue VecV = N->getOperand(Num: `0`);
2679	SDValue RotV = N->getOperand(Num: `1`);
2680	SDNode NewN = nullptr*;
2681
2682	if (auto *CN = dyn_cast<ConstantSDNode>(Val: RotV.getNode())) {
2683	unsigned S = CN->getZExtValue() % HST.getVectorLength();
2684	if (S == `0`) {
2685	NewN = VecV.getNode();
2686	} else if (isUInt<`3`>(x: S)) {
2687	NewN = DAG.getMachineNode(Opcode: Hexagon::V6_valignbi, dl, VT: Ty,
2688	Ops: {VecV, VecV, getConst32(Val: S, dl)});
2689	}
2690	}
2691
2692	if (!NewN)
2693	NewN = DAG.getMachineNode(Opcode: Hexagon::V6_vror, dl, VT: Ty, Ops: {VecV, RotV});
2694
2695	ISel.ReplaceNode(F: N, T: NewN);
2696	}
2697
2698	void HvxSelector::selectVAlign(SDNode *N) {
2699	SDValue Vv = N->getOperand(Num: `0`);
2700	SDValue Vu = N->getOperand(Num: `1`);
2701	SDValue Rt = N->getOperand(Num: `2`);
2702	SDNode *NewN = DAG.getMachineNode(Opcode: Hexagon::V6_valignb, dl: SDLoc (N),
2703	VT: N->getValueType(ResNo: `0`), Ops: {Vv, Vu, Rt});
2704	ISel.ReplaceNode(F: N, T: NewN);
2705	DAG.RemoveDeadNode(N);
2706	}
2707
2708	void HexagonDAGToDAGISel::PreprocessHvxISelDAG() {
2709	auto getNodes = [this]() -> std::vector<SDNode *> {
2710	std::vector<SDNode *> T;
2711	T.reserve(n: CurDAG->allnodes_size());
2712	for (SDNode &N : CurDAG->allnodes())
2713	T.push_back(x: &N);
2714	return T;
2715	};
2716
2717	ppHvxShuffleOfShuffle(Nodes: getNodes ());
2718	}
2719
2720	template <> struct std::hash<SDValue> {
2721	std::size_t operator()(SDValue V) const {
2722	return std::hash<const void *>()(V.getNode()) +
2723	std::hash<unsigned>()(V.getResNo());
2724	};
2725	};
2726
2727	void HexagonDAGToDAGISel::ppHvxShuffleOfShuffle(std::vector<SDNode *> &&Nodes) {
2728	// Motivating case:
2729	// t10: v64i32 = ...
2730	// t46: v128i8 = vector_shuffle<...> t44, t45
2731	// t48: v128i8 = vector_shuffle<...> t44, t45
2732	// t42: v128i8 = vector_shuffle<...> t46, t48
2733	// t12: v32i32 = extract_subvector t10, Constant:i32<0>
2734	// t44: v128i8 = bitcast t12
2735	// t15: v32i32 = extract_subvector t10, Constant:i32<32>
2736	// t45: v128i8 = bitcast t15
2737	SelectionDAG &DAG = *CurDAG;
2738	unsigned HwLen = HST->getVectorLength();
2739
2740	struct SubVectorInfo {
2741	SubVectorInfo(SDValue S, unsigned H) : Src (S), HalfIdx(H) {}
2742	SDValue Src;
2743	unsigned HalfIdx;
2744	};
2745
2746	using MapType = std::unordered_map<SDValue, unsigned>;
2747
2748	auto getMaskElt = [&](unsigned Idx, ShuffleVectorSDNode *Shuff0,
2749	ShuffleVectorSDNode *Shuff1,
2750	const MapType &OpMap) -> int {
2751	// Treat Shuff0 and Shuff1 as operands to another vector shuffle, and
2752	// Idx as a (non-undef) element of the top level shuffle's mask, that
2753	// is, index into concat(Shuff0, Shuff1).
2754	// Assuming that Shuff0 and Shuff1 both operate on subvectors of the
2755	// same source vector (as described by OpMap), return the index of
2756	// that source vector corresponding to Idx.
2757	ShuffleVectorSDNode *OpShuff = Idx < HwLen ? Shuff0 : Shuff1;
2758	if (Idx >= HwLen)
2759	Idx -= HwLen;
2760
2761	// Get the mask index that M points at in the corresponding operand.
2762	int MaybeN = OpShuff->getMaskElt(Idx);
2763	if (MaybeN < `0`)
2764	return -`1`;
2765
2766	auto N = static_cast<unsigned>(MaybeN);
2767	unsigned SrcBase = N < HwLen ? OpMap.at(k: OpShuff->getOperand(Num: `0`))
2768	: OpMap.at(k: OpShuff->getOperand(Num: `1`));
2769	if (N >= HwLen)
2770	N -= HwLen;
2771
2772	return N + SrcBase;
2773	};
2774
2775	auto fold3 = [&](SDValue TopShuff, SDValue Inp, MapType &&OpMap) -> SDValue {
2776	// Fold all 3 shuffles into a single one.
2777	auto *This = cast<ShuffleVectorSDNode>(Val&: TopShuff);
2778	auto *S0 = cast<ShuffleVectorSDNode>(Val: TopShuff.getOperand(i: `0`));
2779	auto *S1 = cast<ShuffleVectorSDNode>(Val: TopShuff.getOperand(i: `1`));
2780	ArrayRef<int> TopMask = This->getMask();
2781	// This should be guaranteed by type checks in the caller, and the fact
2782	// that all shuffles should have been promoted to operate on MVT::i8.
2783	assert(TopMask.size() == S0->getMask().size() &&
2784	TopMask.size() == S1->getMask().size());
2785	assert(TopMask.size() == HwLen);
2786
2787	SmallVector<int, `256`> FoldedMask(`2` * HwLen);
2788	for (unsigned I = `0`; I != HwLen; ++I) {
2789	int MaybeM = TopMask [I];
2790	if (MaybeM >= `0`) {
2791	FoldedMask [I] =
2792	getMaskElt (static_cast<unsigned>(MaybeM), S0, S1, OpMap);
2793	} else {
2794	FoldedMask [I] = -`1`;
2795	}
2796	}
2797	// The second half of the result will be all-undef.
2798	std::fill(FoldedMask.begin() + HwLen, FoldedMask.end(), -`1`);
2799
2800	// Return
2801	// FoldedShuffle = (Shuffle Inp, undef, FoldedMask)
2802	// (LoHalf FoldedShuffle)
2803	const SDLoc &dl(TopShuff);
2804	MVT SingleTy = MVT::getVectorVT(VT: MVT::i8, NumElements: HwLen);
2805	MVT PairTy = MVT::getVectorVT(VT: MVT::i8, NumElements: `2` * HwLen);
2806	SDValue FoldedShuff =
2807	DAG.getVectorShuffle(VT: PairTy, dl, N1: DAG.getBitcast(VT: PairTy, V: Inp),
2808	N2: DAG.getUNDEF(VT: PairTy), Mask: FoldedMask);
2809	return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SingleTy, N1: FoldedShuff,
2810	N2: DAG.getConstant(Val: `0`, DL: dl, VT: MVT::i32));
2811	};
2812
2813	auto getSourceInfo = [](SDValue V) -> std::optional<SubVectorInfo> {
2814	while (V.getOpcode() == ISD::BITCAST)
2815	V = V.getOperand(i: `0`);
2816	if (V.getOpcode() != ISD::EXTRACT_SUBVECTOR)
2817	return std::nullopt;
2818	return SubVectorInfo(V.getOperand(i: `0`),
2819	!cast<ConstantSDNode>(Val: V.getOperand(i: `1`))->isZero());
2820	};
2821
2822	for (SDNode *N : Nodes) {
2823	if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
2824	continue;
2825	EVT ResTy = N->getValueType(ResNo: `0`);
2826	if (ResTy.getVectorElementType() != MVT::i8)
2827	continue;
2828	if (ResTy.getVectorNumElements() != HwLen)
2829	continue;
2830
2831	SDValue V0 = N->getOperand(Num: `0`);
2832	SDValue V1 = N->getOperand(Num: `1`);
2833	if (V0.getOpcode() != ISD::VECTOR_SHUFFLE)
2834	continue;
2835	if (V1.getOpcode() != ISD::VECTOR_SHUFFLE)
2836	continue;
2837	if (V0.getValueType() != ResTy \|\| V1.getValueType() != ResTy)
2838	continue;
2839
2840	// Check if all operands of the two operand shuffles are extract_subvectors
2841	// from the same vector pair.
2842	auto V0A = getSourceInfo (V0.getOperand(i: `0`));
2843	if (!V0A.has_value())
2844	continue;
2845	auto V0B = getSourceInfo (V0.getOperand(i: `1`));
2846	if (!V0B.has_value() \|\| V0B ->Src != V0A ->Src)
2847	continue;
2848	auto V1A = getSourceInfo (V1.getOperand(i: `0`));
2849	if (!V1A.has_value() \|\| V1A ->Src != V0A ->Src)
2850	continue;
2851	auto V1B = getSourceInfo (V1.getOperand(i: `1`));
2852	if (!V1B.has_value() \|\| V1B ->Src != V0A ->Src)
2853	continue;
2854
2855	// The source must be a pair. This should be guaranteed here,
2856	// but check just in case.
2857	assert(V0A->Src.getValueType().getSizeInBits() == `16` * HwLen);
2858
2859	MapType OpMap = {
2860	{V0.getOperand(i: `0`), V0A ->HalfIdx * HwLen},
2861	{V0.getOperand(i: `1`), V0B ->HalfIdx * HwLen},
2862	{V1.getOperand(i: `0`), V1A ->HalfIdx * HwLen},
2863	{V1.getOperand(i: `1`), V1B ->HalfIdx * HwLen},
2864	};
2865	SDValue NewS = fold3 (SDValue (N, `0`), V0A ->Src, std::move(OpMap));
2866	ReplaceNode(F: N, T: NewS.getNode());
2867	}
2868	}
2869
2870	void HexagonDAGToDAGISel::SelectHvxExtractSubvector(SDNode *N) {
2871	HvxSelector (*this, *CurDAG).selectExtractSubvector(N);
2872	}
2873
2874	void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) {
2875	HvxSelector (*this, *CurDAG).selectShuffle(N);
2876	}
2877
2878	void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) {
2879	HvxSelector (*this, *CurDAG).selectRor(N);
2880	}
2881
2882	void HexagonDAGToDAGISel::SelectHvxVAlign(SDNode *N) {
2883	HvxSelector (*this, *CurDAG).selectVAlign(N);
2884	}
2885
2886	void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
2887	const SDLoc &dl(N);
2888	SDValue Chain = N->getOperand(Num: `0`);
2889	SDValue Address = N->getOperand(Num: `2`);
2890	SDValue Predicate = N->getOperand(Num: `3`);
2891	SDValue Base = N->getOperand(Num: `4`);
2892	SDValue Modifier = N->getOperand(Num: `5`);
2893	SDValue Offset = N->getOperand(Num: `6`);
2894	SDValue ImmOperand = CurDAG->getTargetConstant(Val: `0`, DL: dl, VT: MVT::i32);
2895
2896	unsigned Opcode;
2897	unsigned IntNo = N->getConstantOperandVal(Num: `1`);
2898	switch (IntNo) {
2899	default:
2900	llvm_unreachable("Unexpected HVX gather intrinsic.");
2901	case Intrinsic::hexagon_V6_vgathermhq:
2902	case Intrinsic::hexagon_V6_vgathermhq_128B:
2903	Opcode = Hexagon::V6_vgathermhq_pseudo;
2904	break;
2905	case Intrinsic::hexagon_V6_vgathermwq:
2906	case Intrinsic::hexagon_V6_vgathermwq_128B:
2907	Opcode = Hexagon::V6_vgathermwq_pseudo;
2908	break;
2909	case Intrinsic::hexagon_V6_vgathermhwq:
2910	case Intrinsic::hexagon_V6_vgathermhwq_128B:
2911	Opcode = Hexagon::V6_vgathermhwq_pseudo;
2912	break;
2913	}
2914
2915	SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
2916	SDValue Ops[] = { Address, ImmOperand,
2917	Predicate, Base, Modifier, Offset, Chain };
2918	SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
2919
2920	MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(Val: N)->getMemOperand();
2921	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Result), NewMemRefs: {MemOp});
2922
2923	ReplaceNode(F: N, T: Result);
2924	}
2925
2926	void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
2927	const SDLoc &dl(N);
2928	SDValue Chain = N->getOperand(Num: `0`);
2929	SDValue Address = N->getOperand(Num: `2`);
2930	SDValue Base = N->getOperand(Num: `3`);
2931	SDValue Modifier = N->getOperand(Num: `4`);
2932	SDValue Offset = N->getOperand(Num: `5`);
2933	SDValue ImmOperand = CurDAG->getTargetConstant(Val: `0`, DL: dl, VT: MVT::i32);
2934
2935	unsigned Opcode;
2936	unsigned IntNo = N->getConstantOperandVal(Num: `1`);
2937	switch (IntNo) {
2938	default:
2939	llvm_unreachable("Unexpected HVX gather intrinsic.");
2940	case Intrinsic::hexagon_V6_vgathermh:
2941	case Intrinsic::hexagon_V6_vgathermh_128B:
2942	Opcode = Hexagon::V6_vgathermh_pseudo;
2943	break;
2944	case Intrinsic::hexagon_V6_vgathermw:
2945	case Intrinsic::hexagon_V6_vgathermw_128B:
2946	Opcode = Hexagon::V6_vgathermw_pseudo;
2947	break;
2948	case Intrinsic::hexagon_V6_vgathermhw:
2949	case Intrinsic::hexagon_V6_vgathermhw_128B:
2950	Opcode = Hexagon::V6_vgathermhw_pseudo;
2951	break;
2952	}
2953
2954	SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
2955	SDValue Ops[] = { Address, ImmOperand, Base, Modifier, Offset, Chain };
2956	SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
2957
2958	MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(Val: N)->getMemOperand();
2959	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Result), NewMemRefs: {MemOp});
2960
2961	ReplaceNode(F: N, T: Result);
2962	}
2963
2964	void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
2965	unsigned IID = N->getConstantOperandVal(Num: `0`);
2966	SDNode *Result;
2967	switch (IID) {
2968	case Intrinsic::hexagon_V6_vaddcarry: {
2969	std::array<SDValue, `3`> Ops = {
2970	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2971	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v16i32, VT2: MVT::v64i1);
2972	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vaddcarry, dl: SDLoc (N), VTs, Ops);
2973	break;
2974	}
2975	case Intrinsic::hexagon_V6_vaddcarry_128B: {
2976	std::array<SDValue, `3`> Ops = {
2977	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2978	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v32i32, VT2: MVT::v128i1);
2979	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vaddcarry, dl: SDLoc (N), VTs, Ops);
2980	break;
2981	}
2982	case Intrinsic::hexagon_V6_vsubcarry: {
2983	std::array<SDValue, `3`> Ops = {
2984	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2985	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v16i32, VT2: MVT::v64i1);
2986	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vsubcarry, dl: SDLoc (N), VTs, Ops);
2987	break;
2988	}
2989	case Intrinsic::hexagon_V6_vsubcarry_128B: {
2990	std::array<SDValue, `3`> Ops = {
2991	._M_elems: {N->getOperand(Num: `1`), N->getOperand(Num: `2`), N->getOperand(Num: `3`)}};
2992	SDVTList VTs = CurDAG->getVTList(VT1: MVT::v32i32, VT2: MVT::v128i1);
2993	Result = CurDAG->getMachineNode(Opcode: Hexagon::V6_vsubcarry, dl: SDLoc (N), VTs, Ops);
2994	break;
2995	}
2996	default:
2997	llvm_unreachable("Unexpected HVX dual output intrinsic.");
2998	}
2999	ReplaceUses(F: N, T: Result);
3000	ReplaceUses(F: SDValue (N, `0`), T: SDValue (Result, `0`));
3001	ReplaceUses(F: SDValue (N, `1`), T: SDValue (Result, `1`));
3002	CurDAG->RemoveDeadNode(N);
3003	}
3004

Browse the source code of llvm_projects/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp