//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This pass analyzes vector computations and removes unnecessary
// doubleword swaps (xxswapd instructions). This pass is performed
// only for little-endian VSX code generation.
//
// For this specific case, loads and stores of v4i32, v4f32, v2i64,
// and v2f64 vectors are inefficient. These are implemented using
// the lxvd2x and stxvd2x instructions, which invert the order of
// doublewords in a vector register. Thus code generation inserts
// an xxswapd after each such load, and prior to each such store.
//
// The extra xxswapd instructions reduce performance. The purpose
// of this pass is to reduce the number of xxswapd instructions
// required for correctness.
//
// The primary insight is that much code that operates on vectors
// does not care about the relative order of elements in a register,
// so long as the correct memory order is preserved. If we have a
// computation where all input values are provided by lxvd2x/xxswapd,
// all outputs are stored using xxswapd/stxvd2x, and all intermediate
// computations are lane-insensitive (independent of element order),
// then all the xxswapd instructions associated with the loads and
// stores may be removed without changing observable semantics.
//
// This pass uses standard equivalence class infrastructure to create
// maximal webs of computations fitting the above description. Each
// such web is then optimized by removing its unnecessary xxswapd
// instructions.
//
// There are some lane-sensitive operations for which we can still
// permit the optimization, provided we modify those operations
// accordingly. Such operations are identified as using "special
// handling" within this module.
//
//===---------------------------------------------------------------------===//
43
44#include "PPC.h"
45#include "PPCInstrInfo.h"
46#include "PPCTargetMachine.h"
47#include "llvm/ADT/DenseMap.h"
48#include "llvm/ADT/EquivalenceClasses.h"
49#include "llvm/CodeGen/MachineFunctionPass.h"
50#include "llvm/CodeGen/MachineInstrBuilder.h"
51#include "llvm/CodeGen/MachineRegisterInfo.h"
52#include "llvm/Config/llvm-config.h"
53#include "llvm/Support/Debug.h"
54#include "llvm/Support/Format.h"
55#include "llvm/Support/raw_ostream.h"
56
57using namespace llvm;
58
59#define DEBUG_TYPE "ppc-vsx-swaps"
60
61namespace {
62
// A PPCVSXSwapEntry is created for each machine instruction that
// is relevant to a vector computation, i.e., that mentions all or
// part of a vector register.
struct PPCVSXSwapEntry {
  // Pointer to the instruction.
  MachineInstr *VSEMI;

  // Unique ID (position in the swap vector); also the value inserted
  // into the equivalence classes to form webs.
  int VSEId;

  // Attributes of this node, filled in by gatherVectorInstructions()
  // and refined by the later analysis phases.
  unsigned int IsLoad : 1;            // A vector load (LVX, LXVD2X, ...).
  unsigned int IsStore : 1;           // A vector store (STVX, STXVD2X, ...).
  unsigned int IsSwap : 1;            // A doubleword swap, or a permuting
                                      // load/store that implies one.
  unsigned int MentionsPhysVR : 1;    // Mentions a physical vector register;
                                      // poisons the containing web.
  unsigned int IsSwappable : 1;       // Safe inside a lane-permuted region
                                      // (possibly with special handling).
  unsigned int MentionsPartialVR : 1; // Mentions only the FP overlap of a
                                      // vector register without special
                                      // handling; poisons the web.
  unsigned int SpecialHandling : 3;   // One of the SHValues codes describing
                                      // how the instruction must be adjusted.
  unsigned int WebRejected : 1;       // Set on a web's leader entry when the
                                      // web cannot be optimized.
  unsigned int WillRemove : 1;        // This swap has been marked for removal.
};
83
// Codes for the SpecialHandling field of a PPCVSXSwapEntry, identifying
// how a lane-sensitive but still-optimizable instruction must be rewritten
// when its web's swaps are removed (see handleSpecialSwappables()).
enum SHValues {
  SH_NONE = 0,
  SH_EXTRACT,
  SH_INSERT,
  SH_NOSWAP_LD,
  SH_NOSWAP_ST,
  SH_SPLAT,     // Adjust the source element index of a splat.
  SH_XXPERMDI,  // Swap the inputs and adjust the selector of an XXPERMDI.
  SH_COPYWIDEN  // Insert a swap after a widening SUBREG_TO_REG.
};
94
95struct PPCVSXSwapRemoval : public MachineFunctionPass {
96
97 static char ID;
98 const PPCInstrInfo *TII;
99 MachineFunction *MF;
100 MachineRegisterInfo *MRI;
101
102 // Swap entries are allocated in a vector for better performance.
103 std::vector<PPCVSXSwapEntry> SwapVector;
104
105 // A mapping is maintained between machine instructions and
106 // their swap entries. The key is the address of the MI.
107 DenseMap<MachineInstr*, int> SwapMap;
108
109 // Equivalence classes are used to gather webs of related computation.
110 // Swap entries are represented by their VSEId fields.
111 EquivalenceClasses<int> *EC;
112
113 PPCVSXSwapRemoval() : MachineFunctionPass(ID) {}
114
115private:
116 // Initialize data structures.
117 void initialize(MachineFunction &MFParm);
118
119 // Walk the machine instructions to gather vector usage information.
120 // Return true iff vector mentions are present.
121 bool gatherVectorInstructions();
122
123 // Add an entry to the swap vector and swap map.
124 int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);
125
126 // Hunt backwards through COPY and SUBREG_TO_REG chains for a
127 // source register. VecIdx indicates the swap vector entry to
128 // mark as mentioning a physical register if the search leads
129 // to one.
130 unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);
131
132 // Generate equivalence classes for related computations (webs).
133 void formWebs();
134
135 // Analyze webs and determine those that cannot be optimized.
136 void recordUnoptimizableWebs();
137
138 // Record which swap instructions can be safely removed.
139 void markSwapsForRemoval();
140
141 // Remove swaps and update other instructions requiring special
142 // handling. Return true iff any changes are made.
143 bool removeSwaps();
144
145 // Insert a swap instruction from SrcReg to DstReg at the given
146 // InsertPoint.
147 void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
148 unsigned DstReg, unsigned SrcReg);
149
150 // Update instructions requiring special handling.
151 void handleSpecialSwappables(int EntryIdx);
152
153 // Dump a description of the entries in the swap vector.
154 void dumpSwapVector();
155
156 // Return true iff the given register is in the given class.
157 bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
158 if (Register::isVirtualRegister(Reg))
159 return RC->hasSubClassEq(RC: MRI->getRegClass(Reg));
160 return RC->contains(Reg);
161 }
162
163 // Return true iff the given register is a full vector register.
164 bool isVecReg(unsigned Reg) {
165 return (isRegInClass(Reg, RC: &PPC::VSRCRegClass) ||
166 isRegInClass(Reg, RC: &PPC::VRRCRegClass));
167 }
168
169 // Return true iff the given register is a partial vector register.
170 bool isScalarVecReg(unsigned Reg) {
171 return (isRegInClass(Reg, RC: &PPC::VSFRCRegClass) ||
172 isRegInClass(Reg, RC: &PPC::VSSRCRegClass));
173 }
174
175 // Return true iff the given register mentions all or part of a
176 // vector register. Also sets Partial to true if the mention
177 // is for just the floating-point register overlap of the register.
178 bool isAnyVecReg(unsigned Reg, bool &Partial) {
179 if (isScalarVecReg(Reg))
180 Partial = true;
181 return isScalarVecReg(Reg) || isVecReg(Reg);
182 }
183
184public:
185 // Main entry point for this pass.
186 bool runOnMachineFunction(MachineFunction &MF) override {
187 if (skipFunction(F: MF.getFunction()))
188 return false;
189
190 // If we don't have VSX on the subtarget, don't do anything.
191 // Also, on Power 9 the load and store ops preserve element order and so
192 // the swaps are not required.
193 const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
194 if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps())
195 return false;
196
197 bool Changed = false;
198 initialize(MFParm&: MF);
199
200 if (gatherVectorInstructions()) {
201 formWebs();
202 recordUnoptimizableWebs();
203 markSwapsForRemoval();
204 Changed = removeSwaps();
205 }
206
207 // FIXME: See the allocation of EC in initialize().
208 delete EC;
209 return Changed;
210 }
211};
212} // end anonymous namespace
213
214// Initialize data structures for this pass. In particular, clear the
215// swap vector and allocate the equivalence class mapping before
216// processing each function.
217void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
218 MF = &MFParm;
219 MRI = &MF->getRegInfo();
220 TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
221
222 // An initial vector size of 256 appears to work well in practice.
223 // Small/medium functions with vector content tend not to incur a
224 // reallocation at this size. Three of the vector tests in
225 // projects/test-suite reallocate, which seems like a reasonable rate.
226 const int InitialVectorSize(256);
227 SwapVector.clear();
228 SwapVector.reserve(n: InitialVectorSize);
229
230 // FIXME: Currently we allocate EC each time because we don't have
231 // access to the set representation on which to call clear(). Should
232 // consider adding a clear() method to the EquivalenceClasses class.
233 EC = new EquivalenceClasses<int>;
234}
235
// Create an entry in the swap vector for each instruction that mentions
// a full vector register, recording various characteristics of the
// instructions there. Returns true iff the function contains any such
// mention, i.e., iff the rest of the pass has work to do.
bool PPCVSXSwapRemoval::gatherVectorInstructions() {
  bool RelevantFunction = false;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {

      if (MI.isDebugInstr())
        continue;

      bool RelevantInstr = false;
      bool Partial = false;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isReg())
          continue;
        Register Reg = MO.getReg();
        // All operands need to be checked because there are instructions that
        // operate on a partial register and produce a full register (such as
        // XXPERMDIs).
        if (isAnyVecReg(Reg, Partial))
          RelevantInstr = true;
      }

      if (!RelevantInstr)
        continue;

      RelevantFunction = true;

      // Create a SwapEntry initialized to zeros, then fill in the
      // instruction and ID fields before pushing it to the back
      // of the swap vector.
      PPCVSXSwapEntry SwapEntry{};
      int VecIdx = addSwapEntry(&MI, SwapEntry);

      switch(MI.getOpcode()) {
      default:
        // Unless noted otherwise, an instruction is considered
        // safe for the optimization. There are a large number of
        // such true-SIMD instructions (all vector math, logical,
        // select, compare, etc.). However, if the instruction
        // mentions a partial vector register and does not have
        // special handling defined, it is not swappable.
        if (Partial)
          SwapVector[VecIdx].MentionsPartialVR = 1;
        else
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::XXPERMDI: {
        // This is a swap if it is of the form XXPERMDI t, s, s, 2.
        // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
        // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
        // for example. We have to look through chains of COPY and
        // SUBREG_TO_REG to find the real source value for comparison.
        // If the real source value is a physical register, then mark the
        // XXPERMDI as mentioning a physical register.
        int immed = MI.getOperand(3).getImm();
        if (immed == 2) {
          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          // Identical sources make this a plain doubleword swap.
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].IsSwap = 1;
          else {
            // We can still handle these if the two registers are not
            // identical, by adjusting the form of the XXPERMDI.
            SwapVector[VecIdx].IsSwappable = 1;
            SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
          }
          // This is a doubleword splat if it is of the form
          // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
          // must look through chains of copy-likes to find the source
          // register. We turn off the marking for mention of a physical
          // register, because splatting it is safe; the optimization
          // will not swap the value in the physical register. Whether
          // or not the two input registers are identical, we can handle
          // these by adjusting the form of the XXPERMDI.
        } else if (immed == 0 || immed == 3) {

          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;

          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].MentionsPhysVR = 0;

        } else {
          // We can still handle these by adjusting the form of the XXPERMDI.
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
        }
        break;
      }
      case PPC::LVX:
        // Non-permuting loads are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsLoad = 1;
        break;
      case PPC::LXVD2X:
      case PPC::LXVW4X:
        // Permuting loads are marked as both load and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::LXSDX:
      case PPC::LXSSPX:
      case PPC::XFLOADf64:
      case PPC::XFLOADf32:
        // A load of a floating-point value into the high-order half of
        // a vector register is safe, provided that we introduce a swap
        // following the load, which will be done by the SUBREG_TO_REG
        // support. So just mark these as safe.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::STVX:
        // Non-permuting stores are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsStore = 1;
        break;
      case PPC::STXVD2X:
      case PPC::STXVW4X:
        // Permuting stores are marked as both store and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsStore = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::COPY:
        // These are fine provided they are moving between full vector
        // register classes.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        // If we have a copy from one scalar floating-point register
        // to another, we can accept this even if it is a physical
        // register. The only way this gets involved is if it feeds
        // a SUBREG_TO_REG, which is handled by introducing a swap.
        else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::SUBREG_TO_REG: {
        // These are fine provided they are moving between full vector
        // register classes. If they are moving from a scalar
        // floating-point class to a vector class, we can handle those
        // as well, provided we introduce a swap. It is generally the
        // case that we will introduce fewer swaps than we remove, but
        // (FIXME) a cost model could be used. However, introduced
        // swaps could potentially be CSEd, so this is not trivial.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        else if (isVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(1).getReg())) {
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
        }
        break;
      }
      case PPC::VSPLTB:
      case PPC::VSPLTH:
      case PPC::VSPLTW:
      case PPC::XXSPLTW:
        // Splats are lane-sensitive, but we can use special handling
        // to adjust the source lane for the splat.
        SwapVector[VecIdx].IsSwappable = 1;
        SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
        break;
      // The presence of the following lane-sensitive operations in a
      // web will kill the optimization, at least for now. For these
      // we do nothing, causing the optimization to fail.
      // FIXME: Some of these could be permitted with special handling,
      // and will be phased in as time permits.
      // FIXME: There is no simple and maintainable way to express a set
      // of opcodes having a common attribute in TableGen. Should this
      // change, this is a prime candidate to use such a mechanism.
      case PPC::INLINEASM:
      case PPC::INLINEASM_BR:
      case PPC::EXTRACT_SUBREG:
      case PPC::INSERT_SUBREG:
      case PPC::COPY_TO_REGCLASS:
      case PPC::LVEBX:
      case PPC::LVEHX:
      case PPC::LVEWX:
      case PPC::LVSL:
      case PPC::LVSR:
      case PPC::LVXL:
      case PPC::STVEBX:
      case PPC::STVEHX:
      case PPC::STVEWX:
      case PPC::STVXL:
        // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
        // by adding special handling for narrowing copies as well as
        // widening ones. However, I've experimented with this, and in
        // practice we currently do not appear to use STXSDX fed by
        // a narrowing copy from a full vector register. Since I can't
        // generate any useful test cases, I've left this alone for now.
      case PPC::STXSDX:
      case PPC::STXSSPX:
      case PPC::VCIPHER:
      case PPC::VCIPHERLAST:
      case PPC::VMRGHB:
      case PPC::VMRGHH:
      case PPC::VMRGHW:
      case PPC::VMRGLB:
      case PPC::VMRGLH:
      case PPC::VMRGLW:
      case PPC::VMULESB:
      case PPC::VMULESH:
      case PPC::VMULESW:
      case PPC::VMULEUB:
      case PPC::VMULEUH:
      case PPC::VMULEUW:
      case PPC::VMULOSB:
      case PPC::VMULOSH:
      case PPC::VMULOSW:
      case PPC::VMULOUB:
      case PPC::VMULOUH:
      case PPC::VMULOUW:
      case PPC::VNCIPHER:
      case PPC::VNCIPHERLAST:
      case PPC::VPERM:
      case PPC::VPERMXOR:
      case PPC::VPKPX:
      case PPC::VPKSHSS:
      case PPC::VPKSHUS:
      case PPC::VPKSDSS:
      case PPC::VPKSDUS:
      case PPC::VPKSWSS:
      case PPC::VPKSWUS:
      case PPC::VPKUDUM:
      case PPC::VPKUDUS:
      case PPC::VPKUHUM:
      case PPC::VPKUHUS:
      case PPC::VPKUWUM:
      case PPC::VPKUWUS:
      case PPC::VPMSUMB:
      case PPC::VPMSUMD:
      case PPC::VPMSUMH:
      case PPC::VPMSUMW:
      case PPC::VRLB:
      case PPC::VRLD:
      case PPC::VRLH:
      case PPC::VRLW:
      case PPC::VSBOX:
      case PPC::VSHASIGMAD:
      case PPC::VSHASIGMAW:
      case PPC::VSL:
      case PPC::VSLDOI:
      case PPC::VSLO:
      case PPC::VSR:
      case PPC::VSRO:
      case PPC::VSUM2SWS:
      case PPC::VSUM4SBS:
      case PPC::VSUM4SHS:
      case PPC::VSUM4UBS:
      case PPC::VSUMSWS:
      case PPC::VUPKHPX:
      case PPC::VUPKHSB:
      case PPC::VUPKHSH:
      case PPC::VUPKHSW:
      case PPC::VUPKLPX:
      case PPC::VUPKLSB:
      case PPC::VUPKLSH:
      case PPC::VUPKLSW:
      case PPC::XXMRGHW:
      case PPC::XXMRGLW:
      // XXSLDWI could be replaced by a general permute with one of three
      // permute control vectors (for shift values 1, 2, 3). However,
      // VPERM has a more restrictive register class.
      case PPC::XXSLDWI:
      case PPC::XSCVDPSPN:
      case PPC::XSCVSPDPN:
      case PPC::MTVSCR:
      case PPC::MFVSCR:
        break;
      }
    }
  }

  if (RelevantFunction) {
    LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
    LLVM_DEBUG(dumpSwapVector());
  }

  return RelevantFunction;
}
534
535// Add an entry to the swap vector and swap map, and make a
536// singleton equivalence class for the entry.
537int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
538 PPCVSXSwapEntry& SwapEntry) {
539 SwapEntry.VSEMI = MI;
540 SwapEntry.VSEId = SwapVector.size();
541 SwapVector.push_back(x: SwapEntry);
542 EC->insert(Data: SwapEntry.VSEId);
543 SwapMap[MI] = SwapEntry.VSEId;
544 return SwapEntry.VSEId;
545}
546
547// This is used to find the "true" source register for an
548// XXPERMDI instruction, since MachineCSE does not handle the
549// "copy-like" operations (Copy and SubregToReg). Returns
550// the original SrcReg unless it is the target of a copy-like
551// operation, in which case we chain backwards through all
552// such operations to the ultimate source register. If a
553// physical register is encountered, we stop the search and
554// flag the swap entry indicated by VecIdx (the original
555// XXPERMDI) as mentioning a physical register.
556unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
557 unsigned VecIdx) {
558 MachineInstr *MI = MRI->getVRegDef(Reg: SrcReg);
559 if (!MI->isCopyLike())
560 return SrcReg;
561
562 assert((MI->isCopy() || MI->isSubregToReg()) &&
563 "bad opcode for lookThruCopyLike");
564 unsigned CopySrcReg = MI->getOperand(i: 1).getReg();
565
566 if (!Register::isVirtualRegister(Reg: CopySrcReg)) {
567 if (!isScalarVecReg(Reg: CopySrcReg))
568 SwapVector[VecIdx].MentionsPhysVR = 1;
569 return CopySrcReg;
570 }
571
572 return lookThruCopyLike(SrcReg: CopySrcReg, VecIdx);
573}
574
575// Generate equivalence classes for related computations (webs) by
576// def-use relationships of virtual registers. Mention of a physical
577// register terminates the generation of equivalence classes as this
578// indicates a use of a parameter, definition of a return value, use
579// of a value returned from a call, or definition of a parameter to a
580// call. Computations with physical register mentions are flagged
581// as such so their containing webs will not be optimized.
582void PPCVSXSwapRemoval::formWebs() {
583
584 LLVM_DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
585
586 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
587
588 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
589
590 LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
591 LLVM_DEBUG(MI->dump());
592
593 // It's sufficient to walk vector uses and join them to their unique
594 // definitions. In addition, check full vector register operands
595 // for physical regs. We exclude partial-vector register operands
596 // because we can handle them if copied to a full vector.
597 for (const MachineOperand &MO : MI->operands()) {
598 if (!MO.isReg())
599 continue;
600
601 Register Reg = MO.getReg();
602 if (!isVecReg(Reg) && !isScalarVecReg(Reg))
603 continue;
604
605 if (!Reg.isVirtual()) {
606 if (!(MI->isCopy() && isScalarVecReg(Reg)))
607 SwapVector[EntryIdx].MentionsPhysVR = 1;
608 continue;
609 }
610
611 if (!MO.isUse())
612 continue;
613
614 MachineInstr* DefMI = MRI->getVRegDef(Reg);
615 assert(SwapMap.contains(DefMI) &&
616 "Inconsistency: def of vector reg not found in swap map!");
617 int DefIdx = SwapMap[DefMI];
618 (void)EC->unionSets(V1: SwapVector[DefIdx].VSEId,
619 V2: SwapVector[EntryIdx].VSEId);
620
621 LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
622 SwapVector[DefIdx].VSEId,
623 SwapVector[EntryIdx].VSEId));
624 LLVM_DEBUG(dbgs() << " Def: ");
625 LLVM_DEBUG(DefMI->dump());
626 }
627 }
628}
629
// Walk the swap vector entries looking for conditions that prevent their
// containing computations from being optimized. When such conditions are
// found, mark the representative (leader) of the computation's equivalence
// class as rejected.
void PPCVSXSwapRemoval::recordUnoptimizableWebs() {

  LLVM_DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
    int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

    // If representative is already rejected, don't waste further time.
    if (SwapVector[Repr].WebRejected)
      continue;

    // Reject webs containing mentions of physical or partial registers, or
    // containing operations that we don't know how to handle in a lane-
    // permuted region.
    if (SwapVector[EntryIdx].MentionsPhysVR ||
        SwapVector[EntryIdx].MentionsPartialVR ||
        !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {

      SwapVector[Repr].WebRejected = 1;

      LLVM_DEBUG(
          dbgs() << format("Web %d rejected for physreg, partial reg, or not "
                           "swap[pable]\n",
                           Repr));
      LLVM_DEBUG(dbgs() << "  in " << EntryIdx << ": ");
      LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
      LLVM_DEBUG(dbgs() << "\n");
    }

    // Reject webs that contain swapping loads that feed something other
    // than a swap instruction.
    else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      // Operand 0 of the load is the register it defines.
      Register DefReg = MI->getOperand(0).getReg();

      // We skip debug instructions in the analysis. (Note that debug
      // location information is still maintained by this optimization
      // because it remains on the LXVD2X and STXVD2X instructions after
      // the XXPERMDIs are removed.)
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        // The only permitted consumer of a permuting load is a pure
        // swap (not itself a load or store).
        if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
            SwapVector[UseIdx].IsStore) {

          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(dbgs() << format(
                         "Web %d rejected for load not feeding swap\n", Repr));
          LLVM_DEBUG(dbgs() << "  def " << EntryIdx << ": ");
          LLVM_DEBUG(MI->dump());
          LLVM_DEBUG(dbgs() << "  use " << UseIdx << ": ");
          LLVM_DEBUG(UseMI.dump());
          LLVM_DEBUG(dbgs() << "\n");
        }

        // It is possible that the load feeds a swap and that swap feeds a
        // store. In such a case, the code is actually trying to store a swapped
        // vector. We must reject such webs.
        if (SwapVector[UseIdx].IsSwap && !SwapVector[UseIdx].IsLoad &&
            !SwapVector[UseIdx].IsStore) {
          Register SwapDefReg = UseMI.getOperand(0).getReg();
          for (MachineInstr &UseOfUseMI :
               MRI->use_nodbg_instructions(SwapDefReg)) {
            int UseOfUseIdx = SwapMap[&UseOfUseMI];
            if (SwapVector[UseOfUseIdx].IsStore) {
              SwapVector[Repr].WebRejected = 1;
              LLVM_DEBUG(
                  dbgs() << format(
                      "Web %d rejected for load/swap feeding a store\n", Repr));
              LLVM_DEBUG(dbgs() << "  def " << EntryIdx << ": ");
              LLVM_DEBUG(MI->dump());
              LLVM_DEBUG(dbgs() << "  use " << UseIdx << ": ");
              LLVM_DEBUG(UseMI.dump());
              LLVM_DEBUG(dbgs() << "\n");
            }
          }
        }
      }

    // Reject webs that contain swapping stores that are fed by something
    // other than a swap instruction.
    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      // Operand 0 of the store is the register holding the stored value.
      Register UseReg = MI->getOperand(0).getReg();
      MachineInstr *DefMI = MRI->getVRegDef(UseReg);
      Register DefReg = DefMI->getOperand(0).getReg();
      int DefIdx = SwapMap[DefMI];

      // The store's input must be produced by a pure swap.
      if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
          SwapVector[DefIdx].IsStore) {

        SwapVector[Repr].WebRejected = 1;

        LLVM_DEBUG(dbgs() << format(
                       "Web %d rejected for store not fed by swap\n", Repr));
        LLVM_DEBUG(dbgs() << "  def " << DefIdx << ": ");
        LLVM_DEBUG(DefMI->dump());
        LLVM_DEBUG(dbgs() << "  use " << EntryIdx << ": ");
        LLVM_DEBUG(MI->dump());
        LLVM_DEBUG(dbgs() << "\n");
      }

      // Ensure all uses of the register defined by DefMI feed store
      // instructions (of the same kind as this one).
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(
              dbgs() << format(
                  "Web %d rejected for swap not feeding only stores\n", Repr));
          LLVM_DEBUG(dbgs() << "  def "
                            << " : ");
          LLVM_DEBUG(DefMI->dump());
          LLVM_DEBUG(dbgs() << "  use " << UseIdx << ": ");
          LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
          LLVM_DEBUG(dbgs() << "\n");
        }
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
  LLVM_DEBUG(dumpSwapVector());
}
762
763// Walk the swap vector entries looking for swaps fed by permuting loads
764// and swaps that feed permuting stores. If the containing computation
765// has not been marked rejected, mark each such swap for removal.
766// (Removal is delayed in case optimization has disturbed the pattern,
767// such that multiple loads feed the same swap, etc.)
768void PPCVSXSwapRemoval::markSwapsForRemoval() {
769
770 LLVM_DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
771
772 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
773
774 if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
775 int Repr = EC->getLeaderValue(V: SwapVector[EntryIdx].VSEId);
776
777 if (!SwapVector[Repr].WebRejected) {
778 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
779 Register DefReg = MI->getOperand(i: 0).getReg();
780
781 for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg: DefReg)) {
782 int UseIdx = SwapMap[&UseMI];
783 SwapVector[UseIdx].WillRemove = 1;
784
785 LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
786 LLVM_DEBUG(UseMI.dump());
787 }
788 }
789
790 } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
791 int Repr = EC->getLeaderValue(V: SwapVector[EntryIdx].VSEId);
792
793 if (!SwapVector[Repr].WebRejected) {
794 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
795 Register UseReg = MI->getOperand(i: 0).getReg();
796 MachineInstr *DefMI = MRI->getVRegDef(Reg: UseReg);
797 int DefIdx = SwapMap[DefMI];
798 SwapVector[DefIdx].WillRemove = 1;
799
800 LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
801 LLVM_DEBUG(DefMI->dump());
802 }
803
804 } else if (SwapVector[EntryIdx].IsSwappable &&
805 SwapVector[EntryIdx].SpecialHandling != 0) {
806 int Repr = EC->getLeaderValue(V: SwapVector[EntryIdx].VSEId);
807
808 if (!SwapVector[Repr].WebRejected)
809 handleSpecialSwappables(EntryIdx);
810 }
811 }
812}
813
814// Create an xxswapd instruction and insert it prior to the given point.
815// MI is used to determine basic block and debug loc information.
816// FIXME: When inserting a swap, we should check whether SrcReg is
817// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
818// then instead we should generate a copy from Reg to DstReg.
819void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
820 MachineBasicBlock::iterator InsertPoint,
821 unsigned DstReg, unsigned SrcReg) {
822 BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
823 MCID: TII->get(Opcode: PPC::XXPERMDI), DestReg: DstReg)
824 .addReg(RegNo: SrcReg)
825 .addReg(RegNo: SrcReg)
826 .addImm(Val: 2);
827}
828
// The identified swap entry requires special handling to allow its
// containing computation to be optimized. Perform that handling
// here.
// EntryIdx indexes SwapVector; the entry's SpecialHandling code
// selects which rewrite is applied to its instruction (VSEMI).
// FIXME: Additional opportunities will be phased in with subsequent
// patches.
void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
  switch (SwapVector[EntryIdx].SpecialHandling) {

  default:
    llvm_unreachable("Unexpected special handling type");

  // For splats based on an index into a vector, add N/2 modulo N
  // to the index, where N is the number of vector elements.
  case SHValues::SH_SPLAT: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
    unsigned NElts;

    LLVM_DEBUG(dbgs() << "Changing splat: ");
    LLVM_DEBUG(MI->dump());

    // The element count gives the modulus for the lane rotation below.
    switch (MI->getOpcode()) {
    default:
      llvm_unreachable("Unexpected splat opcode");
    case PPC::VSPLTB: NElts = 16; break;
    case PPC::VSPLTH: NElts = 8; break;
    case PPC::VSPLTW:
    case PPC::XXSPLTW: NElts = 4; break;
    }

    // XXSPLTW carries its element number in operand 2; the VSPLT*
    // forms carry it in operand 1.
    unsigned EltNo;
    if (MI->getOpcode() == PPC::XXSPLTW)
      EltNo = MI->getOperand(i: 2).getImm();
    else
      EltNo = MI->getOperand(i: 1).getImm();

    // Rotate the element index by half the vector length, compensating
    // for the doubleword swap that will no longer be performed.
    EltNo = (EltNo + NElts / 2) % NElts;
    if (MI->getOpcode() == PPC::XXSPLTW)
      MI->getOperand(i: 2).setImm(EltNo);
    else
      MI->getOperand(i: 1).setImm(EltNo);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For an XXPERMDI that isn't handled otherwise, we need to
  // reverse the order of the operands. If the selector operand
  // has a value of 0 or 3, we need to change it to 3 or 0,
  // respectively. Otherwise we should leave it alone. (This
  // is equivalent to reversing the two bits of the selector
  // operand and complementing the result.)
  case SHValues::SH_XXPERMDI: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
    LLVM_DEBUG(MI->dump());

    // Selector values 1 and 2 map to themselves under this transform,
    // so only 0 and 3 need adjusting.
    unsigned Selector = MI->getOperand(i: 3).getImm();
    if (Selector == 0 || Selector == 3)
      Selector = 3 - Selector;
    MI->getOperand(i: 3).setImm(Selector);

    // Exchange the two source registers.
    Register Reg1 = MI->getOperand(i: 1).getReg();
    Register Reg2 = MI->getOperand(i: 2).getReg();
    MI->getOperand(i: 1).setReg(Reg2);
    MI->getOperand(i: 2).setReg(Reg1);

    // We also need to swap kill flag associated with the register.
    // setReg() above does not move the operands' flags, so after the
    // register exchange each kill flag still sits with the wrong
    // register; exchange the flags so each follows its register.
    bool IsKill1 = MI->getOperand(i: 1).isKill();
    bool IsKill2 = MI->getOperand(i: 2).isKill();
    MI->getOperand(i: 1).setIsKill(IsKill2);
    MI->getOperand(i: 2).setIsKill(IsKill1);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For a copy from a scalar floating-point register to a vector
  // register, removing swaps will leave the copied value in the
  // wrong lane. Insert a swap following the copy to fix this.
  case SHValues::SH_COPYWIDEN: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
    LLVM_DEBUG(MI->dump());

    // Redirect the SUBREG_TO_REG to a fresh vreg; the swap inserted
    // below produces the original destination from that vreg.
    Register DstReg = MI->getOperand(i: 0).getReg();
    const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg);
    Register NewVReg = MRI->createVirtualRegister(RegClass: DstRC);

    MI->getOperand(i: 0).setReg(NewVReg);
    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());

    // New instructions go immediately after MI.
    auto InsertPoint = ++MachineBasicBlock::iterator(MI);

    // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
    // is copying to a VRRC, we need to be careful to avoid a register
    // assignment problem. In this case we must copy from VRRC to VSRC
    // prior to the swap, and from VSRC to VRRC following the swap.
    // Coalescing will usually remove all this mess.
    if (DstRC == &PPC::VRRCRegClass) {
      Register VSRCTmp1 = MRI->createVirtualRegister(RegClass: &PPC::VSRCRegClass);
      Register VSRCTmp2 = MRI->createVirtualRegister(RegClass: &PPC::VSRCRegClass);

      // VRRC -> VSRC copy before the swap.
      BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
              MCID: TII->get(Opcode: PPC::COPY), DestReg: VSRCTmp1)
          .addReg(RegNo: NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      insertSwap(MI, InsertPoint, DstReg: VSRCTmp2, SrcReg: VSRCTmp1);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      // VSRC -> VRRC copy after the swap, restoring the expected class.
      BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
              MCID: TII->get(Opcode: PPC::COPY), DestReg: DstReg)
          .addReg(RegNo: VSRCTmp2);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

    } else {
      // Destination is already VSX-capable; a single swap suffices.
      insertSwap(MI, InsertPoint, DstReg, SrcReg: NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());
    }
    break;
  }
  }
}
957
958// Walk the swap vector and replace each entry marked for removal with
959// a copy operation.
960bool PPCVSXSwapRemoval::removeSwaps() {
961
962 LLVM_DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
963
964 bool Changed = false;
965
966 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
967 if (SwapVector[EntryIdx].WillRemove) {
968 Changed = true;
969 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
970 MachineBasicBlock *MBB = MI->getParent();
971 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
972 DestReg: MI->getOperand(i: 0).getReg())
973 .add(MO: MI->getOperand(i: 1));
974
975 LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
976 SwapVector[EntryIdx].VSEId));
977 LLVM_DEBUG(MI->dump());
978
979 MI->eraseFromParent();
980 }
981 }
982
983 return Changed;
984}
985
986#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
987// For debug purposes, dump the contents of the swap vector.
988LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
989
990 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
991
992 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
993 int ID = SwapVector[EntryIdx].VSEId;
994
995 dbgs() << format("%6d", ID);
996 dbgs() << format("%6d", EC->getLeaderValue(ID));
997 dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
998 dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());
999
1000 if (SwapVector[EntryIdx].IsLoad)
1001 dbgs() << "load ";
1002 if (SwapVector[EntryIdx].IsStore)
1003 dbgs() << "store ";
1004 if (SwapVector[EntryIdx].IsSwap)
1005 dbgs() << "swap ";
1006 if (SwapVector[EntryIdx].MentionsPhysVR)
1007 dbgs() << "physreg ";
1008 if (SwapVector[EntryIdx].MentionsPartialVR)
1009 dbgs() << "partialreg ";
1010
1011 if (SwapVector[EntryIdx].IsSwappable) {
1012 dbgs() << "swappable ";
1013 switch(SwapVector[EntryIdx].SpecialHandling) {
1014 default:
1015 dbgs() << "special:**unknown**";
1016 break;
1017 case SH_NONE:
1018 break;
1019 case SH_EXTRACT:
1020 dbgs() << "special:extract ";
1021 break;
1022 case SH_INSERT:
1023 dbgs() << "special:insert ";
1024 break;
1025 case SH_NOSWAP_LD:
1026 dbgs() << "special:load ";
1027 break;
1028 case SH_NOSWAP_ST:
1029 dbgs() << "special:store ";
1030 break;
1031 case SH_SPLAT:
1032 dbgs() << "special:splat ";
1033 break;
1034 case SH_XXPERMDI:
1035 dbgs() << "special:xxpermdi ";
1036 break;
1037 case SH_COPYWIDEN:
1038 dbgs() << "special:copywiden ";
1039 break;
1040 }
1041 }
1042
1043 if (SwapVector[EntryIdx].WebRejected)
1044 dbgs() << "rejected ";
1045 if (SwapVector[EntryIdx].WillRemove)
1046 dbgs() << "remove ";
1047
1048 dbgs() << "\n";
1049
1050 // For no-asserts builds.
1051 (void)MI;
1052 (void)ID;
1053 }
1054
1055 dbgs() << "\n";
1056}
1057#endif
1058
// Register the pass with the legacy pass manager.  The two trailing
// 'false' arguments mark the pass as neither CFG-only nor an analysis
// — presumably per the INITIALIZE_PASS macro convention; confirm
// against llvm/PassSupport.h.
INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
                      "PowerPC VSX Swap Removal", false, false)
INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
                    "PowerPC VSX Swap Removal", false, false)

// Unique address used as the pass's identity by the legacy pass
// infrastructure; the value itself is irrelevant.
char PPCVSXSwapRemoval::ID = 0;

// Factory entry point used by the PPC target to add this pass to the
// codegen pipeline.
FunctionPass*
llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); }
1067