//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This pass analyzes vector computations and removes unnecessary
// doubleword swaps (xxswapd instructions). This pass is performed
// only for little-endian VSX code generation.
//
// For this specific case, loads and stores of v4i32, v4f32, v2i64,
// and v2f64 vectors are inefficient. These are implemented using
// the lxvd2x and stxvd2x instructions, which invert the order of
// doublewords in a vector register. Thus code generation inserts
// an xxswapd after each such load, and prior to each such store.
//
// The extra xxswapd instructions reduce performance. The purpose
// of this pass is to reduce the number of xxswapd instructions
// required for correctness.
//
// The primary insight is that much code that operates on vectors
// does not care about the relative order of elements in a register,
// so long as the correct memory order is preserved. If we have a
// computation where all input values are provided by lxvd2x/xxswapd,
// all outputs are stored using xxswapd/stxvd2x, and all intermediate
// computations are lane-insensitive (independent of element order),
// then all the xxswapd instructions associated with the loads and
// stores may be removed without changing observable semantics.
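//
// As an illustrative sketch (the register numbers and the use of
// XVADDDP are hypothetical, not taken from any particular test case),
// a web such as
//
//   %v0 = LXVD2X ...               ; load arrives doubleword-swapped
//   %v1 = XXPERMDI %v0, %v0, 2     ; xxswapd following the load
//   %v2 = XVADDDP %v1, %v1         ; lane-insensitive arithmetic
//   %v3 = XXPERMDI %v2, %v2, 2     ; xxswapd prior to the store
//   STXVD2X %v3, ...               ; store swaps the doublewords back
//
// can have both XXPERMDIs removed, since the addition produces the
// same values whether or not the doublewords are swapped.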
//
// This pass uses standard equivalence class infrastructure to create
// maximal webs of computations fitting the above description. Each
// such web is then optimized by removing its unnecessary xxswapd
// instructions.
//
// There are some lane-sensitive operations for which we can still
// permit the optimization, provided we modify those operations
// accordingly. Such operations are identified as using "special
// handling" within this module.
//
//===---------------------------------------------------------------------===//

#include "PPC.h"
#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "ppc-vsx-swaps"

namespace {

// A PPCVSXSwapEntry is created for each machine instruction that
// is relevant to a vector computation.
struct PPCVSXSwapEntry {
  // Pointer to the instruction.
  MachineInstr *VSEMI;

  // Unique ID (position in the swap vector).
  int VSEId;

  // Attributes of this node.
  unsigned int IsLoad : 1;
  unsigned int IsStore : 1;
  unsigned int IsSwap : 1;
  unsigned int MentionsPhysVR : 1;
  unsigned int IsSwappable : 1;
  unsigned int MentionsPartialVR : 1;
  unsigned int SpecialHandling : 3;
  unsigned int WebRejected : 1;
  unsigned int WillRemove : 1;
};

enum SHValues {
  SH_NONE = 0,
  SH_EXTRACT,
  SH_INSERT,
  SH_NOSWAP_LD,
  SH_NOSWAP_ST,
  SH_SPLAT,
  SH_XXPERMDI,
  SH_COPYWIDEN
};

struct PPCVSXSwapRemoval : public MachineFunctionPass {

  static char ID;
  const PPCInstrInfo *TII;
  MachineFunction *MF;
  MachineRegisterInfo *MRI;

  // Swap entries are allocated in a vector for better performance.
  std::vector<PPCVSXSwapEntry> SwapVector;

  // A mapping is maintained between machine instructions and
  // their swap entries. The key is the address of the MI.
  DenseMap<MachineInstr*, int> SwapMap;

  // Equivalence classes are used to gather webs of related computation.
  // Swap entries are represented by their VSEId fields.
  EquivalenceClasses<int> *EC;

  PPCVSXSwapRemoval() : MachineFunctionPass(ID) {}

private:
  // Initialize data structures.
  void initialize(MachineFunction &MFParm);

  // Walk the machine instructions to gather vector usage information.
  // Return true iff vector mentions are present.
  bool gatherVectorInstructions();

  // Add an entry to the swap vector and swap map.
  int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);

  // Hunt backwards through COPY and SUBREG_TO_REG chains for a
  // source register. VecIdx indicates the swap vector entry to
  // mark as mentioning a physical register if the search leads
  // to one.
  unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);

  // Generate equivalence classes for related computations (webs).
  void formWebs();

  // Analyze webs and determine those that cannot be optimized.
  void recordUnoptimizableWebs();

  // Record which swap instructions can be safely removed.
  void markSwapsForRemoval();

  // Remove swaps and update other instructions requiring special
  // handling. Return true iff any changes are made.
  bool removeSwaps();

  // Insert a swap instruction from SrcReg to DstReg at the given
  // InsertPoint.
  void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
                  unsigned DstReg, unsigned SrcReg);

  // Update instructions requiring special handling.
  void handleSpecialSwappables(int EntryIdx);

  // Dump a description of the entries in the swap vector.
  void dumpSwapVector();

  // Return true iff the given register is in the given class.
  bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
    if (Register::isVirtualRegister(Reg))
      return RC->hasSubClassEq(MRI->getRegClass(Reg));
    return RC->contains(Reg);
  }

  // Return true iff the given register is a full vector register.
  bool isVecReg(unsigned Reg) {
    return (isRegInClass(Reg, &PPC::VSRCRegClass) ||
            isRegInClass(Reg, &PPC::VRRCRegClass));
  }

  // Return true iff the given register is a partial vector register.
  bool isScalarVecReg(unsigned Reg) {
    return (isRegInClass(Reg, &PPC::VSFRCRegClass) ||
            isRegInClass(Reg, &PPC::VSSRCRegClass));
  }

  // Return true iff the given register mentions all or part of a
  // vector register. Also sets Partial to true if the mention
  // is for just the floating-point register overlap of the register.
  bool isAnyVecReg(unsigned Reg, bool &Partial) {
    if (isScalarVecReg(Reg))
      Partial = true;
    return isScalarVecReg(Reg) || isVecReg(Reg);
  }

public:
  // Main entry point for this pass.
  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;

    // If we don't have VSX on the subtarget, don't do anything.
    // Also, on Power 9 the load and store ops preserve element order and so
    // the swaps are not required.
    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
    if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps())
      return false;

    bool Changed = false;
    initialize(MF);

    if (gatherVectorInstructions()) {
      formWebs();
      recordUnoptimizableWebs();
      markSwapsForRemoval();
      Changed = removeSwaps();
    }

    // FIXME: See the allocation of EC in initialize().
    delete EC;
    return Changed;
  }
};
} // end anonymous namespace

// Initialize data structures for this pass. In particular, clear the
// swap vector and allocate the equivalence class mapping before
// processing each function.
void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
  MF = &MFParm;
  MRI = &MF->getRegInfo();
  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();

  // An initial vector size of 256 appears to work well in practice.
  // Small/medium functions with vector content tend not to incur a
  // reallocation at this size. Three of the vector tests in
  // projects/test-suite reallocate, which seems like a reasonable rate.
  const int InitialVectorSize(256);
  SwapVector.clear();
  SwapVector.reserve(InitialVectorSize);

  // FIXME: Currently we allocate EC each time because we don't have
  // access to the set representation on which to call clear(). Should
  // consider adding a clear() method to the EquivalenceClasses class.
  EC = new EquivalenceClasses<int>;
}

// Create an entry in the swap vector for each instruction that mentions
// a full vector register, recording various characteristics of the
// instructions there.
bool PPCVSXSwapRemoval::gatherVectorInstructions() {
  bool RelevantFunction = false;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {

      if (MI.isDebugInstr())
        continue;

      bool RelevantInstr = false;
      bool Partial = false;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isReg())
          continue;
        Register Reg = MO.getReg();
        // All operands need to be checked because there are instructions that
        // operate on a partial register and produce a full register (such as
        // XXPERMDIs).
        if (isAnyVecReg(Reg, Partial))
          RelevantInstr = true;
      }

      if (!RelevantInstr)
        continue;

      RelevantFunction = true;

      // Create a SwapEntry initialized to zeros, then fill in the
      // instruction and ID fields before pushing it to the back
      // of the swap vector.
      PPCVSXSwapEntry SwapEntry{};
      int VecIdx = addSwapEntry(&MI, SwapEntry);

      switch(MI.getOpcode()) {
      default:
        // Unless noted otherwise, an instruction is considered
        // safe for the optimization. There are a large number of
        // such true-SIMD instructions (all vector math, logical,
        // select, compare, etc.). However, if the instruction
        // mentions a partial vector register and does not have
        // special handling defined, it is not swappable.
        if (Partial)
          SwapVector[VecIdx].MentionsPartialVR = 1;
        else
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::XXPERMDI: {
        // This is a swap if it is of the form XXPERMDI t, s, s, 2.
        // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
        // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
        // for example. We have to look through chains of COPY and
        // SUBREG_TO_REG to find the real source value for comparison.
        // If the real source value is a physical register, then mark the
        // XXPERMDI as mentioning a physical register.
        int immed = MI.getOperand(3).getImm();
        if (immed == 2) {
          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].IsSwap = 1;
          else {
            // We can still handle these if the two registers are not
            // identical, by adjusting the form of the XXPERMDI.
            SwapVector[VecIdx].IsSwappable = 1;
            SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
          }
        // This is a doubleword splat if it is of the form
        // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
        // must look through chains of copy-likes to find the source
        // register. We turn off the marking for mention of a physical
        // register, because splatting it is safe; the optimization
        // will not swap the value in the physical register. Whether
        // or not the two input registers are identical, we can handle
        // these by adjusting the form of the XXPERMDI.
        } else if (immed == 0 || immed == 3) {

          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;

          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].MentionsPhysVR = 0;

        } else {
          // We can still handle these by adjusting the form of the XXPERMDI.
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
        }
        break;
      }
      case PPC::LVX:
        // Non-permuting loads are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsLoad = 1;
        break;
      case PPC::LXVD2X:
      case PPC::LXVW4X:
        // Permuting loads are marked as both load and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::LXSDX:
      case PPC::LXSSPX:
      case PPC::XFLOADf64:
      case PPC::XFLOADf32:
        // A load of a floating-point value into the high-order half of
        // a vector register is safe, provided that we introduce a swap
        // following the load, which will be done by the SUBREG_TO_REG
        // support. So just mark these as safe.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::STVX:
        // Non-permuting stores are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsStore = 1;
        break;
      case PPC::STXVD2X:
      case PPC::STXVW4X:
        // Permuting stores are marked as both store and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsStore = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::COPY:
        // These are fine provided they are moving between full vector
        // register classes.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        // If we have a copy from one scalar floating-point register
        // to another, we can accept this even if it is a physical
        // register. The only way this gets involved is if it feeds
        // a SUBREG_TO_REG, which is handled by introducing a swap.
        else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::SUBREG_TO_REG: {
        // These are fine provided they are moving between full vector
        // register classes. If they are moving from a scalar
        // floating-point class to a vector class, we can handle those
        // as well, provided we introduce a swap. It is generally the
        // case that we will introduce fewer swaps than we remove, but
        // (FIXME) a cost model could be used. However, introduced
        // swaps could potentially be CSEd, so this is not trivial.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(2).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        else if (isVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(2).getReg())) {
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
        }
        break;
      }
      case PPC::VSPLTB:
      case PPC::VSPLTH:
      case PPC::VSPLTW:
      case PPC::XXSPLTW:
        // Splats are lane-sensitive, but we can use special handling
        // to adjust the source lane for the splat.
        SwapVector[VecIdx].IsSwappable = 1;
        SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
        break;
      // The presence of the following lane-sensitive operations in a
      // web will kill the optimization, at least for now. For these
      // we do nothing, causing the optimization to fail.
      // FIXME: Some of these could be permitted with special handling,
      // and will be phased in as time permits.
      // FIXME: There is no simple and maintainable way to express a set
      // of opcodes having a common attribute in TableGen. Should this
      // change, this is a prime candidate to use such a mechanism.
      case PPC::INLINEASM:
      case PPC::INLINEASM_BR:
      case PPC::EXTRACT_SUBREG:
      case PPC::INSERT_SUBREG:
      case PPC::COPY_TO_REGCLASS:
      case PPC::LVEBX:
      case PPC::LVEHX:
      case PPC::LVEWX:
      case PPC::LVSL:
      case PPC::LVSR:
      case PPC::LVXL:
      case PPC::STVEBX:
      case PPC::STVEHX:
      case PPC::STVEWX:
      case PPC::STVXL:
        // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
        // by adding special handling for narrowing copies as well as
        // widening ones. However, I've experimented with this, and in
        // practice we currently do not appear to use STXSDX fed by
        // a narrowing copy from a full vector register. Since I can't
        // generate any useful test cases, I've left this alone for now.
      case PPC::STXSDX:
      case PPC::STXSSPX:
      case PPC::VCIPHER:
      case PPC::VCIPHERLAST:
      case PPC::VMRGHB:
      case PPC::VMRGHH:
      case PPC::VMRGHW:
      case PPC::VMRGLB:
      case PPC::VMRGLH:
      case PPC::VMRGLW:
      case PPC::VMULESB:
      case PPC::VMULESH:
      case PPC::VMULESW:
      case PPC::VMULEUB:
      case PPC::VMULEUH:
      case PPC::VMULEUW:
      case PPC::VMULOSB:
      case PPC::VMULOSH:
      case PPC::VMULOSW:
      case PPC::VMULOUB:
      case PPC::VMULOUH:
      case PPC::VMULOUW:
      case PPC::VNCIPHER:
      case PPC::VNCIPHERLAST:
      case PPC::VPERM:
      case PPC::VPERMXOR:
      case PPC::VPKPX:
      case PPC::VPKSHSS:
      case PPC::VPKSHUS:
      case PPC::VPKSDSS:
      case PPC::VPKSDUS:
      case PPC::VPKSWSS:
      case PPC::VPKSWUS:
      case PPC::VPKUDUM:
      case PPC::VPKUDUS:
      case PPC::VPKUHUM:
      case PPC::VPKUHUS:
      case PPC::VPKUWUM:
      case PPC::VPKUWUS:
      case PPC::VPMSUMB:
      case PPC::VPMSUMD:
      case PPC::VPMSUMH:
      case PPC::VPMSUMW:
      case PPC::VRLB:
      case PPC::VRLD:
      case PPC::VRLH:
      case PPC::VRLW:
      case PPC::VSBOX:
      case PPC::VSHASIGMAD:
      case PPC::VSHASIGMAW:
      case PPC::VSL:
      case PPC::VSLDOI:
      case PPC::VSLO:
      case PPC::VSR:
      case PPC::VSRO:
      case PPC::VSUM2SWS:
      case PPC::VSUM4SBS:
      case PPC::VSUM4SHS:
      case PPC::VSUM4UBS:
      case PPC::VSUMSWS:
      case PPC::VUPKHPX:
      case PPC::VUPKHSB:
      case PPC::VUPKHSH:
      case PPC::VUPKHSW:
      case PPC::VUPKLPX:
      case PPC::VUPKLSB:
      case PPC::VUPKLSH:
      case PPC::VUPKLSW:
      case PPC::XXMRGHW:
      case PPC::XXMRGLW:
      // XXSLDWI could be replaced by a general permute with one of three
      // permute control vectors (for shift values 1, 2, 3). However,
      // VPERM has a more restrictive register class.
      case PPC::XXSLDWI:
      case PPC::XSCVDPSPN:
      case PPC::XSCVSPDPN:
      case PPC::MTVSCR:
      case PPC::MFVSCR:
        break;
      }
    }
  }

  if (RelevantFunction) {
    LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
    LLVM_DEBUG(dumpSwapVector());
  }

  return RelevantFunction;
}

// Add an entry to the swap vector and swap map, and make a
// singleton equivalence class for the entry.
int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
                                    PPCVSXSwapEntry& SwapEntry) {
  SwapEntry.VSEMI = MI;
  SwapEntry.VSEId = SwapVector.size();
  SwapVector.push_back(SwapEntry);
  EC->insert(SwapEntry.VSEId);
  SwapMap[MI] = SwapEntry.VSEId;
  return SwapEntry.VSEId;
}

// This is used to find the "true" source register for an
// XXPERMDI instruction, since MachineCSE does not handle the
// "copy-like" operations (Copy and SubregToReg). Returns
// the original SrcReg unless it is the target of a copy-like
// operation, in which case we chain backwards through all
// such operations to the ultimate source register. If a
// physical register is encountered, we stop the search and
// flag the swap entry indicated by VecIdx (the original
// XXPERMDI) as mentioning a physical register.
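// For example, given the hypothetical fragment (register numbers are
// illustrative only):
//   %1 = SUBREG_TO_REG 1, %0, %subreg.sub_64
//   %2 = XXPERMDI %1, %1, 2
// a search starting at %1 looks through the SUBREG_TO_REG and returns
// %0, assuming %0 is not itself defined by a copy-like instruction.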
unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
                                             unsigned VecIdx) {
  MachineInstr *MI = MRI->getVRegDef(SrcReg);
  if (!MI->isCopyLike())
    return SrcReg;

  unsigned CopySrcReg;
  if (MI->isCopy())
    CopySrcReg = MI->getOperand(1).getReg();
  else {
    assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
    CopySrcReg = MI->getOperand(2).getReg();
  }

  if (!Register::isVirtualRegister(CopySrcReg)) {
    if (!isScalarVecReg(CopySrcReg))
      SwapVector[VecIdx].MentionsPhysVR = 1;
    return CopySrcReg;
  }

  return lookThruCopyLike(CopySrcReg, VecIdx);
}

// Generate equivalence classes for related computations (webs) by
// def-use relationships of virtual registers. Mention of a physical
// register terminates the generation of equivalence classes as this
// indicates a use of a parameter, definition of a return value, use
// of a value returned from a call, or definition of a parameter to a
// call. Computations with physical register mentions are flagged
// as such so their containing webs will not be optimized.
void PPCVSXSwapRemoval::formWebs() {

  LLVM_DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {

    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
    LLVM_DEBUG(MI->dump());

    // It's sufficient to walk vector uses and join them to their unique
    // definitions. In addition, check full vector register operands
    // for physical regs. We exclude partial-vector register operands
    // because we can handle them if copied to a full vector.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;

      Register Reg = MO.getReg();
      if (!isVecReg(Reg) && !isScalarVecReg(Reg))
        continue;

      if (!Reg.isVirtual()) {
        if (!(MI->isCopy() && isScalarVecReg(Reg)))
          SwapVector[EntryIdx].MentionsPhysVR = 1;
        continue;
      }

      if (!MO.isUse())
        continue;

      MachineInstr* DefMI = MRI->getVRegDef(Reg);
      assert(SwapMap.contains(DefMI) &&
             "Inconsistency: def of vector reg not found in swap map!");
      int DefIdx = SwapMap[DefMI];
      (void)EC->unionSets(SwapVector[DefIdx].VSEId,
                          SwapVector[EntryIdx].VSEId);

      LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
                                  SwapVector[DefIdx].VSEId,
                                  SwapVector[EntryIdx].VSEId));
      LLVM_DEBUG(dbgs() << " Def: ");
      LLVM_DEBUG(DefMI->dump());
    }
  }
}

// Walk the swap vector entries looking for conditions that prevent their
// containing computations from being optimized. When such conditions are
// found, mark the representative of the computation's equivalence class
// as rejected.
void PPCVSXSwapRemoval::recordUnoptimizableWebs() {

  LLVM_DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
    int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

    // If representative is already rejected, don't waste further time.
    if (SwapVector[Repr].WebRejected)
      continue;

    // Reject webs containing mentions of physical or partial registers, or
    // containing operations that we don't know how to handle in a lane-
    // permuted region.
    if (SwapVector[EntryIdx].MentionsPhysVR ||
        SwapVector[EntryIdx].MentionsPartialVR ||
        !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {

      SwapVector[Repr].WebRejected = 1;

      LLVM_DEBUG(
          dbgs() << format("Web %d rejected for physreg, partial reg, or not "
                           "swap[pable]\n",
                           Repr));
      LLVM_DEBUG(dbgs() << " in " << EntryIdx << ": ");
      LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
      LLVM_DEBUG(dbgs() << "\n");
    }

    // Reject webs that contain swapping loads that feed something other
    // than a swap instruction.
    else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      Register DefReg = MI->getOperand(0).getReg();

      // We skip debug instructions in the analysis. (Note that debug
      // location information is still maintained by this optimization
      // because it remains on the LXVD2X and STXVD2X instructions after
      // the XXPERMDIs are removed.)
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
            SwapVector[UseIdx].IsStore) {

          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(dbgs() << format(
                         "Web %d rejected for load not feeding swap\n", Repr));
          LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
          LLVM_DEBUG(MI->dump());
          LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
          LLVM_DEBUG(UseMI.dump());
          LLVM_DEBUG(dbgs() << "\n");
        }

        // It is possible that the load feeds a swap and that swap feeds a
        // store. In such a case, the code is actually trying to store a
        // swapped vector. We must reject such webs.
        if (SwapVector[UseIdx].IsSwap && !SwapVector[UseIdx].IsLoad &&
            !SwapVector[UseIdx].IsStore) {
          Register SwapDefReg = UseMI.getOperand(0).getReg();
          for (MachineInstr &UseOfUseMI :
               MRI->use_nodbg_instructions(SwapDefReg)) {
            int UseOfUseIdx = SwapMap[&UseOfUseMI];
            if (SwapVector[UseOfUseIdx].IsStore) {
              SwapVector[Repr].WebRejected = 1;
              LLVM_DEBUG(
                  dbgs() << format(
                      "Web %d rejected for load/swap feeding a store\n", Repr));
              LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
              LLVM_DEBUG(MI->dump());
              LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
              LLVM_DEBUG(UseMI.dump());
              LLVM_DEBUG(dbgs() << "\n");
            }
          }
        }
      }

    // Reject webs that contain swapping stores that are fed by something
    // other than a swap instruction.
    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      Register UseReg = MI->getOperand(0).getReg();
      MachineInstr *DefMI = MRI->getVRegDef(UseReg);
      Register DefReg = DefMI->getOperand(0).getReg();
      int DefIdx = SwapMap[DefMI];

      if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
          SwapVector[DefIdx].IsStore) {

        SwapVector[Repr].WebRejected = 1;

        LLVM_DEBUG(dbgs() << format(
                       "Web %d rejected for store not fed by swap\n", Repr));
        LLVM_DEBUG(dbgs() << " def " << DefIdx << ": ");
        LLVM_DEBUG(DefMI->dump());
        LLVM_DEBUG(dbgs() << " use " << EntryIdx << ": ");
        LLVM_DEBUG(MI->dump());
        LLVM_DEBUG(dbgs() << "\n");
      }

      // Ensure that all uses of the register defined by DefMI feed store
      // instructions.
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(
              dbgs() << format(
                  "Web %d rejected for swap not feeding only stores\n", Repr));
          LLVM_DEBUG(dbgs() << " def "
                            << " : ");
          LLVM_DEBUG(DefMI->dump());
          LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
          LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
          LLVM_DEBUG(dbgs() << "\n");
        }
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
  LLVM_DEBUG(dumpSwapVector());
}

// Walk the swap vector entries looking for swaps fed by permuting loads
// and swaps that feed permuting stores. If the containing computation
// has not been marked rejected, mark each such swap for removal.
// (Removal is delayed in case optimization has disturbed the pattern,
// such that multiple loads feed the same swap, etc.)
void PPCVSXSwapRemoval::markSwapsForRemoval() {

  LLVM_DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {

    if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

      if (!SwapVector[Repr].WebRejected) {
        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
        Register DefReg = MI->getOperand(0).getReg();

        for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
          int UseIdx = SwapMap[&UseMI];
          SwapVector[UseIdx].WillRemove = 1;

          LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
          LLVM_DEBUG(UseMI.dump());
        }
      }

    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

      if (!SwapVector[Repr].WebRejected) {
        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
        Register UseReg = MI->getOperand(0).getReg();
        MachineInstr *DefMI = MRI->getVRegDef(UseReg);
        int DefIdx = SwapMap[DefMI];
        SwapVector[DefIdx].WillRemove = 1;

        LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
        LLVM_DEBUG(DefMI->dump());
      }

    } else if (SwapVector[EntryIdx].IsSwappable &&
               SwapVector[EntryIdx].SpecialHandling != 0) {
      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

      if (!SwapVector[Repr].WebRejected)
        handleSpecialSwappables(EntryIdx);
    }
  }
}

// Create an xxswapd instruction and insert it prior to the given point.
// MI is used to determine basic block and debug loc information.
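// The emitted instruction has the canonical swap form used throughout
// this pass, namely DstReg = XXPERMDI SrcReg, SrcReg, 2.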
// FIXME: When inserting a swap, we should check whether SrcReg is
// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
// then instead we should generate a copy from Reg to DstReg.
void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
                                   MachineBasicBlock::iterator InsertPoint,
                                   unsigned DstReg, unsigned SrcReg) {
  BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
          TII->get(PPC::XXPERMDI), DstReg)
      .addReg(SrcReg)
      .addReg(SrcReg)
      .addImm(2);
}

// The identified swap entry requires special handling to allow its
// containing computation to be optimized. Perform that handling
// here.
// FIXME: Additional opportunities will be phased in with subsequent
// patches.
void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
  switch (SwapVector[EntryIdx].SpecialHandling) {

  default:
    llvm_unreachable("Unexpected special handling type");

  // For splats based on an index into a vector, add N/2 modulo N
  // to the index, where N is the number of vector elements.
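  // For example, once the feeding swap is removed, an XXSPLTW of word
  // element 1 (N = 4) must become a splat of word element 3, since the
  // doubleword swap moves that word to the other half of the register.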
  case SHValues::SH_SPLAT: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
    unsigned NElts;

    LLVM_DEBUG(dbgs() << "Changing splat: ");
    LLVM_DEBUG(MI->dump());

    switch (MI->getOpcode()) {
    default:
      llvm_unreachable("Unexpected splat opcode");
    case PPC::VSPLTB: NElts = 16; break;
    case PPC::VSPLTH: NElts = 8; break;
    case PPC::VSPLTW:
    case PPC::XXSPLTW: NElts = 4; break;
    }

    unsigned EltNo;
    if (MI->getOpcode() == PPC::XXSPLTW)
      EltNo = MI->getOperand(2).getImm();
    else
      EltNo = MI->getOperand(1).getImm();

    EltNo = (EltNo + NElts / 2) % NElts;
    if (MI->getOpcode() == PPC::XXSPLTW)
      MI->getOperand(2).setImm(EltNo);
    else
      MI->getOperand(1).setImm(EltNo);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For an XXPERMDI that isn't handled otherwise, we need to
  // reverse the order of the operands. If the selector operand
  // has a value of 0 or 3, we need to change it to 3 or 0,
  // respectively. Otherwise we should leave it alone. (This
  // is equivalent to reversing the two bits of the selector
  // operand and complementing the result.)
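  // For example, XXPERMDI t, a, b, 0 becomes XXPERMDI t, b, a, 3, and
  // XXPERMDI t, a, b, 1 becomes XXPERMDI t, b, a, 1.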
  case SHValues::SH_XXPERMDI: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
    LLVM_DEBUG(MI->dump());

    unsigned Selector = MI->getOperand(3).getImm();
    if (Selector == 0 || Selector == 3)
      Selector = 3 - Selector;
    MI->getOperand(3).setImm(Selector);

    Register Reg1 = MI->getOperand(1).getReg();
    Register Reg2 = MI->getOperand(2).getReg();
    MI->getOperand(1).setReg(Reg2);
    MI->getOperand(2).setReg(Reg1);

    // We also need to swap the kill flags associated with the registers.
    bool IsKill1 = MI->getOperand(1).isKill();
    bool IsKill2 = MI->getOperand(2).isKill();
    MI->getOperand(1).setIsKill(IsKill2);
    MI->getOperand(2).setIsKill(IsKill1);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For a copy from a scalar floating-point register to a vector
  // register, removing swaps will leave the copied value in the
  // wrong lane. Insert a swap following the copy to fix this.
  case SHValues::SH_COPYWIDEN: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
    LLVM_DEBUG(MI->dump());

    Register DstReg = MI->getOperand(0).getReg();
    const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
    Register NewVReg = MRI->createVirtualRegister(DstRC);

    MI->getOperand(0).setReg(NewVReg);
    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());

    auto InsertPoint = ++MachineBasicBlock::iterator(MI);

    // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
    // is copying to a VRRC, we need to be careful to avoid a register
    // assignment problem. In this case we must copy from VRRC to VSRC
    // prior to the swap, and from VSRC to VRRC following the swap.
    // Coalescing will usually remove all this mess.
    if (DstRC == &PPC::VRRCRegClass) {
      Register VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
      Register VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);

      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
              TII->get(PPC::COPY), VSRCTmp1)
          .addReg(NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
              TII->get(PPC::COPY), DstReg)
          .addReg(VSRCTmp2);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

    } else {
      insertSwap(MI, InsertPoint, DstReg, NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());
    }
    break;
  }
  }
}

// Walk the swap vector and replace each entry marked for removal with
// a copy operation.
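// That is, each swap of the form Reg1 = XXPERMDI Reg2, Reg2, 2 that was
// marked WillRemove is rewritten as Reg1 = COPY Reg2.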
bool PPCVSXSwapRemoval::removeSwaps() {

  LLVM_DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");

  bool Changed = false;

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
    if (SwapVector[EntryIdx].WillRemove) {
      Changed = true;
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      MachineBasicBlock *MBB = MI->getParent();
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
              MI->getOperand(0).getReg())
          .add(MI->getOperand(1));

      LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
                                  SwapVector[EntryIdx].VSEId));
      LLVM_DEBUG(MI->dump());

      MI->eraseFromParent();
    }
  }

  return Changed;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// For debug purposes, dump the contents of the swap vector.
LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {

    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
    int ID = SwapVector[EntryIdx].VSEId;

    dbgs() << format("%6d", ID);
    dbgs() << format("%6d", EC->getLeaderValue(ID));
    dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
    dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());

    if (SwapVector[EntryIdx].IsLoad)
      dbgs() << "load ";
    if (SwapVector[EntryIdx].IsStore)
      dbgs() << "store ";
    if (SwapVector[EntryIdx].IsSwap)
      dbgs() << "swap ";
    if (SwapVector[EntryIdx].MentionsPhysVR)
      dbgs() << "physreg ";
    if (SwapVector[EntryIdx].MentionsPartialVR)
      dbgs() << "partialreg ";

    if (SwapVector[EntryIdx].IsSwappable) {
      dbgs() << "swappable ";
      switch(SwapVector[EntryIdx].SpecialHandling) {
      default:
        dbgs() << "special:**unknown**";
        break;
      case SH_NONE:
        break;
      case SH_EXTRACT:
        dbgs() << "special:extract ";
        break;
      case SH_INSERT:
        dbgs() << "special:insert ";
        break;
      case SH_NOSWAP_LD:
        dbgs() << "special:load ";
        break;
      case SH_NOSWAP_ST:
        dbgs() << "special:store ";
        break;
      case SH_SPLAT:
        dbgs() << "special:splat ";
        break;
      case SH_XXPERMDI:
        dbgs() << "special:xxpermdi ";
        break;
      case SH_COPYWIDEN:
        dbgs() << "special:copywiden ";
        break;
      }
    }

    if (SwapVector[EntryIdx].WebRejected)
      dbgs() << "rejected ";
    if (SwapVector[EntryIdx].WillRemove)
      dbgs() << "remove ";

    dbgs() << "\n";

    // For no-asserts builds.
    (void)MI;
    (void)ID;
  }

  dbgs() << "\n";
}
#endif

INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
                      "PowerPC VSX Swap Removal", false, false)
INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
                    "PowerPC VSX Swap Removal", false, false)

char PPCVSXSwapRemoval::ID = 0;
FunctionPass*
llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); }