PPCVSXSwapRemoval.cpp source code [llvm_projects/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp]

1	//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===---------------------------------------------------------------------===//
8	//
9	// This pass analyzes vector computations and removes unnecessary
10	// doubleword swaps (xxswapd instructions). This pass is performed
11	// only for little-endian VSX code generation.
12	//
13	// For this specific case, loads and stores of v4i32, v4f32, v2i64,
14	// and v2f64 vectors are inefficient. These are implemented using
15	// the lxvd2x and stxvd2x instructions, which invert the order of
16	// doublewords in a vector register. Thus code generation inserts
17	// an xxswapd after each such load, and prior to each such store.
18	//
19	// The extra xxswapd instructions reduce performance. The purpose
20	// of this pass is to reduce the number of xxswapd instructions
21	// required for correctness.
22	//
23	// The primary insight is that much code that operates on vectors
24	// does not care about the relative order of elements in a register,
25	// so long as the correct memory order is preserved. If we have a
26	// computation where all input values are provided by lxvd2x/xxswapd,
27	// all outputs are stored using xxswapd/stxvd2x, and all intermediate
28	// computations are lane-insensitive (independent of element order),
29	// then all the xxswapd instructions associated with the loads and
30	// stores may be removed without changing observable semantics.
31	//
32	// This pass uses standard equivalence class infrastructure to create
33	// maximal webs of computations fitting the above description. Each
34	// such web is then optimized by removing its unnecessary xxswapd
35	// instructions.
36	//
37	// There are some lane-sensitive operations for which we can still
38	// permit the optimization, provided we modify those operations
39	// accordingly. Such operations are identified as using "special
40	// handling" within this module.
41	//
42	//===---------------------------------------------------------------------===//
43
44	#include "PPC.h"
45	#include "PPCInstrBuilder.h"
46	#include "PPCInstrInfo.h"
47	#include "PPCTargetMachine.h"
48	#include "llvm/ADT/DenseMap.h"
49	#include "llvm/ADT/EquivalenceClasses.h"
50	#include "llvm/CodeGen/MachineFunctionPass.h"
51	#include "llvm/CodeGen/MachineInstrBuilder.h"
52	#include "llvm/CodeGen/MachineRegisterInfo.h"
53	#include "llvm/Config/llvm-config.h"
54	#include "llvm/Support/Debug.h"
55	#include "llvm/Support/Format.h"
56	#include "llvm/Support/raw_ostream.h"
57
58	using namespace llvm;
59
60	#define DEBUG_TYPE "ppc-vsx-swaps"
61
62	namespace {
63
64	// A PPCVSXSwapEntry is created for each machine instruction that
65	// is relevant to a vector computation.
66	struct PPCVSXSwapEntry {
67	// Pointer to the instruction.
68	MachineInstr *VSEMI;
69
70	// Unique ID (position in the swap vector).
71	int VSEId;
72
73	// Attributes of this node.
74	unsigned int IsLoad : `1`;
75	unsigned int IsStore : `1`;
76	unsigned int IsSwap : `1`;
77	unsigned int MentionsPhysVR : `1`;
78	unsigned int IsSwappable : `1`;
79	unsigned int MentionsPartialVR : `1`;
80	unsigned int SpecialHandling : `3`;
81	unsigned int WebRejected : `1`;
82	unsigned int WillRemove : `1`;
83	};
84
// Kinds of special handling that allow a lane-sensitive instruction to
// remain in an optimized web. The value is stored in the three-bit
// PPCVSXSwapEntry::SpecialHandling field, so it must stay a plain
// (implicitly convertible) enum with at most eight enumerators.
enum SHValues {
  SH_NONE = 0,  // No special handling required.
  SH_EXTRACT,   // NOTE(review): not assigned in the visible code.
  SH_INSERT,    // NOTE(review): not assigned in the visible code.
  SH_NOSWAP_LD, // NOTE(review): not assigned in the visible code.
  SH_NOSWAP_ST, // NOTE(review): not assigned in the visible code.
  SH_SPLAT,     // Splat by element index; the index must be adjusted.
  SH_XXPERMDI,  // XXPERMDI that is not a plain swap; operands/selector
                // must be adjusted.
  SH_COPYWIDEN  // SUBREG_TO_REG from a scalar FP register to a vector
                // register; a swap must be inserted after it.
};
95
96	struct PPCVSXSwapRemoval : public MachineFunctionPass {
97
98	static char ID;
99	const PPCInstrInfo *TII;
100	MachineFunction *MF;
101	MachineRegisterInfo *MRI;
102
103	// Swap entries are allocated in a vector for better performance.
104	std::vector<PPCVSXSwapEntry> SwapVector;
105
106	// A mapping is maintained between machine instructions and
107	// their swap entries. The key is the address of the MI.
108	DenseMap<MachineInstr, int*> SwapMap;
109
110	// Equivalence classes are used to gather webs of related computation.
111	// Swap entries are represented by their VSEId fields.
112	EquivalenceClasses<int> *EC;
113
114	PPCVSXSwapRemoval() : MachineFunctionPass (ID) {
115	initializePPCVSXSwapRemovalPass(*PassRegistry::getPassRegistry());
116	}
117
118	private:
119	// Initialize data structures.
120	void initialize(MachineFunction &MFParm);
121
122	// Walk the machine instructions to gather vector usage information.
123	// Return true iff vector mentions are present.
124	bool gatherVectorInstructions();
125
126	// Add an entry to the swap vector and swap map.
127	int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);
128
129	// Hunt backwards through COPY and SUBREG_TO_REG chains for a
130	// source register. VecIdx indicates the swap vector entry to
131	// mark as mentioning a physical register if the search leads
132	// to one.
133	unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);
134
135	// Generate equivalence classes for related computations (webs).
136	void formWebs();
137
138	// Analyze webs and determine those that cannot be optimized.
139	void recordUnoptimizableWebs();
140
141	// Record which swap instructions can be safely removed.
142	void markSwapsForRemoval();
143
144	// Remove swaps and update other instructions requiring special
145	// handling. Return true iff any changes are made.
146	bool removeSwaps();
147
148	// Insert a swap instruction from SrcReg to DstReg at the given
149	// InsertPoint.
150	void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
151	unsigned DstReg, unsigned SrcReg);
152
153	// Update instructions requiring special handling.
154	void handleSpecialSwappables(int EntryIdx);
155
156	// Dump a description of the entries in the swap vector.
157	void dumpSwapVector();
158
159	// Return true iff the given register is in the given class.
160	bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
161	if (Register::isVirtualRegister(Reg))
162	return RC->hasSubClassEq(RC: MRI->getRegClass(Reg));
163	return RC->contains(Reg);
164	}
165
166	// Return true iff the given register is a full vector register.
167	bool isVecReg(unsigned Reg) {
168	return (isRegInClass(Reg, RC: &PPC::VSRCRegClass) \|\|
169	isRegInClass(Reg, RC: &PPC::VRRCRegClass));
170	}
171
172	// Return true iff the given register is a partial vector register.
173	bool isScalarVecReg(unsigned Reg) {
174	return (isRegInClass(Reg, RC: &PPC::VSFRCRegClass) \|\|
175	isRegInClass(Reg, RC: &PPC::VSSRCRegClass));
176	}
177
178	// Return true iff the given register mentions all or part of a
179	// vector register. Also sets Partial to true if the mention
180	// is for just the floating-point register overlap of the register.
181	bool isAnyVecReg(unsigned Reg, bool &Partial) {
182	if (isScalarVecReg(Reg))
183	Partial = true;
184	return isScalarVecReg(Reg) \|\| isVecReg(Reg);
185	}
186
187	public:
188	// Main entry point for this pass.
189	bool runOnMachineFunction(MachineFunction &MF) override {
190	if (skipFunction(F: MF.getFunction()))
191	return false;
192
193	// If we don't have VSX on the subtarget, don't do anything.
194	// Also, on Power 9 the load and store ops preserve element order and so
195	// the swaps are not required.
196	const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
197	if (!STI.hasVSX() \|\| !STI.needsSwapsForVSXMemOps())
198	return false;
199
200	bool Changed = false;
201	initialize(MFParm&: MF);
202
203	if (gatherVectorInstructions()) {
204	formWebs();
205	recordUnoptimizableWebs();
206	markSwapsForRemoval();
207	Changed = removeSwaps();
208	}
209
210	// FIXME: See the allocation of EC in initialize().
211	delete EC;
212	return Changed;
213	}
214	};
215
216	// Initialize data structures for this pass. In particular, clear the
217	// swap vector and allocate the equivalence class mapping before
218	// processing each function.
219	void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
220	MF = &MFParm;
221	MRI = &MF->getRegInfo();
222	TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
223
224	// An initial vector size of 256 appears to work well in practice.
225	// Small/medium functions with vector content tend not to incur a
226	// reallocation at this size. Three of the vector tests in
227	// projects/test-suite reallocate, which seems like a reasonable rate.
228	const int InitialVectorSize(`256`);
229	SwapVector.clear();
230	SwapVector.reserve(n: InitialVectorSize);
231
232	// FIXME: Currently we allocate EC each time because we don't have
233	// access to the set representation on which to call clear(). Should
234	// consider adding a clear() method to the EquivalenceClasses class.
235	EC = new EquivalenceClasses<int>;
236	}
237
238	// Create an entry in the swap vector for each instruction that mentions
239	// a full vector register, recording various characteristics of the
240	// instructions there.
241	bool PPCVSXSwapRemoval::gatherVectorInstructions() {
242	bool RelevantFunction = false;
243
244	for (MachineBasicBlock &MBB : *MF) {
245	for (MachineInstr &MI : MBB) {
246
247	if (MI.isDebugInstr())
248	continue;
249
250	bool RelevantInstr = false;
251	bool Partial = false;
252
253	for (const MachineOperand &MO : MI.operands()) {
254	if (!MO.isReg())
255	continue;
256	Register Reg = MO.getReg();
257	// All operands need to be checked because there are instructions that
258	// operate on a partial register and produce a full register (such as
259	// XXPERMDIs).
260	if (isAnyVecReg(Reg, Partial))
261	RelevantInstr = true;
262	}
263
264	if (!RelevantInstr)
265	continue;
266
267	RelevantFunction = true;
268
269	// Create a SwapEntry initialized to zeros, then fill in the
270	// instruction and ID fields before pushing it to the back
271	// of the swap vector.
272	PPCVSXSwapEntry SwapEntry{};
273	int VecIdx = addSwapEntry(MI: &MI, SwapEntry);
274
275	switch(MI.getOpcode()) {
276	default:
277	// Unless noted otherwise, an instruction is considered
278	// safe for the optimization. There are a large number of
279	// such true-SIMD instructions (all vector math, logical,
280	// select, compare, etc.). However, if the instruction
281	// mentions a partial vector register and does not have
282	// special handling defined, it is not swappable.
283	if (Partial)
284	SwapVector [VecIdx].MentionsPartialVR = `1`;
285	else
286	SwapVector [VecIdx].IsSwappable = `1`;
287	break;
288	case PPC::XXPERMDI: {
289	// This is a swap if it is of the form XXPERMDI t, s, s, 2.
290	// Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
291	// can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
292	// for example. We have to look through chains of COPY and
293	// SUBREG_TO_REG to find the real source value for comparison.
294	// If the real source value is a physical register, then mark the
295	// XXPERMDI as mentioning a physical register.
296	int immed = MI.getOperand(i: `3`).getImm();
297	if (immed == `2`) {
298	unsigned trueReg1 = lookThruCopyLike(SrcReg: MI.getOperand(i: `1`).getReg(),
299	VecIdx);
300	unsigned trueReg2 = lookThruCopyLike(SrcReg: MI.getOperand(i: `2`).getReg(),
301	VecIdx);
302	if (trueReg1 == trueReg2)
303	SwapVector [VecIdx].IsSwap = `1`;
304	else {
305	// We can still handle these if the two registers are not
306	// identical, by adjusting the form of the XXPERMDI.
307	SwapVector [VecIdx].IsSwappable = `1`;
308	SwapVector [VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
309	}
310	// This is a doubleword splat if it is of the form
311	// XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
312	// must look through chains of copy-likes to find the source
313	// register. We turn off the marking for mention of a physical
314	// register, because splatting it is safe; the optimization
315	// will not swap the value in the physical register. Whether
316	// or not the two input registers are identical, we can handle
317	// these by adjusting the form of the XXPERMDI.
318	} else if (immed == `0` \|\| immed == `3`) {
319
320	SwapVector [VecIdx].IsSwappable = `1`;
321	SwapVector [VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
322
323	unsigned trueReg1 = lookThruCopyLike(SrcReg: MI.getOperand(i: `1`).getReg(),
324	VecIdx);
325	unsigned trueReg2 = lookThruCopyLike(SrcReg: MI.getOperand(i: `2`).getReg(),
326	VecIdx);
327	if (trueReg1 == trueReg2)
328	SwapVector [VecIdx].MentionsPhysVR = `0`;
329
330	} else {
331	// We can still handle these by adjusting the form of the XXPERMDI.
332	SwapVector [VecIdx].IsSwappable = `1`;
333	SwapVector [VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
334	}
335	break;
336	}
337	case PPC::LVX:
338	// Non-permuting loads are currently unsafe. We can use special
339	// handling for this in the future. By not marking these as
340	// IsSwap, we ensure computations containing them will be rejected
341	// for now.
342	SwapVector [VecIdx].IsLoad = `1`;
343	break;
344	case PPC::LXVD2X:
345	case PPC::LXVW4X:
346	// Permuting loads are marked as both load and swap, and are
347	// safe for optimization.
348	SwapVector [VecIdx].IsLoad = `1`;
349	SwapVector [VecIdx].IsSwap = `1`;
350	break;
351	case PPC::LXSDX:
352	case PPC::LXSSPX:
353	case PPC::XFLOADf64:
354	case PPC::XFLOADf32:
355	// A load of a floating-point value into the high-order half of
356	// a vector register is safe, provided that we introduce a swap
357	// following the load, which will be done by the SUBREG_TO_REG
358	// support. So just mark these as safe.
359	SwapVector [VecIdx].IsLoad = `1`;
360	SwapVector [VecIdx].IsSwappable = `1`;
361	break;
362	case PPC::STVX:
363	// Non-permuting stores are currently unsafe. We can use special
364	// handling for this in the future. By not marking these as
365	// IsSwap, we ensure computations containing them will be rejected
366	// for now.
367	SwapVector [VecIdx].IsStore = `1`;
368	break;
369	case PPC::STXVD2X:
370	case PPC::STXVW4X:
371	// Permuting stores are marked as both store and swap, and are
372	// safe for optimization.
373	SwapVector [VecIdx].IsStore = `1`;
374	SwapVector [VecIdx].IsSwap = `1`;
375	break;
376	case PPC::COPY:
377	// These are fine provided they are moving between full vector
378	// register classes.
379	if (isVecReg(Reg: MI.getOperand(i: `0`).getReg()) &&
380	isVecReg(Reg: MI.getOperand(i: `1`).getReg()))
381	SwapVector [VecIdx].IsSwappable = `1`;
382	// If we have a copy from one scalar floating-point register
383	// to another, we can accept this even if it is a physical
384	// register. The only way this gets involved is if it feeds
385	// a SUBREG_TO_REG, which is handled by introducing a swap.
386	else if (isScalarVecReg(Reg: MI.getOperand(i: `0`).getReg()) &&
387	isScalarVecReg(Reg: MI.getOperand(i: `1`).getReg()))
388	SwapVector [VecIdx].IsSwappable = `1`;
389	break;
390	case PPC::SUBREG_TO_REG: {
391	// These are fine provided they are moving between full vector
392	// register classes. If they are moving from a scalar
393	// floating-point class to a vector class, we can handle those
394	// as well, provided we introduce a swap. It is generally the
395	// case that we will introduce fewer swaps than we remove, but
396	// (FIXME) a cost model could be used. However, introduced
397	// swaps could potentially be CSEd, so this is not trivial.
398	if (isVecReg(Reg: MI.getOperand(i: `0`).getReg()) &&
399	isVecReg(Reg: MI.getOperand(i: `2`).getReg()))
400	SwapVector [VecIdx].IsSwappable = `1`;
401	else if (isVecReg(Reg: MI.getOperand(i: `0`).getReg()) &&
402	isScalarVecReg(Reg: MI.getOperand(i: `2`).getReg())) {
403	SwapVector [VecIdx].IsSwappable = `1`;
404	SwapVector [VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
405	}
406	break;
407	}
408	case PPC::VSPLTB:
409	case PPC::VSPLTH:
410	case PPC::VSPLTW:
411	case PPC::XXSPLTW:
412	// Splats are lane-sensitive, but we can use special handling
413	// to adjust the source lane for the splat.
414	SwapVector [VecIdx].IsSwappable = `1`;
415	SwapVector [VecIdx].SpecialHandling = SHValues::SH_SPLAT;
416	break;
417	// The presence of the following lane-sensitive operations in a
418	// web will kill the optimization, at least for now. For these
419	// we do nothing, causing the optimization to fail.
420	// FIXME: Some of these could be permitted with special handling,
421	// and will be phased in as time permits.
422	// FIXME: There is no simple and maintainable way to express a set
423	// of opcodes having a common attribute in TableGen. Should this
424	// change, this is a prime candidate to use such a mechanism.
425	case PPC::INLINEASM:
426	case PPC::INLINEASM_BR:
427	case PPC::EXTRACT_SUBREG:
428	case PPC::INSERT_SUBREG:
429	case PPC::COPY_TO_REGCLASS:
430	case PPC::LVEBX:
431	case PPC::LVEHX:
432	case PPC::LVEWX:
433	case PPC::LVSL:
434	case PPC::LVSR:
435	case PPC::LVXL:
436	case PPC::STVEBX:
437	case PPC::STVEHX:
438	case PPC::STVEWX:
439	case PPC::STVXL:
440	// We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
441	// by adding special handling for narrowing copies as well as
442	// widening ones. However, I've experimented with this, and in
443	// practice we currently do not appear to use STXSDX fed by
444	// a narrowing copy from a full vector register. Since I can't
445	// generate any useful test cases, I've left this alone for now.
446	case PPC::STXSDX:
447	case PPC::STXSSPX:
448	case PPC::VCIPHER:
449	case PPC::VCIPHERLAST:
450	case PPC::VMRGHB:
451	case PPC::VMRGHH:
452	case PPC::VMRGHW:
453	case PPC::VMRGLB:
454	case PPC::VMRGLH:
455	case PPC::VMRGLW:
456	case PPC::VMULESB:
457	case PPC::VMULESH:
458	case PPC::VMULESW:
459	case PPC::VMULEUB:
460	case PPC::VMULEUH:
461	case PPC::VMULEUW:
462	case PPC::VMULOSB:
463	case PPC::VMULOSH:
464	case PPC::VMULOSW:
465	case PPC::VMULOUB:
466	case PPC::VMULOUH:
467	case PPC::VMULOUW:
468	case PPC::VNCIPHER:
469	case PPC::VNCIPHERLAST:
470	case PPC::VPERM:
471	case PPC::VPERMXOR:
472	case PPC::VPKPX:
473	case PPC::VPKSHSS:
474	case PPC::VPKSHUS:
475	case PPC::VPKSDSS:
476	case PPC::VPKSDUS:
477	case PPC::VPKSWSS:
478	case PPC::VPKSWUS:
479	case PPC::VPKUDUM:
480	case PPC::VPKUDUS:
481	case PPC::VPKUHUM:
482	case PPC::VPKUHUS:
483	case PPC::VPKUWUM:
484	case PPC::VPKUWUS:
485	case PPC::VPMSUMB:
486	case PPC::VPMSUMD:
487	case PPC::VPMSUMH:
488	case PPC::VPMSUMW:
489	case PPC::VRLB:
490	case PPC::VRLD:
491	case PPC::VRLH:
492	case PPC::VRLW:
493	case PPC::VSBOX:
494	case PPC::VSHASIGMAD:
495	case PPC::VSHASIGMAW:
496	case PPC::VSL:
497	case PPC::VSLDOI:
498	case PPC::VSLO:
499	case PPC::VSR:
500	case PPC::VSRO:
501	case PPC::VSUM2SWS:
502	case PPC::VSUM4SBS:
503	case PPC::VSUM4SHS:
504	case PPC::VSUM4UBS:
505	case PPC::VSUMSWS:
506	case PPC::VUPKHPX:
507	case PPC::VUPKHSB:
508	case PPC::VUPKHSH:
509	case PPC::VUPKHSW:
510	case PPC::VUPKLPX:
511	case PPC::VUPKLSB:
512	case PPC::VUPKLSH:
513	case PPC::VUPKLSW:
514	case PPC::XXMRGHW:
515	case PPC::XXMRGLW:
516	// XXSLDWI could be replaced by a general permute with one of three
517	// permute control vectors (for shift values 1, 2, 3). However,
518	// VPERM has a more restrictive register class.
519	case PPC::XXSLDWI:
520	case PPC::XSCVDPSPN:
521	case PPC::XSCVSPDPN:
522	case PPC::MTVSCR:
523	case PPC::MFVSCR:
524	break;
525	}
526	}
527	}
528
529	if (RelevantFunction) {
530	LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
531	LLVM_DEBUG(dumpSwapVector());
532	}
533
534	return RelevantFunction;
535	}
536
537	// Add an entry to the swap vector and swap map, and make a
538	// singleton equivalence class for the entry.
539	int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
540	PPCVSXSwapEntry& SwapEntry) {
541	SwapEntry.VSEMI = MI;
542	SwapEntry.VSEId = SwapVector.size();
543	SwapVector.push_back(x: SwapEntry);
544	EC->insert(Data: SwapEntry.VSEId);
545	SwapMap [MI] = SwapEntry.VSEId;
546	return SwapEntry.VSEId;
547	}
548
549	// This is used to find the "true" source register for an
550	// XXPERMDI instruction, since MachineCSE does not handle the
551	// "copy-like" operations (Copy and SubregToReg). Returns
552	// the original SrcReg unless it is the target of a copy-like
553	// operation, in which case we chain backwards through all
554	// such operations to the ultimate source register. If a
555	// physical register is encountered, we stop the search and
556	// flag the swap entry indicated by VecIdx (the original
557	// XXPERMDI) as mentioning a physical register.
558	unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
559	unsigned VecIdx) {
560	MachineInstr *MI = MRI->getVRegDef(Reg: SrcReg);
561	if (!MI->isCopyLike())
562	return SrcReg;
563
564	unsigned CopySrcReg;
565	if (MI->isCopy())
566	CopySrcReg = MI->getOperand(i: `1`).getReg();
567	else {
568	assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
569	CopySrcReg = MI->getOperand(i: `2`).getReg();
570	}
571
572	if (!Register::isVirtualRegister(Reg: CopySrcReg)) {
573	if (!isScalarVecReg(Reg: CopySrcReg))
574	SwapVector [VecIdx].MentionsPhysVR = `1`;
575	return CopySrcReg;
576	}
577
578	return lookThruCopyLike(SrcReg: CopySrcReg, VecIdx);
579	}
580
581	// Generate equivalence classes for related computations (webs) by
582	// def-use relationships of virtual registers. Mention of a physical
583	// register terminates the generation of equivalence classes as this
584	// indicates a use of a parameter, definition of a return value, use
585	// of a value returned from a call, or definition of a parameter to a
586	// call. Computations with physical register mentions are flagged
587	// as such so their containing webs will not be optimized.
588	void PPCVSXSwapRemoval::formWebs() {
589
590	LLVM_DEBUG(dbgs() << "\n* Forming webs for swap removal *\n\n");
591
592	for (unsigned EntryIdx = `0`; EntryIdx < SwapVector.size(); ++EntryIdx) {
593
594	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
595
596	LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
597	LLVM_DEBUG(MI->dump());
598
599	// It's sufficient to walk vector uses and join them to their unique
600	// definitions. In addition, check full vector register operands
601	// for physical regs. We exclude partial-vector register operands
602	// because we can handle them if copied to a full vector.
603	for (const MachineOperand &MO : MI->operands()) {
604	if (!MO.isReg())
605	continue;
606
607	Register Reg = MO.getReg();
608	if (!isVecReg(Reg) && !isScalarVecReg(Reg))
609	continue;
610
611	if (!Reg.isVirtual()) {
612	if (!(MI->isCopy() && isScalarVecReg(Reg)))
613	SwapVector [EntryIdx].MentionsPhysVR = `1`;
614	continue;
615	}
616
617	if (!MO.isUse())
618	continue;
619
620	MachineInstr* DefMI = MRI->getVRegDef(Reg);
621	assert(SwapMap.contains(DefMI) &&
622	"Inconsistency: def of vector reg not found in swap map!");
623	int DefIdx = SwapMap [DefMI];
624	(void)EC->unionSets(V1: SwapVector [DefIdx].VSEId,
625	V2: SwapVector [EntryIdx].VSEId);
626
627	LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
628	SwapVector[DefIdx].VSEId,
629	SwapVector[EntryIdx].VSEId));
630	LLVM_DEBUG(dbgs() << " Def: ");
631	LLVM_DEBUG(DefMI->dump());
632	}
633	}
634	}
635
636	// Walk the swap vector entries looking for conditions that prevent their
637	// containing computations from being optimized. When such conditions are
638	// found, mark the representative of the computation's equivalence class
639	// as rejected.
640	void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
641
642	LLVM_DEBUG(dbgs() << "\n* Rejecting webs for swap removal *\n\n");
643
644	for (unsigned EntryIdx = `0`; EntryIdx < SwapVector.size(); ++EntryIdx) {
645	int Repr = EC->getLeaderValue(V: SwapVector [EntryIdx].VSEId);
646
647	// If representative is already rejected, don't waste further time.
648	if (SwapVector [Repr].WebRejected)
649	continue;
650
651	// Reject webs containing mentions of physical or partial registers, or
652	// containing operations that we don't know how to handle in a lane-
653	// permuted region.
654	if (SwapVector [EntryIdx].MentionsPhysVR \|\|
655	SwapVector [EntryIdx].MentionsPartialVR \|\|
656	!(SwapVector [EntryIdx].IsSwappable \|\| SwapVector [EntryIdx].IsSwap)) {
657
658	SwapVector [Repr].WebRejected = `1`;
659
660	LLVM_DEBUG(
661	dbgs() << format("Web %d rejected for physreg, partial reg, or not "
662	"swap[pable]\n",
663	Repr));
664	LLVM_DEBUG(dbgs() << " in " << EntryIdx << ": ");
665	LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
666	LLVM_DEBUG(dbgs() << "\n");
667	}
668
669	// Reject webs than contain swapping loads that feed something other
670	// than a swap instruction.
671	else if (SwapVector [EntryIdx].IsLoad && SwapVector [EntryIdx].IsSwap) {
672	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
673	Register DefReg = MI->getOperand(i: `0`).getReg();
674
675	// We skip debug instructions in the analysis. (Note that debug
676	// location information is still maintained by this optimization
677	// because it remains on the LXVD2X and STXVD2X instructions after
678	// the XXPERMDIs are removed.)
679	for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg: DefReg)) {
680	int UseIdx = SwapMap [&UseMI];
681
682	if (!SwapVector [UseIdx].IsSwap \|\| SwapVector [UseIdx].IsLoad \|\|
683	SwapVector [UseIdx].IsStore) {
684
685	SwapVector [Repr].WebRejected = `1`;
686
687	LLVM_DEBUG(dbgs() << format(
688	"Web %d rejected for load not feeding swap\n", Repr));
689	LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
690	LLVM_DEBUG(MI->dump());
691	LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
692	LLVM_DEBUG(UseMI.dump());
693	LLVM_DEBUG(dbgs() << "\n");
694	}
695
696	// It is possible that the load feeds a swap and that swap feeds a
697	// store. In such a case, the code is actually trying to store a swapped
698	// vector. We must reject such webs.
699	if (SwapVector [UseIdx].IsSwap && !SwapVector [UseIdx].IsLoad &&
700	!SwapVector [UseIdx].IsStore) {
701	Register SwapDefReg = UseMI.getOperand(i: `0`).getReg();
702	for (MachineInstr &UseOfUseMI :
703	MRI->use_nodbg_instructions(Reg: SwapDefReg)) {
704	int UseOfUseIdx = SwapMap [&UseOfUseMI];
705	if (SwapVector [UseOfUseIdx].IsStore) {
706	SwapVector [Repr].WebRejected = `1`;
707	LLVM_DEBUG(
708	dbgs() << format(
709	"Web %d rejected for load/swap feeding a store\n", Repr));
710	LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
711	LLVM_DEBUG(MI->dump());
712	LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
713	LLVM_DEBUG(UseMI.dump());
714	LLVM_DEBUG(dbgs() << "\n");
715	}
716	}
717	}
718	}
719
720	// Reject webs that contain swapping stores that are fed by something
721	// other than a swap instruction.
722	} else if (SwapVector [EntryIdx].IsStore && SwapVector [EntryIdx].IsSwap) {
723	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
724	Register UseReg = MI->getOperand(i: `0`).getReg();
725	MachineInstr *DefMI = MRI->getVRegDef(Reg: UseReg);
726	Register DefReg = DefMI->getOperand(i: `0`).getReg();
727	int DefIdx = SwapMap [DefMI];
728
729	if (!SwapVector [DefIdx].IsSwap \|\| SwapVector [DefIdx].IsLoad \|\|
730	SwapVector [DefIdx].IsStore) {
731
732	SwapVector [Repr].WebRejected = `1`;
733
734	LLVM_DEBUG(dbgs() << format(
735	"Web %d rejected for store not fed by swap\n", Repr));
736	LLVM_DEBUG(dbgs() << " def " << DefIdx << ": ");
737	LLVM_DEBUG(DefMI->dump());
738	LLVM_DEBUG(dbgs() << " use " << EntryIdx << ": ");
739	LLVM_DEBUG(MI->dump());
740	LLVM_DEBUG(dbgs() << "\n");
741	}
742
743	// Ensure all uses of the register defined by DefMI feed store
744	// instructions
745	for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg: DefReg)) {
746	int UseIdx = SwapMap [&UseMI];
747
748	if (SwapVector [UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
749	SwapVector [Repr].WebRejected = `1`;
750
751	LLVM_DEBUG(
752	dbgs() << format(
753	"Web %d rejected for swap not feeding only stores\n", Repr));
754	LLVM_DEBUG(dbgs() << " def "
755	<< " : ");
756	LLVM_DEBUG(DefMI->dump());
757	LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
758	LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
759	LLVM_DEBUG(dbgs() << "\n");
760	}
761	}
762	}
763	}
764
765	LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
766	LLVM_DEBUG(dumpSwapVector());
767	}
768
769	// Walk the swap vector entries looking for swaps fed by permuting loads
770	// and swaps that feed permuting stores. If the containing computation
771	// has not been marked rejected, mark each such swap for removal.
772	// (Removal is delayed in case optimization has disturbed the pattern,
773	// such that multiple loads feed the same swap, etc.)
774	void PPCVSXSwapRemoval::markSwapsForRemoval() {
775
776	LLVM_DEBUG(dbgs() << "\n* Marking swaps for removal *\n\n");
777
778	for (unsigned EntryIdx = `0`; EntryIdx < SwapVector.size(); ++EntryIdx) {
779
780	if (SwapVector [EntryIdx].IsLoad && SwapVector [EntryIdx].IsSwap) {
781	int Repr = EC->getLeaderValue(V: SwapVector [EntryIdx].VSEId);
782
783	if (!SwapVector [Repr].WebRejected) {
784	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
785	Register DefReg = MI->getOperand(i: `0`).getReg();
786
787	for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg: DefReg)) {
788	int UseIdx = SwapMap [&UseMI];
789	SwapVector [UseIdx].WillRemove = `1`;
790
791	LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
792	LLVM_DEBUG(UseMI.dump());
793	}
794	}
795
796	} else if (SwapVector [EntryIdx].IsStore && SwapVector [EntryIdx].IsSwap) {
797	int Repr = EC->getLeaderValue(V: SwapVector [EntryIdx].VSEId);
798
799	if (!SwapVector [Repr].WebRejected) {
800	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
801	Register UseReg = MI->getOperand(i: `0`).getReg();
802	MachineInstr *DefMI = MRI->getVRegDef(Reg: UseReg);
803	int DefIdx = SwapMap [DefMI];
804	SwapVector [DefIdx].WillRemove = `1`;
805
806	LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
807	LLVM_DEBUG(DefMI->dump());
808	}
809
810	} else if (SwapVector [EntryIdx].IsSwappable &&
811	SwapVector [EntryIdx].SpecialHandling != `0`) {
812	int Repr = EC->getLeaderValue(V: SwapVector [EntryIdx].VSEId);
813
814	if (!SwapVector [Repr].WebRejected)
815	handleSpecialSwappables(EntryIdx);
816	}
817	}
818	}
819
820	// Create an xxswapd instruction and insert it prior to the given point.
821	// MI is used to determine basic block and debug loc information.
822	// FIXME: When inserting a swap, we should check whether SrcReg is
823	// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
824	// then instead we should generate a copy from Reg to DstReg.
825	void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
826	MachineBasicBlock::iterator InsertPoint,
827	unsigned DstReg, unsigned SrcReg) {
828	BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
829	MCID: TII->get(Opcode: PPC::XXPERMDI), DestReg: DstReg)
830	.addReg(RegNo: SrcReg)
831	.addReg(RegNo: SrcReg)
832	.addImm(Val: `2`);
833	}
834
835	// The identified swap entry requires special handling to allow its
836	// containing computation to be optimized. Perform that handling
837	// here.
838	// FIXME: Additional opportunities will be phased in with subsequent
839	// patches.
840	void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
841	switch (SwapVector [EntryIdx].SpecialHandling) {
842
843	default:
844	llvm_unreachable("Unexpected special handling type");
845
846	// For splats based on an index into a vector, add N/2 modulo N
847	// to the index, where N is the number of vector elements.
848	case SHValues::SH_SPLAT: {
849	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
850	unsigned NElts;
851
852	LLVM_DEBUG(dbgs() << "Changing splat: ");
853	LLVM_DEBUG(MI->dump());
854
855	switch (MI->getOpcode()) {
856	default:
857	llvm_unreachable("Unexpected splat opcode");
858	case PPC::VSPLTB: NElts = `16`; break;
859	case PPC::VSPLTH: NElts = `8`; break;
860	case PPC::VSPLTW:
861	case PPC::XXSPLTW: NElts = `4`; break;
862	}
863
864	unsigned EltNo;
865	if (MI->getOpcode() == PPC::XXSPLTW)
866	EltNo = MI->getOperand(i: `2`).getImm();
867	else
868	EltNo = MI->getOperand(i: `1`).getImm();
869
870	EltNo = (EltNo + NElts / `2`) % NElts;
871	if (MI->getOpcode() == PPC::XXSPLTW)
872	MI->getOperand(i: `2`).setImm(EltNo);
873	else
874	MI->getOperand(i: `1`).setImm(EltNo);
875
876	LLVM_DEBUG(dbgs() << " Into: ");
877	LLVM_DEBUG(MI->dump());
878	break;
879	}
880
881	// For an XXPERMDI that isn't handled otherwise, we need to
882	// reverse the order of the operands. If the selector operand
883	// has a value of 0 or 3, we need to change it to 3 or 0,
884	// respectively. Otherwise we should leave it alone. (This
885	// is equivalent to reversing the two bits of the selector
886	// operand and complementing the result.)
887	case SHValues::SH_XXPERMDI: {
888	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
889
890	LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
891	LLVM_DEBUG(MI->dump());
892
893	unsigned Selector = MI->getOperand(i: `3`).getImm();
894	if (Selector == `0` \|\| Selector == `3`)
895	Selector = `3` - Selector;
896	MI->getOperand(i: `3`).setImm(Selector);
897
898	Register Reg1 = MI->getOperand(i: `1`).getReg();
899	Register Reg2 = MI->getOperand(i: `2`).getReg();
900	MI->getOperand(i: `1`).setReg(Reg2);
901	MI->getOperand(i: `2`).setReg(Reg1);
902
903	// We also need to swap kill flag associated with the register.
904	bool IsKill1 = MI->getOperand(i: `1`).isKill();
905	bool IsKill2 = MI->getOperand(i: `2`).isKill();
906	MI->getOperand(i: `1`).setIsKill(IsKill2);
907	MI->getOperand(i: `2`).setIsKill(IsKill1);
908
909	LLVM_DEBUG(dbgs() << " Into: ");
910	LLVM_DEBUG(MI->dump());
911	break;
912	}
913
914	// For a copy from a scalar floating-point register to a vector
915	// register, removing swaps will leave the copied value in the
916	// wrong lane. Insert a swap following the copy to fix this.
917	case SHValues::SH_COPYWIDEN: {
918	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
919
920	LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
921	LLVM_DEBUG(MI->dump());
922
923	Register DstReg = MI->getOperand(i: `0`).getReg();
924	const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg);
925	Register NewVReg = MRI->createVirtualRegister(RegClass: DstRC);
926
927	MI->getOperand(i: `0`).setReg(NewVReg);
928	LLVM_DEBUG(dbgs() << " Into: ");
929	LLVM_DEBUG(MI->dump());
930
931	auto InsertPoint = ++MachineBasicBlock::iterator (MI);
932
933	// Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
934	// is copying to a VRRC, we need to be careful to avoid a register
935	// assignment problem. In this case we must copy from VRRC to VSRC
936	// prior to the swap, and from VSRC to VRRC following the swap.
937	// Coalescing will usually remove all this mess.
938	if (DstRC == &PPC::VRRCRegClass) {
939	Register VSRCTmp1 = MRI->createVirtualRegister(RegClass: &PPC::VSRCRegClass);
940	Register VSRCTmp2 = MRI->createVirtualRegister(RegClass: &PPC::VSRCRegClass);
941
942	BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
943	MCID: TII->get(Opcode: PPC::COPY), DestReg: VSRCTmp1)
944	.addReg(RegNo: NewVReg);
945	LLVM_DEBUG(std::prev(InsertPoint)->dump());
946
947	insertSwap(MI, InsertPoint, DstReg: VSRCTmp2, SrcReg: VSRCTmp1);
948	LLVM_DEBUG(std::prev(InsertPoint)->dump());
949
950	BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
951	MCID: TII->get(Opcode: PPC::COPY), DestReg: DstReg)
952	.addReg(RegNo: VSRCTmp2);
953	LLVM_DEBUG(std::prev(InsertPoint)->dump());
954
955	} else {
956	insertSwap(MI, InsertPoint, DstReg, SrcReg: NewVReg);
957	LLVM_DEBUG(std::prev(InsertPoint)->dump());
958	}
959	break;
960	}
961	}
962	}
963
964	// Walk the swap vector and replace each entry marked for removal with
965	// a copy operation.
966	bool PPCVSXSwapRemoval::removeSwaps() {
967
968	LLVM_DEBUG(dbgs() << "\n* Removing swaps *\n\n");
969
970	bool Changed = false;
971
972	for (unsigned EntryIdx = `0`; EntryIdx < SwapVector.size(); ++EntryIdx) {
973	if (SwapVector [EntryIdx].WillRemove) {
974	Changed = true;
975	MachineInstr *MI = SwapVector [EntryIdx].VSEMI;
976	MachineBasicBlock *MBB = MI->getParent();
977	BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
978	DestReg: MI->getOperand(i: `0`).getReg())
979	.add(MO: MI->getOperand(i: `1`));
980
981	LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
982	SwapVector[EntryIdx].VSEId));
983	LLVM_DEBUG(MI->dump());
984
985	MI->eraseFromParent();
986	}
987	}
988
989	return Changed;
990	}
991
992	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
993	// For debug purposes, dump the contents of the swap vector.
994	LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
995
996	for (unsigned EntryIdx = `0`; EntryIdx < SwapVector.size(); ++EntryIdx) {
997
998	MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
999	int ID = SwapVector[EntryIdx].VSEId;
1000
1001	dbgs() << format("%6d", ID);
1002	dbgs() << format("%6d", EC->getLeaderValue(ID));
1003	dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
1004	dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());
1005
1006	if (SwapVector[EntryIdx].IsLoad)
1007	dbgs() << "load ";
1008	if (SwapVector[EntryIdx].IsStore)
1009	dbgs() << "store ";
1010	if (SwapVector[EntryIdx].IsSwap)
1011	dbgs() << "swap ";
1012	if (SwapVector[EntryIdx].MentionsPhysVR)
1013	dbgs() << "physreg ";
1014	if (SwapVector[EntryIdx].MentionsPartialVR)
1015	dbgs() << "partialreg ";
1016
1017	if (SwapVector[EntryIdx].IsSwappable) {
1018	dbgs() << "swappable ";
1019	switch(SwapVector[EntryIdx].SpecialHandling) {
1020	default:
1021	dbgs() << "special:unknown";
1022	break;
1023	case SH_NONE:
1024	break;
1025	case SH_EXTRACT:
1026	dbgs() << "special:extract ";
1027	break;
1028	case SH_INSERT:
1029	dbgs() << "special:insert ";
1030	break;
1031	case SH_NOSWAP_LD:
1032	dbgs() << "special:load ";
1033	break;
1034	case SH_NOSWAP_ST:
1035	dbgs() << "special:store ";
1036	break;
1037	case SH_SPLAT:
1038	dbgs() << "special:splat ";
1039	break;
1040	case SH_XXPERMDI:
1041	dbgs() << "special:xxpermdi ";
1042	break;
1043	case SH_COPYWIDEN:
1044	dbgs() << "special:copywiden ";
1045	break;
1046	}
1047	}
1048
1049	if (SwapVector[EntryIdx].WebRejected)
1050	dbgs() << "rejected ";
1051	if (SwapVector[EntryIdx].WillRemove)
1052	dbgs() << "remove ";
1053
1054	dbgs() << "\n";
1055
1056	// For no-asserts builds.
1057	(void)MI;
1058	(void)ID;
1059	}
1060
1061	dbgs() << "\n";
1062	}
1063	#endif
1064
1065	} // end default namespace
1066
1067	INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
1068	"PowerPC VSX Swap Removal", false, false)
1069	INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
1070	"PowerPC VSX Swap Removal", false, false)
1071
1072	char PPCVSXSwapRemoval::ID = `0`;
1073	FunctionPass*
1074	llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval (); }
1075

Browse the source code of llvm_projects/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp