//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This pass analyzes vector computations and removes unnecessary
// doubleword swaps (xxswapd instructions). This pass is performed
// only for little-endian VSX code generation.
//
// For this specific case, loads and stores of v4i32, v4f32, v2i64,
// and v2f64 vectors are inefficient. These are implemented using
// the lxvd2x and stxvd2x instructions, which invert the order of
// doublewords in a vector register. Thus code generation inserts
// an xxswapd after each such load, and prior to each such store.
//
// The extra xxswapd instructions reduce performance. The purpose
// of this pass is to reduce the number of xxswapd instructions
// required for correctness.
//
// The primary insight is that much code that operates on vectors
// does not care about the relative order of elements in a register,
// so long as the correct memory order is preserved. If we have a
// computation where all input values are provided by lxvd2x/xxswapd,
// all outputs are stored using xxswapd/stxvd2x, and all intermediate
// computations are lane-insensitive (independent of element order),
// then all the xxswapd instructions associated with the loads and
// stores may be removed without changing observable semantics.
//
// This pass uses standard equivalence class infrastructure to create
// maximal webs of computations fitting the above description. Each
// such web is then optimized by removing its unnecessary xxswapd
// instructions.
//
// There are some lane-sensitive operations for which we can still
// permit the optimization, provided we modify those operations
// accordingly. Such operations are identified as using "special
// handling" within this module.
//
//===---------------------------------------------------------------------===//
43
44#include "PPC.h"
45#include "PPCInstrInfo.h"
46#include "PPCTargetMachine.h"
47#include "llvm/ADT/DenseMap.h"
48#include "llvm/ADT/EquivalenceClasses.h"
49#include "llvm/CodeGen/MachineFunctionPass.h"
50#include "llvm/CodeGen/MachineInstrBuilder.h"
51#include "llvm/CodeGen/MachineRegisterInfo.h"
52#include "llvm/Config/llvm-config.h"
53#include "llvm/Support/Debug.h"
54#include "llvm/Support/Format.h"
55#include "llvm/Support/raw_ostream.h"
56
57using namespace llvm;
58
59#define DEBUG_TYPE "ppc-vsx-swaps"
60
61namespace {
62
// A PPCVSXSwapEntry is created for each machine instruction that
// is relevant to a vector computation, i.e., that mentions all or
// part of a vector register.
struct PPCVSXSwapEntry {
  // Pointer to the instruction.
  MachineInstr *VSEMI;

  // Unique ID (position in the swap vector); also the value inserted
  // into the equivalence classes to form webs.
  int VSEId;

  // Attributes of this node, filled in by gatherVectorInstructions()
  // and refined by the later analysis phases.
  unsigned int IsLoad : 1;            // A vector load (LVX, LXVD2X, ...).
  unsigned int IsStore : 1;           // A vector store (STVX, STXVD2X, ...).
  unsigned int IsSwap : 1;            // A doubleword swap, or a permuting
                                      // load/store that implies one.
  unsigned int MentionsPhysVR : 1;    // Mentions a physical vector register;
                                      // poisons the containing web.
  unsigned int IsSwappable : 1;       // Safe inside a lane-permuted region
                                      // (possibly with special handling).
  unsigned int MentionsPartialVR : 1; // Mentions only the FP overlap of a
                                      // vector register without special
                                      // handling; poisons the web.
  unsigned int SpecialHandling : 3;   // One of the SHValues codes describing
                                      // how the instruction must be adjusted.
  unsigned int WebRejected : 1;       // Set on a web's leader entry when the
                                      // web cannot be optimized.
  unsigned int WillRemove : 1;        // This swap has been marked for removal.
};
83
// Codes for the SpecialHandling field of a PPCVSXSwapEntry, identifying
// how a lane-sensitive but still-optimizable instruction must be rewritten
// when its web's swaps are removed (see handleSpecialSwappables()).
enum SHValues {
  SH_NONE = 0,
  SH_EXTRACT,
  SH_INSERT,
  SH_NOSWAP_LD,
  SH_NOSWAP_ST,
  SH_SPLAT,     // Adjust the source element index of a splat.
  SH_XXPERMDI,  // Swap the inputs and adjust the selector of an XXPERMDI.
  SH_COPYWIDEN  // Insert a swap after a widening SUBREG_TO_REG.
};
94
95struct PPCVSXSwapRemoval : public MachineFunctionPass {
96
97 static char ID;
98 const PPCInstrInfo *TII;
99 MachineFunction *MF;
100 MachineRegisterInfo *MRI;
101
102 // Swap entries are allocated in a vector for better performance.
103 std::vector<PPCVSXSwapEntry> SwapVector;
104
105 // A mapping is maintained between machine instructions and
106 // their swap entries. The key is the address of the MI.
107 DenseMap<MachineInstr*, int> SwapMap;
108
109 // Equivalence classes are used to gather webs of related computation.
110 // Swap entries are represented by their VSEId fields.
111 EquivalenceClasses<int> *EC;
112
113 PPCVSXSwapRemoval() : MachineFunctionPass(ID) {}
114
115private:
116 // Initialize data structures.
117 void initialize(MachineFunction &MFParm);
118
119 // Walk the machine instructions to gather vector usage information.
120 // Return true iff vector mentions are present.
121 bool gatherVectorInstructions();
122
123 // Add an entry to the swap vector and swap map.
124 int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);
125
126 // Hunt backwards through COPY and SUBREG_TO_REG chains for a
127 // source register. VecIdx indicates the swap vector entry to
128 // mark as mentioning a physical register if the search leads
129 // to one.
130 unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);
131
132 // Generate equivalence classes for related computations (webs).
133 void formWebs();
134
135 // Analyze webs and determine those that cannot be optimized.
136 void recordUnoptimizableWebs();
137
138 // Record which swap instructions can be safely removed.
139 void markSwapsForRemoval();
140
141 // Remove swaps and update other instructions requiring special
142 // handling. Return true iff any changes are made.
143 bool removeSwaps();
144
145 // Insert a swap instruction from SrcReg to DstReg at the given
146 // InsertPoint.
147 void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
148 unsigned DstReg, unsigned SrcReg);
149
150 // Update instructions requiring special handling.
151 void handleSpecialSwappables(int EntryIdx);
152
153 // Dump a description of the entries in the swap vector.
154 void dumpSwapVector();
155
156 // Return true iff the given register is in the given class.
157 bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
158 if (Register::isVirtualRegister(Reg))
159 return RC->hasSubClassEq(RC: MRI->getRegClass(Reg));
160 return RC->contains(Reg);
161 }
162
163 // Return true iff the given register is a full vector register.
164 bool isVecReg(unsigned Reg) {
165 return (isRegInClass(Reg, RC: &PPC::VSRCRegClass) ||
166 isRegInClass(Reg, RC: &PPC::VRRCRegClass));
167 }
168
169 // Return true iff the given register is a partial vector register.
170 bool isScalarVecReg(unsigned Reg) {
171 return (isRegInClass(Reg, RC: &PPC::VSFRCRegClass) ||
172 isRegInClass(Reg, RC: &PPC::VSSRCRegClass));
173 }
174
175 // Return true iff the given register mentions all or part of a
176 // vector register. Also sets Partial to true if the mention
177 // is for just the floating-point register overlap of the register.
178 bool isAnyVecReg(unsigned Reg, bool &Partial) {
179 if (isScalarVecReg(Reg))
180 Partial = true;
181 return isScalarVecReg(Reg) || isVecReg(Reg);
182 }
183
184public:
185 // Main entry point for this pass.
186 bool runOnMachineFunction(MachineFunction &MF) override {
187 if (skipFunction(F: MF.getFunction()))
188 return false;
189
190 // If we don't have VSX on the subtarget, don't do anything.
191 // Also, on Power 9 the load and store ops preserve element order and so
192 // the swaps are not required.
193 const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
194 if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps())
195 return false;
196
197 bool Changed = false;
198 initialize(MFParm&: MF);
199
200 if (gatherVectorInstructions()) {
201 formWebs();
202 recordUnoptimizableWebs();
203 markSwapsForRemoval();
204 Changed = removeSwaps();
205 }
206
207 // FIXME: See the allocation of EC in initialize().
208 delete EC;
209 return Changed;
210 }
211};
212} // end anonymous namespace
213
214// Initialize data structures for this pass. In particular, clear the
215// swap vector and allocate the equivalence class mapping before
216// processing each function.
217void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
218 MF = &MFParm;
219 MRI = &MF->getRegInfo();
220 TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
221
222 // An initial vector size of 256 appears to work well in practice.
223 // Small/medium functions with vector content tend not to incur a
224 // reallocation at this size. Three of the vector tests in
225 // projects/test-suite reallocate, which seems like a reasonable rate.
226 const int InitialVectorSize(256);
227 SwapVector.clear();
228 SwapVector.reserve(n: InitialVectorSize);
229
230 // FIXME: Currently we allocate EC each time because we don't have
231 // access to the set representation on which to call clear(). Should
232 // consider adding a clear() method to the EquivalenceClasses class.
233 EC = new EquivalenceClasses<int>;
234}
235
// Create an entry in the swap vector for each instruction that mentions
// a full vector register, recording various characteristics of the
// instructions there. Returns true iff the function contains any such
// mention, i.e., iff the rest of the pass has work to do.
bool PPCVSXSwapRemoval::gatherVectorInstructions() {
  bool RelevantFunction = false;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {

      if (MI.isDebugInstr())
        continue;

      bool RelevantInstr = false;
      bool Partial = false;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isReg())
          continue;
        Register Reg = MO.getReg();
        // All operands need to be checked because there are instructions that
        // operate on a partial register and produce a full register (such as
        // XXPERMDIs).
        if (isAnyVecReg(Reg, Partial))
          RelevantInstr = true;
      }

      if (!RelevantInstr)
        continue;

      RelevantFunction = true;

      // Create a SwapEntry initialized to zeros, then fill in the
      // instruction and ID fields before pushing it to the back
      // of the swap vector.
      PPCVSXSwapEntry SwapEntry{};
      int VecIdx = addSwapEntry(&MI, SwapEntry);

      switch(MI.getOpcode()) {
      default:
        // Unless noted otherwise, an instruction is considered
        // safe for the optimization. There are a large number of
        // such true-SIMD instructions (all vector math, logical,
        // select, compare, etc.). However, if the instruction
        // mentions a partial vector register and does not have
        // special handling defined, it is not swappable.
        if (Partial)
          SwapVector[VecIdx].MentionsPartialVR = 1;
        else
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::XXPERMDI: {
        // This is a swap if it is of the form XXPERMDI t, s, s, 2.
        // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
        // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
        // for example. We have to look through chains of COPY and
        // SUBREG_TO_REG to find the real source value for comparison.
        // If the real source value is a physical register, then mark the
        // XXPERMDI as mentioning a physical register.
        int immed = MI.getOperand(3).getImm();
        if (immed == 2) {
          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          // Identical sources make this a plain doubleword swap.
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].IsSwap = 1;
          else {
            // We can still handle these if the two registers are not
            // identical, by adjusting the form of the XXPERMDI.
            SwapVector[VecIdx].IsSwappable = 1;
            SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
          }
          // This is a doubleword splat if it is of the form
          // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
          // must look through chains of copy-likes to find the source
          // register. We turn off the marking for mention of a physical
          // register, because splatting it is safe; the optimization
          // will not swap the value in the physical register. Whether
          // or not the two input registers are identical, we can handle
          // these by adjusting the form of the XXPERMDI.
        } else if (immed == 0 || immed == 3) {

          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;

          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].MentionsPhysVR = 0;

        } else {
          // We can still handle these by adjusting the form of the XXPERMDI.
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
        }
        break;
      }
      case PPC::LVX:
        // Non-permuting loads are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsLoad = 1;
        break;
      case PPC::LXVD2X:
      case PPC::LXVW4X:
        // Permuting loads are marked as both load and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::LXSDX:
      case PPC::LXSSPX:
      case PPC::XFLOADf64:
      case PPC::XFLOADf32:
        // A load of a floating-point value into the high-order half of
        // a vector register is safe, provided that we introduce a swap
        // following the load, which will be done by the SUBREG_TO_REG
        // support. So just mark these as safe.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::STVX:
        // Non-permuting stores are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsStore = 1;
        break;
      case PPC::STXVD2X:
      case PPC::STXVW4X:
        // Permuting stores are marked as both store and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsStore = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::COPY:
        // These are fine provided they are moving between full vector
        // register classes.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        // If we have a copy from one scalar floating-point register
        // to another, we can accept this even if it is a physical
        // register. The only way this gets involved is if it feeds
        // a SUBREG_TO_REG, which is handled by introducing a swap.
        else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::SUBREG_TO_REG: {
        // These are fine provided they are moving between full vector
        // register classes. If they are moving from a scalar
        // floating-point class to a vector class, we can handle those
        // as well, provided we introduce a swap. It is generally the
        // case that we will introduce fewer swaps than we remove, but
        // (FIXME) a cost model could be used. However, introduced
        // swaps could potentially be CSEd, so this is not trivial.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        else if (isVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(1).getReg())) {
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
        }
        break;
      }
      case PPC::VSPLTB:
      case PPC::VSPLTH:
      case PPC::VSPLTW:
      case PPC::XXSPLTW:
        // Splats are lane-sensitive, but we can use special handling
        // to adjust the source lane for the splat.
        SwapVector[VecIdx].IsSwappable = 1;
        SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
        break;
      // The presence of the following lane-sensitive operations in a
      // web will kill the optimization, at least for now. For these
      // we do nothing, causing the optimization to fail.
      // FIXME: Some of these could be permitted with special handling,
      // and will be phased in as time permits.
      // FIXME: There is no simple and maintainable way to express a set
      // of opcodes having a common attribute in TableGen. Should this
      // change, this is a prime candidate to use such a mechanism.
      case PPC::INLINEASM:
      case PPC::INLINEASM_BR:
      case PPC::EXTRACT_SUBREG:
      case PPC::INSERT_SUBREG:
      case PPC::COPY_TO_REGCLASS:
      case PPC::LVEBX:
      case PPC::LVEHX:
      case PPC::LVEWX:
      case PPC::LVSL:
      case PPC::LVSR:
      case PPC::LVXL:
      case PPC::STVEBX:
      case PPC::STVEHX:
      case PPC::STVEWX:
      case PPC::STVXL:
        // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
        // by adding special handling for narrowing copies as well as
        // widening ones. However, I've experimented with this, and in
        // practice we currently do not appear to use STXSDX fed by
        // a narrowing copy from a full vector register. Since I can't
        // generate any useful test cases, I've left this alone for now.
      case PPC::STXSDX:
      case PPC::STXSSPX:
      case PPC::VCIPHER:
      case PPC::VCIPHERLAST:
      case PPC::VMRGHB:
      case PPC::VMRGHH:
      case PPC::VMRGHW:
      case PPC::VMRGLB:
      case PPC::VMRGLH:
      case PPC::VMRGLW:
      case PPC::VMULESB:
      case PPC::VMULESH:
      case PPC::VMULESW:
      case PPC::VMULEUB:
      case PPC::VMULEUH:
      case PPC::VMULEUW:
      case PPC::VMULOSB:
      case PPC::VMULOSH:
      case PPC::VMULOSW:
      case PPC::VMULOUB:
      case PPC::VMULOUH:
      case PPC::VMULOUW:
      case PPC::VNCIPHER:
      case PPC::VNCIPHERLAST:
      case PPC::VPERM:
      case PPC::VPERMXOR:
      case PPC::VPKPX:
      case PPC::VPKSHSS:
      case PPC::VPKSHUS:
      case PPC::VPKSDSS:
      case PPC::VPKSDUS:
      case PPC::VPKSWSS:
      case PPC::VPKSWUS:
      case PPC::VPKUDUM:
      case PPC::VPKUDUS:
      case PPC::VPKUHUM:
      case PPC::VPKUHUS:
      case PPC::VPKUWUM:
      case PPC::VPKUWUS:
      case PPC::VPMSUMB:
      case PPC::VPMSUMD:
      case PPC::VPMSUMH:
      case PPC::VPMSUMW:
      case PPC::VRLB:
      case PPC::VRLD:
      case PPC::VRLH:
      case PPC::VRLW:
      case PPC::VSBOX:
      case PPC::VSHASIGMAD:
      case PPC::VSHASIGMAW:
      case PPC::VSL:
      case PPC::VSLDOI:
      case PPC::VSLO:
      case PPC::VSR:
      case PPC::VSRO:
      case PPC::VSUM2SWS:
      case PPC::VSUM4SBS:
      case PPC::VSUM4SHS:
      case PPC::VSUM4UBS:
      case PPC::VSUMSWS:
      case PPC::VUPKHPX:
      case PPC::VUPKHSB:
      case PPC::VUPKHSH:
      case PPC::VUPKHSW:
      case PPC::VUPKLPX:
      case PPC::VUPKLSB:
      case PPC::VUPKLSH:
      case PPC::VUPKLSW:
      case PPC::XXMRGHW:
      case PPC::XXMRGLW:
      // XXSLDWI could be replaced by a general permute with one of three
      // permute control vectors (for shift values 1, 2, 3). However,
      // VPERM has a more restrictive register class.
      case PPC::XXSLDWI:
      case PPC::XSCVDPSPN:
      case PPC::XSCVSPDPN:
      case PPC::MTVSCR:
      case PPC::MFVSCR:
        break;
      }
    }
  }

  if (RelevantFunction) {
    LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
    LLVM_DEBUG(dumpSwapVector());
  }

  return RelevantFunction;
}
534
535// Add an entry to the swap vector and swap map, and make a
536// singleton equivalence class for the entry.
537int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
538 PPCVSXSwapEntry& SwapEntry) {
539 SwapEntry.VSEMI = MI;
540 SwapEntry.VSEId = SwapVector.size();
541 SwapVector.push_back(x: SwapEntry);
542 EC->insert(Data: SwapEntry.VSEId);
543 SwapMap[MI] = SwapEntry.VSEId;
544 return SwapEntry.VSEId;
545}
546
547// This is used to find the "true" source register for an
548// XXPERMDI instruction, since MachineCSE does not handle the
549// "copy-like" operations (Copy and SubregToReg). Returns
550// the original SrcReg unless it is the target of a copy-like
551// operation, in which case we chain backwards through all
552// such operations to the ultimate source register. If a
553// physical register is encountered, we stop the search and
554// flag the swap entry indicated by VecIdx (the original
555// XXPERMDI) as mentioning a physical register.
556unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
557 unsigned VecIdx) {
558 MachineInstr *MI = MRI->getVRegDef(Reg: SrcReg);
559 if (!MI->isCopyLike())
560 return SrcReg;
561
562 assert((MI->isCopy() || MI->isSubregToReg()) &&
563 "bad opcode for lookThruCopyLike");
564 unsigned CopySrcReg = MI->getOperand(i: 1).getReg();
565
566 if (!Register::isVirtualRegister(Reg: CopySrcReg)) {
567 if (!isScalarVecReg(Reg: CopySrcReg))
568 SwapVector[VecIdx].MentionsPhysVR = 1;
569 return CopySrcReg;
570 }
571
572 return lookThruCopyLike(SrcReg: CopySrcReg, VecIdx);
573}
574
575// Generate equivalence classes for related computations (webs) by
576// def-use relationships of virtual registers. Mention of a physical
577// register terminates the generation of equivalence classes as this
578// indicates a use of a parameter, definition of a return value, use
579// of a value returned from a call, or definition of a parameter to a
580// call. Computations with physical register mentions are flagged
581// as such so their containing webs will not be optimized.
582void PPCVSXSwapRemoval::formWebs() {
583
584 LLVM_DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
585
586 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
587
588 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
589
590 LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
591 LLVM_DEBUG(MI->dump());
592
593 // It's sufficient to walk vector uses and join them to their unique
594 // definitions. In addition, check full vector register operands
595 // for physical regs. We exclude partial-vector register operands
596 // because we can handle them if copied to a full vector.
597 for (const MachineOperand &MO : MI->operands()) {
598 if (!MO.isReg())
599 continue;
600
601 Register Reg = MO.getReg();
602 if (!isVecReg(Reg) && !isScalarVecReg(Reg))
603 continue;
604
605 if (!Reg.isVirtual()) {
606 if (!(MI->isCopy() && isScalarVecReg(Reg)))
607 SwapVector[EntryIdx].MentionsPhysVR = 1;
608 continue;
609 }
610
611 if (!MO.isUse())
612 continue;
613
614 MachineInstr* DefMI = MRI->getVRegDef(Reg);
615 assert(SwapMap.contains(DefMI) &&
616 "Inconsistency: def of vector reg not found in swap map!");
617 int DefIdx = SwapMap[DefMI];
618 (void)EC->unionSets(V1: SwapVector[DefIdx].VSEId,
619 V2: SwapVector[EntryIdx].VSEId);
620
621 LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
622 SwapVector[DefIdx].VSEId,
623 SwapVector[EntryIdx].VSEId));
624 LLVM_DEBUG(dbgs() << " Def: ");
625 LLVM_DEBUG(DefMI->dump());
626 }
627 }
628}
629
// Walk the swap vector entries looking for conditions that prevent their
// containing computations from being optimized. When such conditions are
// found, mark the representative (leader) of the computation's equivalence
// class as rejected.
void PPCVSXSwapRemoval::recordUnoptimizableWebs() {

  LLVM_DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
    int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

    // If representative is already rejected, don't waste further time.
    if (SwapVector[Repr].WebRejected)
      continue;

    // Reject webs containing mentions of physical or partial registers, or
    // containing operations that we don't know how to handle in a lane-
    // permuted region.
    if (SwapVector[EntryIdx].MentionsPhysVR ||
        SwapVector[EntryIdx].MentionsPartialVR ||
        !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {

      SwapVector[Repr].WebRejected = 1;

      LLVM_DEBUG(
          dbgs() << format("Web %d rejected for physreg, partial reg, or not "
                           "swap[pable]\n",
                           Repr));
      LLVM_DEBUG(dbgs() << "  in " << EntryIdx << ": ");
      LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
      LLVM_DEBUG(dbgs() << "\n");
    }

    // Reject webs that contain swapping loads that feed something other
    // than a swap instruction.
    else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      // Operand 0 of the load is the register it defines.
      Register DefReg = MI->getOperand(0).getReg();

      // We skip debug instructions in the analysis. (Note that debug
      // location information is still maintained by this optimization
      // because it remains on the LXVD2X and STXVD2X instructions after
      // the XXPERMDIs are removed.)
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        // The only permitted consumer of a permuting load is a pure
        // swap (not itself a load or store).
        if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
            SwapVector[UseIdx].IsStore) {

          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(dbgs() << format(
                         "Web %d rejected for load not feeding swap\n", Repr));
          LLVM_DEBUG(dbgs() << "  def " << EntryIdx << ": ");
          LLVM_DEBUG(MI->dump());
          LLVM_DEBUG(dbgs() << "  use " << UseIdx << ": ");
          LLVM_DEBUG(UseMI.dump());
          LLVM_DEBUG(dbgs() << "\n");
        }

        // It is possible that the load feeds a swap and that swap feeds a
        // store. In such a case, the code is actually trying to store a swapped
        // vector. We must reject such webs.
        if (SwapVector[UseIdx].IsSwap && !SwapVector[UseIdx].IsLoad &&
            !SwapVector[UseIdx].IsStore) {
          Register SwapDefReg = UseMI.getOperand(0).getReg();
          for (MachineInstr &UseOfUseMI :
               MRI->use_nodbg_instructions(SwapDefReg)) {
            int UseOfUseIdx = SwapMap[&UseOfUseMI];
            if (SwapVector[UseOfUseIdx].IsStore) {
              SwapVector[Repr].WebRejected = 1;
              LLVM_DEBUG(
                  dbgs() << format(
                      "Web %d rejected for load/swap feeding a store\n", Repr));
              LLVM_DEBUG(dbgs() << "  def " << EntryIdx << ": ");
              LLVM_DEBUG(MI->dump());
              LLVM_DEBUG(dbgs() << "  use " << UseIdx << ": ");
              LLVM_DEBUG(UseMI.dump());
              LLVM_DEBUG(dbgs() << "\n");
            }
          }
        }
      }

    // Reject webs that contain swapping stores that are fed by something
    // other than a swap instruction.
    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      // Operand 0 of the store is the register holding the stored value.
      Register UseReg = MI->getOperand(0).getReg();
      MachineInstr *DefMI = MRI->getVRegDef(UseReg);
      Register DefReg = DefMI->getOperand(0).getReg();
      int DefIdx = SwapMap[DefMI];

      // The store's input must be produced by a pure swap.
      if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
          SwapVector[DefIdx].IsStore) {

        SwapVector[Repr].WebRejected = 1;

        LLVM_DEBUG(dbgs() << format(
                       "Web %d rejected for store not fed by swap\n", Repr));
        LLVM_DEBUG(dbgs() << "  def " << DefIdx << ": ");
        LLVM_DEBUG(DefMI->dump());
        LLVM_DEBUG(dbgs() << "  use " << EntryIdx << ": ");
        LLVM_DEBUG(MI->dump());
        LLVM_DEBUG(dbgs() << "\n");
      }

      // Ensure all uses of the register defined by DefMI feed store
      // instructions (of the same kind as this one).
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(
              dbgs() << format(
                  "Web %d rejected for swap not feeding only stores\n", Repr));
          LLVM_DEBUG(dbgs() << "  def "
                            << " : ");
          LLVM_DEBUG(DefMI->dump());
          LLVM_DEBUG(dbgs() << "  use " << UseIdx << ": ");
          LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
          LLVM_DEBUG(dbgs() << "\n");
        }
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
  LLVM_DEBUG(dumpSwapVector());
}
762
763// Walk the swap vector entries looking for swaps fed by permuting loads
764// and swaps that feed permuting stores. If the containing computation
765// has not been marked rejected, mark each such swap for removal.
766// (Removal is delayed in case optimization has disturbed the pattern,
767// such that multiple loads feed the same swap, etc.)
768void PPCVSXSwapRemoval::markSwapsForRemoval() {
769
770 LLVM_DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
771
772 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
773
774 if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
775 int Repr = EC->getLeaderValue(V: SwapVector[EntryIdx].VSEId);
776
777 if (!SwapVector[Repr].WebRejected) {
778 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
779 Register DefReg = MI->getOperand(i: 0).getReg();
780
781 for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg: DefReg)) {
782 int UseIdx = SwapMap[&UseMI];
783 SwapVector[UseIdx].WillRemove = 1;
784
785 LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
786 LLVM_DEBUG(UseMI.dump());
787 }
788 }
789
790 } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
791 int Repr = EC->getLeaderValue(V: SwapVector[EntryIdx].VSEId);
792
793 if (!SwapVector[Repr].WebRejected) {
794 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
795 Register UseReg = MI->getOperand(i: 0).getReg();
796 MachineInstr *DefMI = MRI->getVRegDef(Reg: UseReg);
797 int DefIdx = SwapMap[DefMI];
798 SwapVector[DefIdx].WillRemove = 1;
799
800 LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
801 LLVM_DEBUG(DefMI->dump());
802 }
803
804 } else if (SwapVector[EntryIdx].IsSwappable &&
805 SwapVector[EntryIdx].SpecialHandling != 0) {
806 int Repr = EC->getLeaderValue(V: SwapVector[EntryIdx].VSEId);
807
808 if (!SwapVector[Repr].WebRejected)
809 handleSpecialSwappables(EntryIdx);
810 }
811 }
812}
813
814// Create an xxswapd instruction and insert it prior to the given point.
815// MI is used to determine basic block and debug loc information.
816// FIXME: When inserting a swap, we should check whether SrcReg is
817// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
818// then instead we should generate a copy from Reg to DstReg.
819void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
820 MachineBasicBlock::iterator InsertPoint,
821 unsigned DstReg, unsigned SrcReg) {
822 BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
823 MCID: TII->get(Opcode: PPC::XXPERMDI), DestReg: DstReg)
824 .addReg(RegNo: SrcReg)
825 .addReg(RegNo: SrcReg)
826 .addImm(Val: 2);
827}
828
// The identified swap entry requires special handling to allow its
// containing computation to be optimized. Perform that handling
// here.
// EntryIdx indexes SwapVector; the entry's SpecialHandling code
// selects which rewrite is applied to its instruction (VSEMI).
// FIXME: Additional opportunities will be phased in with subsequent
// patches.
void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
  switch (SwapVector[EntryIdx].SpecialHandling) {

  default:
    llvm_unreachable("Unexpected special handling type");

  // For splats based on an index into a vector, add N/2 modulo N
  // to the index, where N is the number of vector elements.
  case SHValues::SH_SPLAT: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
    unsigned NElts;

    LLVM_DEBUG(dbgs() << "Changing splat: ");
    LLVM_DEBUG(MI->dump());

    // The element count gives the modulus for the lane rotation below.
    switch (MI->getOpcode()) {
    default:
      llvm_unreachable("Unexpected splat opcode");
    case PPC::VSPLTB: NElts = 16; break;
    case PPC::VSPLTH: NElts = 8; break;
    case PPC::VSPLTW:
    case PPC::XXSPLTW: NElts = 4; break;
    }

    // XXSPLTW carries its element number in operand 2; the VSPLT*
    // forms carry it in operand 1.
    unsigned EltNo;
    if (MI->getOpcode() == PPC::XXSPLTW)
      EltNo = MI->getOperand(i: 2).getImm();
    else
      EltNo = MI->getOperand(i: 1).getImm();

    // Rotate the element index by half the vector length, compensating
    // for the doubleword swap that will no longer be performed.
    EltNo = (EltNo + NElts / 2) % NElts;
    if (MI->getOpcode() == PPC::XXSPLTW)
      MI->getOperand(i: 2).setImm(EltNo);
    else
      MI->getOperand(i: 1).setImm(EltNo);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For an XXPERMDI that isn't handled otherwise, we need to
  // reverse the order of the operands. If the selector operand
  // has a value of 0 or 3, we need to change it to 3 or 0,
  // respectively. Otherwise we should leave it alone. (This
  // is equivalent to reversing the two bits of the selector
  // operand and complementing the result.)
  case SHValues::SH_XXPERMDI: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
    LLVM_DEBUG(MI->dump());

    // Selector values 1 and 2 map to themselves under this transform,
    // so only 0 and 3 need adjusting.
    unsigned Selector = MI->getOperand(i: 3).getImm();
    if (Selector == 0 || Selector == 3)
      Selector = 3 - Selector;
    MI->getOperand(i: 3).setImm(Selector);

    // Exchange the two source registers.
    Register Reg1 = MI->getOperand(i: 1).getReg();
    Register Reg2 = MI->getOperand(i: 2).getReg();
    MI->getOperand(i: 1).setReg(Reg2);
    MI->getOperand(i: 2).setReg(Reg1);

    // We also need to swap kill flag associated with the register.
    // setReg() above does not move the operands' flags, so after the
    // register exchange each kill flag still sits with the wrong
    // register; exchange the flags so each follows its register.
    bool IsKill1 = MI->getOperand(i: 1).isKill();
    bool IsKill2 = MI->getOperand(i: 2).isKill();
    MI->getOperand(i: 1).setIsKill(IsKill2);
    MI->getOperand(i: 2).setIsKill(IsKill1);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For a copy from a scalar floating-point register to a vector
  // register, removing swaps will leave the copied value in the
  // wrong lane. Insert a swap following the copy to fix this.
  case SHValues::SH_COPYWIDEN: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
    LLVM_DEBUG(MI->dump());

    // Redirect the SUBREG_TO_REG to a fresh vreg; the swap inserted
    // below produces the original destination from that vreg.
    Register DstReg = MI->getOperand(i: 0).getReg();
    const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg);
    Register NewVReg = MRI->createVirtualRegister(RegClass: DstRC);

    MI->getOperand(i: 0).setReg(NewVReg);
    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());

    // New instructions go immediately after MI.
    auto InsertPoint = ++MachineBasicBlock::iterator(MI);

    // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
    // is copying to a VRRC, we need to be careful to avoid a register
    // assignment problem. In this case we must copy from VRRC to VSRC
    // prior to the swap, and from VSRC to VRRC following the swap.
    // Coalescing will usually remove all this mess.
    if (DstRC == &PPC::VRRCRegClass) {
      Register VSRCTmp1 = MRI->createVirtualRegister(RegClass: &PPC::VSRCRegClass);
      Register VSRCTmp2 = MRI->createVirtualRegister(RegClass: &PPC::VSRCRegClass);

      // VRRC -> VSRC copy before the swap.
      BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
              MCID: TII->get(Opcode: PPC::COPY), DestReg: VSRCTmp1)
          .addReg(RegNo: NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      insertSwap(MI, InsertPoint, DstReg: VSRCTmp2, SrcReg: VSRCTmp1);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      // VSRC -> VRRC copy after the swap, restoring the expected class.
      BuildMI(BB&: *MI->getParent(), I: InsertPoint, MIMD: MI->getDebugLoc(),
              MCID: TII->get(Opcode: PPC::COPY), DestReg: DstReg)
          .addReg(RegNo: VSRCTmp2);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

    } else {
      // Destination is already VSX-capable; a single swap suffices.
      insertSwap(MI, InsertPoint, DstReg, SrcReg: NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());
    }
    break;
  }
  }
}
957
958// Walk the swap vector and replace each entry marked for removal with
959// a copy operation.
960bool PPCVSXSwapRemoval::removeSwaps() {
961
962 LLVM_DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
963
964 bool Changed = false;
965
966 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
967 if (SwapVector[EntryIdx].WillRemove) {
968 Changed = true;
969 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
970 MachineBasicBlock *MBB = MI->getParent();
971 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
972 DestReg: MI->getOperand(i: 0).getReg())
973 .add(MO: MI->getOperand(i: 1));
974
975 LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
976 SwapVector[EntryIdx].VSEId));
977 LLVM_DEBUG(MI->dump());
978
979 MI->eraseFromParent();
980 }
981 }
982
983 return Changed;
984}
985
986#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
987// For debug purposes, dump the contents of the swap vector.
988LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
989
990 for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
991
992 MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
993 int ID = SwapVector[EntryIdx].VSEId;
994
995 dbgs() << format("%6d", ID);
996 dbgs() << format("%6d", EC->getLeaderValue(ID));
997 dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
998 dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());
999
1000 if (SwapVector[EntryIdx].IsLoad)
1001 dbgs() << "load ";
1002 if (SwapVector[EntryIdx].IsStore)
1003 dbgs() << "store ";
1004 if (SwapVector[EntryIdx].IsSwap)
1005 dbgs() << "swap ";
1006 if (SwapVector[EntryIdx].MentionsPhysVR)
1007 dbgs() << "physreg ";
1008 if (SwapVector[EntryIdx].MentionsPartialVR)
1009 dbgs() << "partialreg ";
1010
1011 if (SwapVector[EntryIdx].IsSwappable) {
1012 dbgs() << "swappable ";
1013 switch(SwapVector[EntryIdx].SpecialHandling) {
1014 default:
1015 dbgs() << "special:**unknown**";
1016 break;
1017 case SH_NONE:
1018 break;
1019 case SH_EXTRACT:
1020 dbgs() << "special:extract ";
1021 break;
1022 case SH_INSERT:
1023 dbgs() << "special:insert ";
1024 break;
1025 case SH_NOSWAP_LD:
1026 dbgs() << "special:load ";
1027 break;
1028 case SH_NOSWAP_ST:
1029 dbgs() << "special:store ";
1030 break;
1031 case SH_SPLAT:
1032 dbgs() << "special:splat ";
1033 break;
1034 case SH_XXPERMDI:
1035 dbgs() << "special:xxpermdi ";
1036 break;
1037 case SH_COPYWIDEN:
1038 dbgs() << "special:copywiden ";
1039 break;
1040 }
1041 }
1042
1043 if (SwapVector[EntryIdx].WebRejected)
1044 dbgs() << "rejected ";
1045 if (SwapVector[EntryIdx].WillRemove)
1046 dbgs() << "remove ";
1047
1048 dbgs() << "\n";
1049
1050 // For no-asserts builds.
1051 (void)MI;
1052 (void)ID;
1053 }
1054
1055 dbgs() << "\n";
1056}
1057#endif
1058
// Register the pass with the legacy pass manager.  The two trailing
// 'false' arguments mark the pass as neither CFG-only nor an analysis
// — presumably per the INITIALIZE_PASS macro convention; confirm
// against llvm/PassSupport.h.
INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
                      "PowerPC VSX Swap Removal", false, false)
INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
                    "PowerPC VSX Swap Removal", false, false)

// Unique address used as the pass's identity by the legacy pass
// infrastructure; the value itself is irrelevant.
char PPCVSXSwapRemoval::ID = 0;

// Factory entry point used by the PPC target to add this pass to the
// codegen pipeline.
FunctionPass*
llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); }
1067