//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This pass analyzes vector computations and removes unnecessary
// doubleword swaps (xxswapd instructions). This pass is performed
// only for little-endian VSX code generation.
//
// For this specific case, loads and stores of v4i32, v4f32, v2i64,
// and v2f64 vectors are inefficient. These are implemented using
// the lxvd2x and stxvd2x instructions, which invert the order of
// doublewords in a vector register. Thus code generation inserts
// an xxswapd after each such load, and prior to each such store.
//
// The extra xxswapd instructions reduce performance. The purpose
// of this pass is to reduce the number of xxswapd instructions
// required for correctness.
//
// The primary insight is that much code that operates on vectors
// does not care about the relative order of elements in a register,
// so long as the correct memory order is preserved. If we have a
// computation where all input values are provided by lxvd2x/xxswapd,
// all outputs are stored using xxswapd/stxvd2x, and all intermediate
// computations are lane-insensitive (independent of element order),
// then all the xxswapd instructions associated with the loads and
// stores may be removed without changing observable semantics.
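//
// As an illustrative sketch (the register numbers and the use of
// XVADDDP are hypothetical, not taken from any particular test case),
// a web such as
//
//   %v0 = LXVD2X ...               ; load arrives doubleword-swapped
//   %v1 = XXPERMDI %v0, %v0, 2     ; xxswapd following the load
//   %v2 = XVADDDP %v1, %v1         ; lane-insensitive arithmetic
//   %v3 = XXPERMDI %v2, %v2, 2     ; xxswapd prior to the store
//   STXVD2X %v3, ...               ; store swaps the doublewords back
//
// can have both XXPERMDIs removed, since the addition produces the
// same values whether or not the doublewords are swapped.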
//
// This pass uses standard equivalence class infrastructure to create
// maximal webs of computations fitting the above description. Each
// such web is then optimized by removing its unnecessary xxswapd
// instructions.
//
// There are some lane-sensitive operations for which we can still
// permit the optimization, provided we modify those operations
// accordingly. Such operations are identified as using "special
// handling" within this module.
//
//===---------------------------------------------------------------------===//

#include "PPC.h"
#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "ppc-vsx-swaps"

namespace {

// A PPCVSXSwapEntry is created for each machine instruction that
// is relevant to a vector computation.
struct PPCVSXSwapEntry {
  // Pointer to the instruction.
  MachineInstr *VSEMI;

  // Unique ID (position in the swap vector).
  int VSEId;

  // Attributes of this node.
  unsigned int IsLoad : 1;
  unsigned int IsStore : 1;
  unsigned int IsSwap : 1;
  unsigned int MentionsPhysVR : 1;
  unsigned int IsSwappable : 1;
  unsigned int MentionsPartialVR : 1;
  unsigned int SpecialHandling : 3;
  unsigned int WebRejected : 1;
  unsigned int WillRemove : 1;
};

enum SHValues {
  SH_NONE = 0,
  SH_EXTRACT,
  SH_INSERT,
  SH_NOSWAP_LD,
  SH_NOSWAP_ST,
  SH_SPLAT,
  SH_XXPERMDI,
  SH_COPYWIDEN
};

struct PPCVSXSwapRemoval : public MachineFunctionPass {

  static char ID;
  const PPCInstrInfo *TII;
  MachineFunction *MF;
  MachineRegisterInfo *MRI;

  // Swap entries are allocated in a vector for better performance.
  std::vector<PPCVSXSwapEntry> SwapVector;

  // A mapping is maintained between machine instructions and
  // their swap entries. The key is the address of the MI.
  DenseMap<MachineInstr*, int> SwapMap;

  // Equivalence classes are used to gather webs of related computation.
  // Swap entries are represented by their VSEId fields.
  EquivalenceClasses<int> *EC;

  PPCVSXSwapRemoval() : MachineFunctionPass(ID) {}

private:
  // Initialize data structures.
  void initialize(MachineFunction &MFParm);

  // Walk the machine instructions to gather vector usage information.
  // Return true iff vector mentions are present.
  bool gatherVectorInstructions();

  // Add an entry to the swap vector and swap map.
  int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);

  // Hunt backwards through COPY and SUBREG_TO_REG chains for a
  // source register. VecIdx indicates the swap vector entry to
  // mark as mentioning a physical register if the search leads
  // to one.
  unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);

  // Generate equivalence classes for related computations (webs).
  void formWebs();

  // Analyze webs and determine those that cannot be optimized.
  void recordUnoptimizableWebs();

  // Record which swap instructions can be safely removed.
  void markSwapsForRemoval();

  // Remove swaps and update other instructions requiring special
  // handling. Return true iff any changes are made.
  bool removeSwaps();

  // Insert a swap instruction from SrcReg to DstReg at the given
  // InsertPoint.
  void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
                  unsigned DstReg, unsigned SrcReg);

  // Update instructions requiring special handling.
  void handleSpecialSwappables(int EntryIdx);

  // Dump a description of the entries in the swap vector.
  void dumpSwapVector();

  // Return true iff the given register is in the given class.
  bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
    if (Register::isVirtualRegister(Reg))
      return RC->hasSubClassEq(MRI->getRegClass(Reg));
    return RC->contains(Reg);
  }

  // Return true iff the given register is a full vector register.
  bool isVecReg(unsigned Reg) {
    return (isRegInClass(Reg, &PPC::VSRCRegClass) ||
            isRegInClass(Reg, &PPC::VRRCRegClass));
  }

  // Return true iff the given register is a partial vector register.
  bool isScalarVecReg(unsigned Reg) {
    return (isRegInClass(Reg, &PPC::VSFRCRegClass) ||
            isRegInClass(Reg, &PPC::VSSRCRegClass));
  }

  // Return true iff the given register mentions all or part of a
  // vector register. Also sets Partial to true if the mention
  // is for just the floating-point register overlap of the register.
  bool isAnyVecReg(unsigned Reg, bool &Partial) {
    if (isScalarVecReg(Reg))
      Partial = true;
    return isScalarVecReg(Reg) || isVecReg(Reg);
  }

public:
  // Main entry point for this pass.
  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;

    // If we don't have VSX on the subtarget, don't do anything.
    // Also, on Power 9 the load and store ops preserve element order and so
    // the swaps are not required.
    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
    if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps())
      return false;

    bool Changed = false;
    initialize(MF);

    if (gatherVectorInstructions()) {
      formWebs();
      recordUnoptimizableWebs();
      markSwapsForRemoval();
      Changed = removeSwaps();
    }

    // FIXME: See the allocation of EC in initialize().
    delete EC;
    return Changed;
  }
};
} // end anonymous namespace

// Initialize data structures for this pass. In particular, clear the
// swap vector and allocate the equivalence class mapping before
// processing each function.
void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
  MF = &MFParm;
  MRI = &MF->getRegInfo();
  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();

  // An initial vector size of 256 appears to work well in practice.
  // Small/medium functions with vector content tend not to incur a
  // reallocation at this size. Three of the vector tests in
  // projects/test-suite reallocate, which seems like a reasonable rate.
  const int InitialVectorSize(256);
  SwapVector.clear();
  SwapVector.reserve(InitialVectorSize);

  // FIXME: Currently we allocate EC each time because we don't have
  // access to the set representation on which to call clear(). Should
  // consider adding a clear() method to the EquivalenceClasses class.
  EC = new EquivalenceClasses<int>;
}

// Create an entry in the swap vector for each instruction that mentions
// a full vector register, recording various characteristics of the
// instructions there.
bool PPCVSXSwapRemoval::gatherVectorInstructions() {
  bool RelevantFunction = false;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {

      if (MI.isDebugInstr())
        continue;

      bool RelevantInstr = false;
      bool Partial = false;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isReg())
          continue;
        Register Reg = MO.getReg();
        // All operands need to be checked because there are instructions that
        // operate on a partial register and produce a full register (such as
        // XXPERMDIs).
        if (isAnyVecReg(Reg, Partial))
          RelevantInstr = true;
      }

      if (!RelevantInstr)
        continue;

      RelevantFunction = true;

      // Create a SwapEntry initialized to zeros, then fill in the
      // instruction and ID fields before pushing it to the back
      // of the swap vector.
      PPCVSXSwapEntry SwapEntry{};
      int VecIdx = addSwapEntry(&MI, SwapEntry);

      switch(MI.getOpcode()) {
      default:
        // Unless noted otherwise, an instruction is considered
        // safe for the optimization. There are a large number of
        // such true-SIMD instructions (all vector math, logical,
        // select, compare, etc.). However, if the instruction
        // mentions a partial vector register and does not have
        // special handling defined, it is not swappable.
        if (Partial)
          SwapVector[VecIdx].MentionsPartialVR = 1;
        else
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::XXPERMDI: {
        // This is a swap if it is of the form XXPERMDI t, s, s, 2.
        // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
        // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
        // for example. We have to look through chains of COPY and
        // SUBREG_TO_REG to find the real source value for comparison.
        // If the real source value is a physical register, then mark the
        // XXPERMDI as mentioning a physical register.
        int immed = MI.getOperand(3).getImm();
        if (immed == 2) {
          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].IsSwap = 1;
          else {
            // We can still handle these if the two registers are not
            // identical, by adjusting the form of the XXPERMDI.
            SwapVector[VecIdx].IsSwappable = 1;
            SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
          }
        // This is a doubleword splat if it is of the form
        // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
        // must look through chains of copy-likes to find the source
        // register. We turn off the marking for mention of a physical
        // register, because splatting it is safe; the optimization
        // will not swap the value in the physical register. Whether
        // or not the two input registers are identical, we can handle
        // these by adjusting the form of the XXPERMDI.
        } else if (immed == 0 || immed == 3) {

          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;

          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
                                               VecIdx);
          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
                                               VecIdx);
          if (trueReg1 == trueReg2)
            SwapVector[VecIdx].MentionsPhysVR = 0;

        } else {
          // We can still handle these by adjusting the form of the XXPERMDI.
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
        }
        break;
      }
      case PPC::LVX:
        // Non-permuting loads are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsLoad = 1;
        break;
      case PPC::LXVD2X:
      case PPC::LXVW4X:
        // Permuting loads are marked as both load and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::LXSDX:
      case PPC::LXSSPX:
      case PPC::XFLOADf64:
      case PPC::XFLOADf32:
        // A load of a floating-point value into the high-order half of
        // a vector register is safe, provided that we introduce a swap
        // following the load, which will be done by the SUBREG_TO_REG
        // support. So just mark these as safe.
        SwapVector[VecIdx].IsLoad = 1;
        SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::STVX:
        // Non-permuting stores are currently unsafe. We can use special
        // handling for this in the future. By not marking these as
        // IsSwap, we ensure computations containing them will be rejected
        // for now.
        SwapVector[VecIdx].IsStore = 1;
        break;
      case PPC::STXVD2X:
      case PPC::STXVW4X:
        // Permuting stores are marked as both store and swap, and are
        // safe for optimization.
        SwapVector[VecIdx].IsStore = 1;
        SwapVector[VecIdx].IsSwap = 1;
        break;
      case PPC::COPY:
        // These are fine provided they are moving between full vector
        // register classes.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        // If we have a copy from one scalar floating-point register
        // to another, we can accept this even if it is a physical
        // register. The only way this gets involved is if it feeds
        // a SUBREG_TO_REG, which is handled by introducing a swap.
        else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(1).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        break;
      case PPC::SUBREG_TO_REG: {
        // These are fine provided they are moving between full vector
        // register classes. If they are moving from a scalar
        // floating-point class to a vector class, we can handle those
        // as well, provided we introduce a swap. It is generally the
        // case that we will introduce fewer swaps than we remove, but
        // (FIXME) a cost model could be used. However, introduced
        // swaps could potentially be CSEd, so this is not trivial.
        if (isVecReg(MI.getOperand(0).getReg()) &&
            isVecReg(MI.getOperand(2).getReg()))
          SwapVector[VecIdx].IsSwappable = 1;
        else if (isVecReg(MI.getOperand(0).getReg()) &&
                 isScalarVecReg(MI.getOperand(2).getReg())) {
          SwapVector[VecIdx].IsSwappable = 1;
          SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
        }
        break;
      }
      case PPC::VSPLTB:
      case PPC::VSPLTH:
      case PPC::VSPLTW:
      case PPC::XXSPLTW:
        // Splats are lane-sensitive, but we can use special handling
        // to adjust the source lane for the splat.
        SwapVector[VecIdx].IsSwappable = 1;
        SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
        break;
      // The presence of the following lane-sensitive operations in a
      // web will kill the optimization, at least for now. For these
      // we do nothing, causing the optimization to fail.
      // FIXME: Some of these could be permitted with special handling,
      // and will be phased in as time permits.
      // FIXME: There is no simple and maintainable way to express a set
      // of opcodes having a common attribute in TableGen. Should this
      // change, this is a prime candidate to use such a mechanism.
      case PPC::INLINEASM:
      case PPC::INLINEASM_BR:
      case PPC::EXTRACT_SUBREG:
      case PPC::INSERT_SUBREG:
      case PPC::COPY_TO_REGCLASS:
      case PPC::LVEBX:
      case PPC::LVEHX:
      case PPC::LVEWX:
      case PPC::LVSL:
      case PPC::LVSR:
      case PPC::LVXL:
      case PPC::STVEBX:
      case PPC::STVEHX:
      case PPC::STVEWX:
      case PPC::STVXL:
        // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
        // by adding special handling for narrowing copies as well as
        // widening ones. However, I've experimented with this, and in
        // practice we currently do not appear to use STXSDX fed by
        // a narrowing copy from a full vector register. Since I can't
        // generate any useful test cases, I've left this alone for now.
      case PPC::STXSDX:
      case PPC::STXSSPX:
      case PPC::VCIPHER:
      case PPC::VCIPHERLAST:
      case PPC::VMRGHB:
      case PPC::VMRGHH:
      case PPC::VMRGHW:
      case PPC::VMRGLB:
      case PPC::VMRGLH:
      case PPC::VMRGLW:
      case PPC::VMULESB:
      case PPC::VMULESH:
      case PPC::VMULESW:
      case PPC::VMULEUB:
      case PPC::VMULEUH:
      case PPC::VMULEUW:
      case PPC::VMULOSB:
      case PPC::VMULOSH:
      case PPC::VMULOSW:
      case PPC::VMULOUB:
      case PPC::VMULOUH:
      case PPC::VMULOUW:
      case PPC::VNCIPHER:
      case PPC::VNCIPHERLAST:
      case PPC::VPERM:
      case PPC::VPERMXOR:
      case PPC::VPKPX:
      case PPC::VPKSHSS:
      case PPC::VPKSHUS:
      case PPC::VPKSDSS:
      case PPC::VPKSDUS:
      case PPC::VPKSWSS:
      case PPC::VPKSWUS:
      case PPC::VPKUDUM:
      case PPC::VPKUDUS:
      case PPC::VPKUHUM:
      case PPC::VPKUHUS:
      case PPC::VPKUWUM:
      case PPC::VPKUWUS:
      case PPC::VPMSUMB:
      case PPC::VPMSUMD:
      case PPC::VPMSUMH:
      case PPC::VPMSUMW:
      case PPC::VRLB:
      case PPC::VRLD:
      case PPC::VRLH:
      case PPC::VRLW:
      case PPC::VSBOX:
      case PPC::VSHASIGMAD:
      case PPC::VSHASIGMAW:
      case PPC::VSL:
      case PPC::VSLDOI:
      case PPC::VSLO:
      case PPC::VSR:
      case PPC::VSRO:
      case PPC::VSUM2SWS:
      case PPC::VSUM4SBS:
      case PPC::VSUM4SHS:
      case PPC::VSUM4UBS:
      case PPC::VSUMSWS:
      case PPC::VUPKHPX:
      case PPC::VUPKHSB:
      case PPC::VUPKHSH:
      case PPC::VUPKHSW:
      case PPC::VUPKLPX:
      case PPC::VUPKLSB:
      case PPC::VUPKLSH:
      case PPC::VUPKLSW:
      case PPC::XXMRGHW:
      case PPC::XXMRGLW:
      // XXSLDWI could be replaced by a general permute with one of three
      // permute control vectors (for shift values 1, 2, 3). However,
      // VPERM has a more restrictive register class.
      case PPC::XXSLDWI:
      case PPC::XSCVDPSPN:
      case PPC::XSCVSPDPN:
      case PPC::MTVSCR:
      case PPC::MFVSCR:
        break;
      }
    }
  }

  if (RelevantFunction) {
    LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
    LLVM_DEBUG(dumpSwapVector());
  }

  return RelevantFunction;
}

// Add an entry to the swap vector and swap map, and make a
// singleton equivalence class for the entry.
int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
                                    PPCVSXSwapEntry& SwapEntry) {
  SwapEntry.VSEMI = MI;
  SwapEntry.VSEId = SwapVector.size();
  SwapVector.push_back(SwapEntry);
  EC->insert(SwapEntry.VSEId);
  SwapMap[MI] = SwapEntry.VSEId;
  return SwapEntry.VSEId;
}

// This is used to find the "true" source register for an
// XXPERMDI instruction, since MachineCSE does not handle the
// "copy-like" operations (Copy and SubregToReg). Returns
// the original SrcReg unless it is the target of a copy-like
// operation, in which case we chain backwards through all
// such operations to the ultimate source register. If a
// physical register is encountered, we stop the search and
// flag the swap entry indicated by VecIdx (the original
// XXPERMDI) as mentioning a physical register.
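// For example, given the hypothetical fragment (register numbers are
// illustrative only):
//   %1 = SUBREG_TO_REG 1, %0, %subreg.sub_64
//   %2 = XXPERMDI %1, %1, 2
// a search starting at %1 looks through the SUBREG_TO_REG and returns
// %0, assuming %0 is not itself defined by a copy-like instruction.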
unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
                                             unsigned VecIdx) {
  MachineInstr *MI = MRI->getVRegDef(SrcReg);
  if (!MI->isCopyLike())
    return SrcReg;

  unsigned CopySrcReg;
  if (MI->isCopy())
    CopySrcReg = MI->getOperand(1).getReg();
  else {
    assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
    CopySrcReg = MI->getOperand(2).getReg();
  }

  if (!Register::isVirtualRegister(CopySrcReg)) {
    if (!isScalarVecReg(CopySrcReg))
      SwapVector[VecIdx].MentionsPhysVR = 1;
    return CopySrcReg;
  }

  return lookThruCopyLike(CopySrcReg, VecIdx);
}

// Generate equivalence classes for related computations (webs) by
// def-use relationships of virtual registers. Mention of a physical
// register terminates the generation of equivalence classes as this
// indicates a use of a parameter, definition of a return value, use
// of a value returned from a call, or definition of a parameter to a
// call. Computations with physical register mentions are flagged
// as such so their containing webs will not be optimized.
void PPCVSXSwapRemoval::formWebs() {

  LLVM_DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {

    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
    LLVM_DEBUG(MI->dump());

    // It's sufficient to walk vector uses and join them to their unique
    // definitions. In addition, check full vector register operands
    // for physical regs. We exclude partial-vector register operands
    // because we can handle them if copied to a full vector.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;

      Register Reg = MO.getReg();
      if (!isVecReg(Reg) && !isScalarVecReg(Reg))
        continue;

      if (!Reg.isVirtual()) {
        if (!(MI->isCopy() && isScalarVecReg(Reg)))
          SwapVector[EntryIdx].MentionsPhysVR = 1;
        continue;
      }

      if (!MO.isUse())
        continue;

      MachineInstr* DefMI = MRI->getVRegDef(Reg);
      assert(SwapMap.contains(DefMI) &&
             "Inconsistency: def of vector reg not found in swap map!");
      int DefIdx = SwapMap[DefMI];
      (void)EC->unionSets(SwapVector[DefIdx].VSEId,
                          SwapVector[EntryIdx].VSEId);

      LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
                                  SwapVector[DefIdx].VSEId,
                                  SwapVector[EntryIdx].VSEId));
      LLVM_DEBUG(dbgs() << " Def: ");
      LLVM_DEBUG(DefMI->dump());
    }
  }
}

// Walk the swap vector entries looking for conditions that prevent their
// containing computations from being optimized. When such conditions are
// found, mark the representative of the computation's equivalence class
// as rejected.
void PPCVSXSwapRemoval::recordUnoptimizableWebs() {

  LLVM_DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
    int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

    // If representative is already rejected, don't waste further time.
    if (SwapVector[Repr].WebRejected)
      continue;

    // Reject webs containing mentions of physical or partial registers, or
    // containing operations that we don't know how to handle in a lane-
    // permuted region.
    if (SwapVector[EntryIdx].MentionsPhysVR ||
        SwapVector[EntryIdx].MentionsPartialVR ||
        !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {

      SwapVector[Repr].WebRejected = 1;

      LLVM_DEBUG(
          dbgs() << format("Web %d rejected for physreg, partial reg, or not "
                           "swap[pable]\n",
                           Repr));
      LLVM_DEBUG(dbgs() << " in " << EntryIdx << ": ");
      LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
      LLVM_DEBUG(dbgs() << "\n");
    }

    // Reject webs that contain swapping loads that feed something other
    // than a swap instruction.
    else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      Register DefReg = MI->getOperand(0).getReg();

      // We skip debug instructions in the analysis. (Note that debug
      // location information is still maintained by this optimization
      // because it remains on the LXVD2X and STXVD2X instructions after
      // the XXPERMDIs are removed.)
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
            SwapVector[UseIdx].IsStore) {

          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(dbgs() << format(
                         "Web %d rejected for load not feeding swap\n", Repr));
          LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
          LLVM_DEBUG(MI->dump());
          LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
          LLVM_DEBUG(UseMI.dump());
          LLVM_DEBUG(dbgs() << "\n");
        }

        // It is possible that the load feeds a swap and that swap feeds a
        // store. In such a case, the code is actually trying to store a
        // swapped vector. We must reject such webs.
        if (SwapVector[UseIdx].IsSwap && !SwapVector[UseIdx].IsLoad &&
            !SwapVector[UseIdx].IsStore) {
          Register SwapDefReg = UseMI.getOperand(0).getReg();
          for (MachineInstr &UseOfUseMI :
               MRI->use_nodbg_instructions(SwapDefReg)) {
            int UseOfUseIdx = SwapMap[&UseOfUseMI];
            if (SwapVector[UseOfUseIdx].IsStore) {
              SwapVector[Repr].WebRejected = 1;
              LLVM_DEBUG(
                  dbgs() << format(
                      "Web %d rejected for load/swap feeding a store\n", Repr));
              LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
              LLVM_DEBUG(MI->dump());
              LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
              LLVM_DEBUG(UseMI.dump());
              LLVM_DEBUG(dbgs() << "\n");
            }
          }
        }
      }

    // Reject webs that contain swapping stores that are fed by something
    // other than a swap instruction.
    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      Register UseReg = MI->getOperand(0).getReg();
      MachineInstr *DefMI = MRI->getVRegDef(UseReg);
      Register DefReg = DefMI->getOperand(0).getReg();
      int DefIdx = SwapMap[DefMI];

      if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
          SwapVector[DefIdx].IsStore) {

        SwapVector[Repr].WebRejected = 1;

        LLVM_DEBUG(dbgs() << format(
                       "Web %d rejected for store not fed by swap\n", Repr));
        LLVM_DEBUG(dbgs() << " def " << DefIdx << ": ");
        LLVM_DEBUG(DefMI->dump());
        LLVM_DEBUG(dbgs() << " use " << EntryIdx << ": ");
        LLVM_DEBUG(MI->dump());
        LLVM_DEBUG(dbgs() << "\n");
      }

      // Ensure that all uses of the register defined by DefMI feed store
      // instructions.
      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
        int UseIdx = SwapMap[&UseMI];

        if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
          SwapVector[Repr].WebRejected = 1;

          LLVM_DEBUG(
              dbgs() << format(
                  "Web %d rejected for swap not feeding only stores\n", Repr));
          LLVM_DEBUG(dbgs() << " def "
                            << " : ");
          LLVM_DEBUG(DefMI->dump());
          LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
          LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
          LLVM_DEBUG(dbgs() << "\n");
        }
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
  LLVM_DEBUG(dumpSwapVector());
}

// Walk the swap vector entries looking for swaps fed by permuting loads
// and swaps that feed permuting stores. If the containing computation
// has not been marked rejected, mark each such swap for removal.
// (Removal is delayed in case optimization has disturbed the pattern,
// such that multiple loads feed the same swap, etc.)
void PPCVSXSwapRemoval::markSwapsForRemoval() {

  LLVM_DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {

    if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

      if (!SwapVector[Repr].WebRejected) {
        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
        Register DefReg = MI->getOperand(0).getReg();

        for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
          int UseIdx = SwapMap[&UseMI];
          SwapVector[UseIdx].WillRemove = 1;

          LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
          LLVM_DEBUG(UseMI.dump());
        }
      }

    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

      if (!SwapVector[Repr].WebRejected) {
        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
        Register UseReg = MI->getOperand(0).getReg();
        MachineInstr *DefMI = MRI->getVRegDef(UseReg);
        int DefIdx = SwapMap[DefMI];
        SwapVector[DefIdx].WillRemove = 1;

        LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
        LLVM_DEBUG(DefMI->dump());
      }

    } else if (SwapVector[EntryIdx].IsSwappable &&
               SwapVector[EntryIdx].SpecialHandling != 0) {
      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);

      if (!SwapVector[Repr].WebRejected)
        handleSpecialSwappables(EntryIdx);
    }
  }
}

// Create an xxswapd instruction and insert it prior to the given point.
// MI is used to determine basic block and debug loc information.
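// The emitted instruction has the canonical swap form used throughout
// this pass, namely DstReg = XXPERMDI SrcReg, SrcReg, 2.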
// FIXME: When inserting a swap, we should check whether SrcReg is
// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
// then instead we should generate a copy from Reg to DstReg.
void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
                                   MachineBasicBlock::iterator InsertPoint,
                                   unsigned DstReg, unsigned SrcReg) {
  BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
          TII->get(PPC::XXPERMDI), DstReg)
      .addReg(SrcReg)
      .addReg(SrcReg)
      .addImm(2);
}

// The identified swap entry requires special handling to allow its
// containing computation to be optimized. Perform that handling
// here.
// FIXME: Additional opportunities will be phased in with subsequent
// patches.
void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
  switch (SwapVector[EntryIdx].SpecialHandling) {

  default:
    llvm_unreachable("Unexpected special handling type");

  // For splats based on an index into a vector, add N/2 modulo N
  // to the index, where N is the number of vector elements.
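  // For example, once the feeding swap is removed, an XXSPLTW of word
  // element 1 (N = 4) must become a splat of word element 3, since the
  // doubleword swap moves that word to the other half of the register.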
  case SHValues::SH_SPLAT: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
    unsigned NElts;

    LLVM_DEBUG(dbgs() << "Changing splat: ");
    LLVM_DEBUG(MI->dump());

    switch (MI->getOpcode()) {
    default:
      llvm_unreachable("Unexpected splat opcode");
    case PPC::VSPLTB: NElts = 16; break;
    case PPC::VSPLTH: NElts = 8; break;
    case PPC::VSPLTW:
    case PPC::XXSPLTW: NElts = 4; break;
    }

    unsigned EltNo;
    if (MI->getOpcode() == PPC::XXSPLTW)
      EltNo = MI->getOperand(2).getImm();
    else
      EltNo = MI->getOperand(1).getImm();

    EltNo = (EltNo + NElts / 2) % NElts;
    if (MI->getOpcode() == PPC::XXSPLTW)
      MI->getOperand(2).setImm(EltNo);
    else
      MI->getOperand(1).setImm(EltNo);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For an XXPERMDI that isn't handled otherwise, we need to
  // reverse the order of the operands. If the selector operand
  // has a value of 0 or 3, we need to change it to 3 or 0,
  // respectively. Otherwise we should leave it alone. (This
  // is equivalent to reversing the two bits of the selector
  // operand and complementing the result.)
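  // For example, XXPERMDI t, a, b, 0 becomes XXPERMDI t, b, a, 3, and
  // XXPERMDI t, a, b, 1 becomes XXPERMDI t, b, a, 1.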
  case SHValues::SH_XXPERMDI: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
    LLVM_DEBUG(MI->dump());

    unsigned Selector = MI->getOperand(3).getImm();
    if (Selector == 0 || Selector == 3)
      Selector = 3 - Selector;
    MI->getOperand(3).setImm(Selector);

    Register Reg1 = MI->getOperand(1).getReg();
    Register Reg2 = MI->getOperand(2).getReg();
    MI->getOperand(1).setReg(Reg2);
    MI->getOperand(2).setReg(Reg1);

    // We also need to swap the kill flags associated with the registers.
    bool IsKill1 = MI->getOperand(1).isKill();
    bool IsKill2 = MI->getOperand(2).isKill();
    MI->getOperand(1).setIsKill(IsKill2);
    MI->getOperand(2).setIsKill(IsKill1);

    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());
    break;
  }

  // For a copy from a scalar floating-point register to a vector
  // register, removing swaps will leave the copied value in the
  // wrong lane. Insert a swap following the copy to fix this.
  case SHValues::SH_COPYWIDEN: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;

    LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
    LLVM_DEBUG(MI->dump());

    Register DstReg = MI->getOperand(0).getReg();
    const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
    Register NewVReg = MRI->createVirtualRegister(DstRC);

    MI->getOperand(0).setReg(NewVReg);
    LLVM_DEBUG(dbgs() << " Into: ");
    LLVM_DEBUG(MI->dump());

    auto InsertPoint = ++MachineBasicBlock::iterator(MI);

    // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
    // is copying to a VRRC, we need to be careful to avoid a register
    // assignment problem. In this case we must copy from VRRC to VSRC
    // prior to the swap, and from VSRC to VRRC following the swap.
    // Coalescing will usually remove all this mess.
    if (DstRC == &PPC::VRRCRegClass) {
      Register VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
      Register VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);

      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
              TII->get(PPC::COPY), VSRCTmp1)
          .addReg(NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
              TII->get(PPC::COPY), DstReg)
          .addReg(VSRCTmp2);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());

    } else {
      insertSwap(MI, InsertPoint, DstReg, NewVReg);
      LLVM_DEBUG(std::prev(InsertPoint)->dump());
    }
    break;
  }
  }
}

// Walk the swap vector and replace each entry marked for removal with
// a copy operation.
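// That is, each swap of the form Reg1 = XXPERMDI Reg2, Reg2, 2 that was
// marked WillRemove is rewritten as Reg1 = COPY Reg2.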
bool PPCVSXSwapRemoval::removeSwaps() {

  LLVM_DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");

  bool Changed = false;

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
    if (SwapVector[EntryIdx].WillRemove) {
      Changed = true;
      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
      MachineBasicBlock *MBB = MI->getParent();
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
              MI->getOperand(0).getReg())
          .add(MI->getOperand(1));

      LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
                                  SwapVector[EntryIdx].VSEId));
      LLVM_DEBUG(MI->dump());

      MI->eraseFromParent();
    }
  }

  return Changed;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// For debug purposes, dump the contents of the swap vector.
LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {

  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {

    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
    int ID = SwapVector[EntryIdx].VSEId;

    dbgs() << format("%6d", ID);
    dbgs() << format("%6d", EC->getLeaderValue(ID));
    dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
    dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());

    if (SwapVector[EntryIdx].IsLoad)
      dbgs() << "load ";
    if (SwapVector[EntryIdx].IsStore)
      dbgs() << "store ";
    if (SwapVector[EntryIdx].IsSwap)
      dbgs() << "swap ";
    if (SwapVector[EntryIdx].MentionsPhysVR)
      dbgs() << "physreg ";
    if (SwapVector[EntryIdx].MentionsPartialVR)
      dbgs() << "partialreg ";

    if (SwapVector[EntryIdx].IsSwappable) {
      dbgs() << "swappable ";
      switch(SwapVector[EntryIdx].SpecialHandling) {
      default:
        dbgs() << "special:**unknown**";
        break;
      case SH_NONE:
        break;
      case SH_EXTRACT:
        dbgs() << "special:extract ";
        break;
      case SH_INSERT:
        dbgs() << "special:insert ";
        break;
      case SH_NOSWAP_LD:
        dbgs() << "special:load ";
        break;
      case SH_NOSWAP_ST:
        dbgs() << "special:store ";
        break;
      case SH_SPLAT:
        dbgs() << "special:splat ";
        break;
      case SH_XXPERMDI:
        dbgs() << "special:xxpermdi ";
        break;
      case SH_COPYWIDEN:
        dbgs() << "special:copywiden ";
        break;
      }
    }

    if (SwapVector[EntryIdx].WebRejected)
      dbgs() << "rejected ";
    if (SwapVector[EntryIdx].WillRemove)
      dbgs() << "remove ";

    dbgs() << "\n";

    // For no-asserts builds.
    (void)MI;
    (void)ID;
  }

  dbgs() << "\n";
}
#endif

INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
                      "PowerPC VSX Swap Removal", false, false)
INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
                    "PowerPC VSX Swap Removal", false, false)

char PPCVSXSwapRemoval::ID = 0;
FunctionPass*
llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); }