1//===- X86CompressEVEX.cpp ------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
10// when possible in order to reduce code size or facilitate HW decoding.
11//
12// Possible compression:
13// a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
14// b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
15// c. NDD (EVEX) -> non-NDD (legacy)
16// d. NF_ND (EVEX) -> NF (EVEX)
17// e. NonNF (EVEX) -> NF (EVEX)
18// f. SETZUCCm (EVEX) -> SETCCm (legacy)
19// g. VPMOV*2M (EVEX) + KMOV -> VMOVMSK/VPMOVMSKB (VEX)
20//
21// Compression a, b and c can always reduce code size, with some exceptions
22// such as promoted 16-bit CRC32 which is as long as the legacy version.
23//
24// legacy:
25// crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
26// promoted:
27// crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
28//
// From a performance perspective, these should be the same (same uops and same
// EXE ports). From an FMV perspective, the older legacy encoding is preferred
// because it can execute in more places (broader HW install base). So we will
// still do the compression.
33//
34// Compression d can help hardware decode (HW may skip reading the NDD
35// register) although the instruction length remains unchanged.
36//
37// Compression e can help hardware skip updating EFLAGS although the instruction
38// length remains unchanged.
39//===----------------------------------------------------------------------===//
40
41#include "MCTargetDesc/X86BaseInfo.h"
42#include "X86.h"
43#include "X86InstrInfo.h"
44#include "X86Subtarget.h"
45#include "llvm/ADT/SmallVector.h"
46#include "llvm/ADT/StringRef.h"
47#include "llvm/CodeGen/MachineFunction.h"
48#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
49#include "llvm/CodeGen/MachineFunctionPass.h"
50#include "llvm/CodeGen/MachineInstr.h"
51#include "llvm/CodeGen/MachineOperand.h"
52#include "llvm/CodeGen/MachinePassManager.h"
53#include "llvm/IR/Analysis.h"
54#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Pass.h"
56#include <atomic>
57#include <cassert>
58#include <cstdint>
59
60using namespace llvm;
61
62#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
63#define COMP_EVEX_NAME "x86-compress-evex"
64
65#define DEBUG_TYPE COMP_EVEX_NAME
66
67extern cl::opt<bool> X86EnableAPXForRelocation;
68
69namespace {
70// Including the generated EVEX compression tables.
71#define GET_X86_COMPRESS_EVEX_TABLE
72#include "X86GenInstrMapping.inc"
73
74class CompressEVEXLegacy : public MachineFunctionPass {
75public:
76 static char ID;
77 CompressEVEXLegacy() : MachineFunctionPass(ID) {}
78 StringRef getPassName() const override { return COMP_EVEX_DESC; }
79
80 bool runOnMachineFunction(MachineFunction &MF) override;
81
82 // This pass runs after regalloc and doesn't support VReg operands.
83 MachineFunctionProperties getRequiredProperties() const override {
84 return MachineFunctionProperties().setNoVRegs();
85 }
86};
87
88} // end anonymous namespace
89
// Out-of-line definition of the static pass ID; this is the object handed to
// the MachineFunctionPass base-class constructor by CompressEVEXLegacy().
char CompressEVEXLegacy::ID = 0;
91
92static bool usesExtendedRegister(const MachineInstr &MI) {
93 auto isHiRegIdx = [](MCRegister Reg) {
94 // Check for XMM register with indexes between 16 - 31.
95 if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
96 return true;
97 // Check for YMM register with indexes between 16 - 31.
98 if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
99 return true;
100 // Check for GPR with indexes between 16 - 31.
101 if (X86II::isApxExtendedReg(Reg))
102 return true;
103 return false;
104 };
105
106 // Check that operands are not ZMM regs or
107 // XMM/YMM regs with hi indexes between 16 - 31.
108 for (const MachineOperand &MO : MI.explicit_operands()) {
109 if (!MO.isReg())
110 continue;
111
112 MCRegister Reg = MO.getReg().asMCReg();
113 assert(!X86II::isZMMReg(Reg) &&
114 "ZMM instructions should not be in the EVEX->VEX tables");
115 if (isHiRegIdx(Reg))
116 return true;
117 }
118
119 return false;
120}
121
122// Do any custom cleanup needed to finalize the conversion.
123static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
124 (void)NewOpc;
125 unsigned Opc = MI.getOpcode();
126 switch (Opc) {
127 case X86::VALIGNDZ128rri:
128 case X86::VALIGNDZ128rmi:
129 case X86::VALIGNQZ128rri:
130 case X86::VALIGNQZ128rmi: {
131 assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
132 "Unexpected new opcode!");
133 unsigned Scale =
134 (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
135 MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - 1);
136 Imm.setImm(Imm.getImm() * Scale);
137 break;
138 }
139 case X86::VSHUFF32X4Z256rmi:
140 case X86::VSHUFF32X4Z256rri:
141 case X86::VSHUFF64X2Z256rmi:
142 case X86::VSHUFF64X2Z256rri:
143 case X86::VSHUFI32X4Z256rmi:
144 case X86::VSHUFI32X4Z256rri:
145 case X86::VSHUFI64X2Z256rmi:
146 case X86::VSHUFI64X2Z256rri: {
147 assert((NewOpc == X86::VPERM2F128rri || NewOpc == X86::VPERM2I128rri ||
148 NewOpc == X86::VPERM2F128rmi || NewOpc == X86::VPERM2I128rmi) &&
149 "Unexpected new opcode!");
150 MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - 1);
151 int64_t ImmVal = Imm.getImm();
152 // Set bit 5, move bit 1 to bit 4, copy bit 0.
153 Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
154 break;
155 }
156 case X86::VRNDSCALEPDZ128rri:
157 case X86::VRNDSCALEPDZ128rmi:
158 case X86::VRNDSCALEPSZ128rri:
159 case X86::VRNDSCALEPSZ128rmi:
160 case X86::VRNDSCALEPDZ256rri:
161 case X86::VRNDSCALEPDZ256rmi:
162 case X86::VRNDSCALEPSZ256rri:
163 case X86::VRNDSCALEPSZ256rmi:
164 case X86::VRNDSCALESDZrri:
165 case X86::VRNDSCALESDZrmi:
166 case X86::VRNDSCALESSZrri:
167 case X86::VRNDSCALESSZrmi:
168 case X86::VRNDSCALESDZrri_Int:
169 case X86::VRNDSCALESDZrmi_Int:
170 case X86::VRNDSCALESSZrri_Int:
171 case X86::VRNDSCALESSZrmi_Int:
172 const MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - 1);
173 int64_t ImmVal = Imm.getImm();
174 // Ensure that only bits 3:0 of the immediate are used.
175 if ((ImmVal & 0xf) != ImmVal)
176 return false;
177 break;
178 }
179
180 return true;
181}
182
183static bool isKMovNarrowing(unsigned VPMOVOpc, unsigned KMOVOpc) {
184 unsigned VPMOVBits = 0;
185 switch (VPMOVOpc) {
186 case X86::VPMOVQ2MZ128kr:
187 VPMOVBits = 2;
188 break;
189 case X86::VPMOVQ2MZ256kr:
190 case X86::VPMOVD2MZ128kr:
191 VPMOVBits = 4;
192 break;
193 case X86::VPMOVD2MZ256kr:
194 VPMOVBits = 8;
195 break;
196 case X86::VPMOVB2MZ128kr:
197 VPMOVBits = 16;
198 break;
199 case X86::VPMOVB2MZ256kr:
200 VPMOVBits = 32;
201 break;
202 default:
203 llvm_unreachable("Unknown VPMOV opcode");
204 }
205
206 unsigned KMOVSize = 0;
207 switch (KMOVOpc) {
208 case X86::KMOVBrk:
209 KMOVSize = 8;
210 break;
211 case X86::KMOVWrk:
212 KMOVSize = 16;
213 break;
214 case X86::KMOVDrk:
215 KMOVSize = 32;
216 break;
217 default:
218 llvm_unreachable("Unknown KMOV opcode");
219 }
220
221 return KMOVSize < VPMOVBits;
222}
223
224// Try to compress VPMOV*2M + KMOV chain patterns:
225// vpmov*2m %xmm0, %k0 -> (erase this)
226// kmov* %k0, %eax -> vmovmskp* %xmm0, %eax
227static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
228 const X86Subtarget &ST,
229 SmallVectorImpl<MachineInstr *> &ToErase) {
230 const X86InstrInfo *TII = ST.getInstrInfo();
231 const TargetRegisterInfo *TRI = ST.getRegisterInfo();
232 MachineRegisterInfo *MRI = &MBB.getParent()->getRegInfo();
233
234 unsigned Opc = MI.getOpcode();
235 if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr &&
236 Opc != X86::VPMOVQ2MZ128kr && Opc != X86::VPMOVQ2MZ256kr &&
237 Opc != X86::VPMOVB2MZ128kr && Opc != X86::VPMOVB2MZ256kr)
238 return false;
239
240 Register MaskReg = MI.getOperand(i: 0).getReg();
241 Register SrcVecReg = MI.getOperand(i: 1).getReg();
242
243 unsigned MovMskOpc = 0;
244 switch (Opc) {
245 case X86::VPMOVD2MZ128kr:
246 MovMskOpc = X86::VMOVMSKPSrr;
247 break;
248 case X86::VPMOVD2MZ256kr:
249 MovMskOpc = X86::VMOVMSKPSYrr;
250 break;
251 case X86::VPMOVQ2MZ128kr:
252 MovMskOpc = X86::VMOVMSKPDrr;
253 break;
254 case X86::VPMOVQ2MZ256kr:
255 MovMskOpc = X86::VMOVMSKPDYrr;
256 break;
257 case X86::VPMOVB2MZ128kr:
258 MovMskOpc = X86::VPMOVMSKBrr;
259 break;
260 case X86::VPMOVB2MZ256kr:
261 MovMskOpc = X86::VPMOVMSKBYrr;
262 break;
263 default:
264 llvm_unreachable("Unknown VPMOV opcode");
265 }
266
267 MachineInstr *KMovMI = nullptr;
268
269 for (MachineInstr &CurMI : llvm::make_range(
270 x: std::next(x: MachineBasicBlock::iterator(MI)), y: MBB.end())) {
271 if (CurMI.modifiesRegister(Reg: MaskReg, TRI)) {
272 if (!KMovMI)
273 return false; // Mask clobbered before use
274 break;
275 }
276
277 if (CurMI.readsRegister(Reg: MaskReg, TRI)) {
278 if (KMovMI)
279 return false; // Fail: Mask has MULTIPLE uses
280
281 unsigned UseOpc = CurMI.getOpcode();
282 bool IsKMOV = UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
283 UseOpc == X86::KMOVDrk;
284 // Only allow non-narrowing KMOV uses of the mask.
285 if (IsKMOV && CurMI.getOperand(i: 1).getReg() == MaskReg &&
286 !isKMovNarrowing(VPMOVOpc: Opc, KMOVOpc: UseOpc)) {
287 KMovMI = &CurMI;
288 // continue scanning to ensure
289 // there are no *other* uses of the mask later in the block.
290 } else {
291 return false;
292 }
293 }
294
295 if (!KMovMI && CurMI.modifiesRegister(Reg: SrcVecReg, TRI)) {
296 return false; // SrcVecReg modified before it could be used by MOVMSK
297 }
298 }
299
300 if (!KMovMI)
301 return false;
302
303 // Check if MaskReg is used in any other basic blocks
304 for (const MachineOperand &MO : MRI->use_operands(Reg: MaskReg))
305 if (MO.getParent()->getParent() != &MBB)
306 return false;
307
308 // Apply the transformation
309 KMovMI->setDesc(TII->get(Opcode: MovMskOpc));
310 KMovMI->getOperand(i: 1).setReg(SrcVecReg);
311 KMovMI->setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
312
313 ToErase.push_back(Elt: &MI);
314 return true;
315}
316
317static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
318 const X86Subtarget &ST,
319 SmallVectorImpl<MachineInstr *> &ToErase) {
320 uint64_t TSFlags = MI.getDesc().TSFlags;
321
322 // Check for EVEX instructions only.
323 if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
324 return false;
325
326 // Instructions with mask or 512-bit vector can't be converted to VEX.
327 if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
328 return false;
329
330 // Specialized VPMOVD2M + KMOV -> MOVMSK fold first.
331 if (tryCompressVPMOVPattern(MI, MBB, ST, ToErase))
332 return true;
333
334 auto IsRedundantNewDataDest = [&](unsigned &Opc) {
335 // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
336 // ->
337 // $rbx = ADD64rr $rbx, $rax
338 const MCInstrDesc &Desc = MI.getDesc();
339 Register Reg0 = MI.getOperand(i: 0).getReg();
340 const MachineOperand &Op1 = MI.getOperand(i: 1);
341 if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 ||
342 X86::isCFCMOVCC(Opcode: MI.getOpcode()))
343 return false;
344 Register Reg1 = Op1.getReg();
345 if (Reg1 == Reg0)
346 return true;
347
348 // Op1 and Op2 may be commutable for ND instructions.
349 if (!Desc.isCommutable() || Desc.getNumOperands() < 3 ||
350 !MI.getOperand(i: 2).isReg() || MI.getOperand(i: 2).getReg() != Reg0)
351 return false;
352 // Opcode may change after commute, e.g. SHRD -> SHLD
353 ST.getInstrInfo()->commuteInstruction(MI, NewMI: false, OpIdx1: 1, OpIdx2: 2);
354 Opc = MI.getOpcode();
355 return true;
356 };
357
358 // EVEX_B has several meanings.
359 // AVX512:
360 // register form: rounding control or SAE
361 // memory form: broadcast
362 //
363 // APX:
364 // MAP4: NDD, ZU
365 //
366 // For AVX512 cases, EVEX prefix is needed in order to carry this information
367 // thus preventing the transformation to VEX encoding.
368 bool IsND = X86II::hasNewDataDest(TSFlags);
369 unsigned Opc = MI.getOpcode();
370 bool IsSetZUCCm = Opc == X86::SETZUCCm;
371 if (TSFlags & X86II::EVEX_B && !IsND && !IsSetZUCCm)
372 return false;
373 // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B.
374 bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
375 bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest(Opc) : false;
376
377 auto GetCompressedOpc = [&](unsigned Opc) -> unsigned {
378 ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
379 const auto I = llvm::lower_bound(Range&: Table, Value&: Opc);
380 if (I == Table.end() || I->OldOpc != Opc)
381 return 0;
382
383 if (usesExtendedRegister(MI) || !checkPredicate(Opc: I->NewOpc, Subtarget: &ST) ||
384 !performCustomAdjustments(MI, NewOpc: I->NewOpc))
385 return 0;
386 return I->NewOpc;
387 };
388
389 Register Dst = MI.getOperand(i: 0).getReg();
390 if (IsRedundantNDD) {
391 // Redundant NDD ops cannot be safely compressed if either:
392 // - the legacy op would introduce a partial write that BreakFalseDeps
393 // identified as a potential stall, or
394 // - the op is writing to a subregister of a live register, i.e. the
395 // full (zeroed) result is used.
396 // Both cases are indicated by an implicit def of the superregister.
397 if (Dst &&
398 (X86::GR16RegClass.contains(Reg: Dst) || X86::GR8RegClass.contains(Reg: Dst))) {
399 Register Super = getX86SubSuperRegister(Reg: Dst, Size: 64);
400 if (MI.definesRegister(Reg: Super, /*TRI=*/nullptr))
401 IsRedundantNDD = false;
402 }
403
404 // ADDrm/mr instructions with NDD + relocation had been transformed to the
405 // instructions without NDD in X86SuppressAPXForRelocation pass. That is to
406 // keep backward compatibility with linkers without APX support.
407 if (!X86EnableAPXForRelocation)
408 assert(!isAddMemInstrWithRelocation(MI) &&
409 "Unexpected NDD instruction with relocation!");
410 } else if (Opc == X86::ADD32ri_ND || Opc == X86::ADD64ri32_ND ||
411 Opc == X86::ADD32rr_ND || Opc == X86::ADD64rr_ND) {
412 // Non-redundant NDD ADD can be compressed to LEA when:
413 // - No EGPR register used and
414 // - EFLAGS is dead.
415 if (!usesExtendedRegister(MI) &&
416 MI.registerDefIsDead(Reg: X86::EFLAGS, /*TRI=*/nullptr)) {
417 Register Src1 = MI.getOperand(i: 1).getReg();
418 const MachineOperand &Src2 = MI.getOperand(i: 2);
419 bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND;
420 const MCInstrDesc &NewDesc =
421 ST.getInstrInfo()->get(Opcode: Is32BitReg ? X86::LEA64_32r : X86::LEA64r);
422 if (Is32BitReg)
423 Src1 = getX86SubSuperRegister(Reg: Src1, Size: 64);
424 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: NewDesc, DestReg: Dst)
425 .addReg(RegNo: Src1)
426 .addImm(Val: 1);
427 if (Opc == X86::ADD32ri_ND || Opc == X86::ADD64ri32_ND)
428 MIB.addReg(RegNo: 0).add(MO: Src2);
429 else if (Is32BitReg)
430 MIB.addReg(RegNo: getX86SubSuperRegister(Reg: Src2.getReg(), Size: 64)).addImm(Val: 0);
431 else
432 MIB.add(MO: Src2).addImm(Val: 0);
433 MIB.addReg(RegNo: 0);
434 MI.removeFromParent();
435 return true;
436 }
437 }
438
439 // NonNF -> NF only if it's not a compressible NDD instruction and eflags is
440 // dead.
441 unsigned NewOpc = IsRedundantNDD
442 ? X86::getNonNDVariant(Opc)
443 : ((IsNDLike && ST.hasNF() &&
444 MI.registerDefIsDead(Reg: X86::EFLAGS, /*TRI=*/nullptr))
445 ? X86::getNFVariant(Opc)
446 : GetCompressedOpc(Opc));
447
448 if (!NewOpc)
449 return false;
450
451 const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(Opcode: NewOpc);
452 MI.setDesc(NewDesc);
453 unsigned AsmComment;
454 switch (NewDesc.TSFlags & X86II::EncodingMask) {
455 case X86II::LEGACY:
456 AsmComment = X86::AC_EVEX_2_LEGACY;
457 break;
458 case X86II::VEX:
459 AsmComment = X86::AC_EVEX_2_VEX;
460 break;
461 case X86II::EVEX:
462 AsmComment = X86::AC_EVEX_2_EVEX;
463 assert(IsND && (NewDesc.TSFlags & X86II::EVEX_NF) &&
464 "Unknown EVEX2EVEX compression");
465 break;
466 default:
467 llvm_unreachable("Unknown EVEX compression");
468 }
469 MI.setAsmPrinterFlag(AsmComment);
470 if (IsRedundantNDD)
471 MI.tieOperands(DefIdx: 0, UseIdx: 1);
472
473 return true;
474}
475
476static bool runOnMF(MachineFunction &MF) {
477 LLVM_DEBUG(dbgs() << "Start X86CompressEVEXPass\n";);
478#ifndef NDEBUG
479 // Make sure the tables are sorted.
480 static std::atomic<bool> TableChecked(false);
481 if (!TableChecked.load(std::memory_order_relaxed)) {
482 assert(llvm::is_sorted(X86CompressEVEXTable) &&
483 "X86CompressEVEXTable is not sorted!");
484 TableChecked.store(true, std::memory_order_relaxed);
485 }
486#endif
487 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
488 if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD() && !ST.hasZU())
489 return false;
490
491 bool Changed = false;
492
493 for (MachineBasicBlock &MBB : MF) {
494 SmallVector<MachineInstr *, 4> ToErase;
495
496 for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
497 Changed |= CompressEVEXImpl(MI, MBB, ST, ToErase);
498 }
499
500 for (MachineInstr *MI : ToErase) {
501 MI->eraseFromParent();
502 }
503 }
504 LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
505 return Changed;
506}
507
// Pass-registration boilerplate for CompressEVEXLegacy, using COMP_EVEX_NAME
// and COMP_EVEX_DESC as its name/description strings. The meaning of the two
// trailing `false` arguments is defined by the INITIALIZE_PASS macro, which is
// not part of this file.
INITIALIZE_PASS(CompressEVEXLegacy, COMP_EVEX_NAME, COMP_EVEX_DESC, false,
                false)
510
/// Factory for the legacy pass: returns a newly heap-allocated
/// CompressEVEXLegacy instance.
// NOTE(review): ownership is presumably taken by the pass manager the result
// is added to -- confirm against the caller.
FunctionPass *llvm::createX86CompressEVEXLegacyPass() {
  return new CompressEVEXLegacy();
}
514
/// Legacy pass-manager entry point; forwards to the shared runOnMF driver and
/// returns its result (true if \p MF was changed).
bool CompressEVEXLegacy::runOnMachineFunction(MachineFunction &MF) {
  return runOnMF(MF);
}
518
519PreservedAnalyses
520X86CompressEVEXPass::run(MachineFunction &MF,
521 MachineFunctionAnalysisManager &MFAM) {
522 bool Changed = runOnMF(MF);
523 if (!Changed)
524 return PreservedAnalyses::all();
525 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
526 PA.preserveSet<CFGAnalyses>();
527 return PA;
528}
529