1//===- X86CompressEVEX.cpp ------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
10// when possible in order to reduce code size or facilitate HW decoding.
11//
12// Possible compression:
13// a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
14// b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
15// c. NDD (EVEX) -> non-NDD (legacy)
16// d. NF_ND (EVEX) -> NF (EVEX)
17// e. NonNF (EVEX) -> NF (EVEX)
18// f. SETZUCCm (EVEX) -> SETCCm (legacy)
19// g. VPMOV*2M (EVEX) + KMOV -> VMOVMSK/VPMOVMSKB (VEX)
20//
21// Compression a, b and c can always reduce code size, with some exceptions
22// such as promoted 16-bit CRC32 which is as long as the legacy version.
23//
24// legacy:
25// crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
26// promoted:
27// crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
28//
// From a performance perspective, the two encodings should be the same (same
// uops and same EXE ports). From an FMV perspective, the older legacy encoding
// is preferred because it can execute in more places (broader HW install
// base). So we will still do the compression.
33//
34// Compression d can help hardware decode (HW may skip reading the NDD
35// register) although the instruction length remains unchanged.
36//
37// Compression e can help hardware skip updating EFLAGS although the instruction
38// length remains unchanged.
39//===----------------------------------------------------------------------===//
40
41#include "MCTargetDesc/X86BaseInfo.h"
42#include "X86.h"
43#include "X86InstrInfo.h"
44#include "X86Subtarget.h"
45#include "llvm/ADT/SmallVector.h"
46#include "llvm/ADT/StringRef.h"
47#include "llvm/CodeGen/MachineFunction.h"
48#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
49#include "llvm/CodeGen/MachineFunctionPass.h"
50#include "llvm/CodeGen/MachineInstr.h"
51#include "llvm/CodeGen/MachineOperand.h"
52#include "llvm/CodeGen/MachinePassManager.h"
53#include "llvm/IR/Analysis.h"
54#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Pass.h"
56#include <atomic>
57#include <cassert>
58#include <cstdint>
59
60using namespace llvm;
61
62#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
63#define COMP_EVEX_NAME "x86-compress-evex"
64
65#define DEBUG_TYPE COMP_EVEX_NAME
66
67extern cl::opt<bool> X86EnableAPXForRelocation;
68
69namespace {
70// Including the generated EVEX compression tables.
71#define GET_X86_COMPRESS_EVEX_TABLE
72#include "X86GenInstrMapping.inc"
73
74class CompressEVEXLegacy : public MachineFunctionPass {
75public:
76 static char ID;
77 CompressEVEXLegacy() : MachineFunctionPass(ID) {}
78 StringRef getPassName() const override { return COMP_EVEX_DESC; }
79
80 bool runOnMachineFunction(MachineFunction &MF) override;
81
82 // This pass runs after regalloc and doesn't support VReg operands.
83 MachineFunctionProperties getRequiredProperties() const override {
84 return MachineFunctionProperties().setNoVRegs();
85 }
86};
87
88} // end anonymous namespace
89
90char CompressEVEXLegacy::ID = 0;
91
92static bool usesExtendedRegister(const MachineInstr &MI) {
93 auto isHiRegIdx = [](MCRegister Reg) {
94 // Check for XMM register with indexes between 16 - 31.
95 if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
96 return true;
97 // Check for YMM register with indexes between 16 - 31.
98 if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
99 return true;
100 // Check for GPR with indexes between 16 - 31.
101 if (X86II::isApxExtendedReg(Reg))
102 return true;
103 return false;
104 };
105
106 // Check that operands are not ZMM regs or
107 // XMM/YMM regs with hi indexes between 16 - 31.
108 for (const MachineOperand &MO : MI.explicit_operands()) {
109 if (!MO.isReg())
110 continue;
111
112 MCRegister Reg = MO.getReg().asMCReg();
113 assert(!X86II::isZMMReg(Reg) &&
114 "ZMM instructions should not be in the EVEX->VEX tables");
115 if (isHiRegIdx(Reg))
116 return true;
117 }
118
119 return false;
120}
121
122// Do any custom cleanup needed to finalize the conversion.
123static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
124 (void)NewOpc;
125 unsigned Opc = MI.getOpcode();
126 switch (Opc) {
127 case X86::VALIGNDZ128rri:
128 case X86::VALIGNDZ128rmi:
129 case X86::VALIGNQZ128rri:
130 case X86::VALIGNQZ128rmi: {
131 assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
132 "Unexpected new opcode!");
133 unsigned Scale =
134 (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
135 MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - 1);
136 Imm.setImm(Imm.getImm() * Scale);
137 break;
138 }
139 case X86::VSHUFF32X4Z256rmi:
140 case X86::VSHUFF32X4Z256rri:
141 case X86::VSHUFF64X2Z256rmi:
142 case X86::VSHUFF64X2Z256rri:
143 case X86::VSHUFI32X4Z256rmi:
144 case X86::VSHUFI32X4Z256rri:
145 case X86::VSHUFI64X2Z256rmi:
146 case X86::VSHUFI64X2Z256rri: {
147 assert((NewOpc == X86::VPERM2F128rri || NewOpc == X86::VPERM2I128rri ||
148 NewOpc == X86::VPERM2F128rmi || NewOpc == X86::VPERM2I128rmi) &&
149 "Unexpected new opcode!");
150 MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - 1);
151 int64_t ImmVal = Imm.getImm();
152 // Set bit 5, move bit 1 to bit 4, copy bit 0.
153 Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
154 break;
155 }
156 case X86::VRNDSCALEPDZ128rri:
157 case X86::VRNDSCALEPDZ128rmi:
158 case X86::VRNDSCALEPSZ128rri:
159 case X86::VRNDSCALEPSZ128rmi:
160 case X86::VRNDSCALEPDZ256rri:
161 case X86::VRNDSCALEPDZ256rmi:
162 case X86::VRNDSCALEPSZ256rri:
163 case X86::VRNDSCALEPSZ256rmi:
164 case X86::VRNDSCALESDZrri:
165 case X86::VRNDSCALESDZrmi:
166 case X86::VRNDSCALESSZrri:
167 case X86::VRNDSCALESSZrmi:
168 case X86::VRNDSCALESDZrri_Int:
169 case X86::VRNDSCALESDZrmi_Int:
170 case X86::VRNDSCALESSZrri_Int:
171 case X86::VRNDSCALESSZrmi_Int:
172 const MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - 1);
173 int64_t ImmVal = Imm.getImm();
174 // Ensure that only bits 3:0 of the immediate are used.
175 if ((ImmVal & 0xf) != ImmVal)
176 return false;
177 break;
178 }
179
180 return true;
181}
182
183static bool isKMovNarrowing(unsigned VPMOVOpc, unsigned KMOVOpc) {
184 unsigned VPMOVBits = 0;
185 switch (VPMOVOpc) {
186 case X86::VPMOVQ2MZ128kr:
187 VPMOVBits = 2;
188 break;
189 case X86::VPMOVQ2MZ256kr:
190 case X86::VPMOVD2MZ128kr:
191 VPMOVBits = 4;
192 break;
193 case X86::VPMOVD2MZ256kr:
194 VPMOVBits = 8;
195 break;
196 case X86::VPMOVB2MZ128kr:
197 VPMOVBits = 16;
198 break;
199 case X86::VPMOVB2MZ256kr:
200 VPMOVBits = 32;
201 break;
202 default:
203 llvm_unreachable("Unknown VPMOV opcode");
204 }
205
206 unsigned KMOVSize = 0;
207 switch (KMOVOpc) {
208 case X86::KMOVBrk:
209 KMOVSize = 8;
210 break;
211 case X86::KMOVWrk:
212 KMOVSize = 16;
213 break;
214 case X86::KMOVDrk:
215 KMOVSize = 32;
216 break;
217 default:
218 llvm_unreachable("Unknown KMOV opcode");
219 }
220
221 return KMOVSize < VPMOVBits;
222}
223
224// Try to compress VPMOV*2M + KMOV chain patterns:
225// vpmov*2m %xmm0, %k0 -> (erase this)
226// kmov* %k0, %eax -> vmovmskp* %xmm0, %eax
227static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
228 const X86Subtarget &ST,
229 SmallVectorImpl<MachineInstr *> &ToErase) {
230 const X86InstrInfo *TII = ST.getInstrInfo();
231 const TargetRegisterInfo *TRI = ST.getRegisterInfo();
232 MachineRegisterInfo *MRI = &MBB.getParent()->getRegInfo();
233
234 unsigned Opc = MI.getOpcode();
235 if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr &&
236 Opc != X86::VPMOVQ2MZ128kr && Opc != X86::VPMOVQ2MZ256kr &&
237 Opc != X86::VPMOVB2MZ128kr && Opc != X86::VPMOVB2MZ256kr)
238 return false;
239
240 if (usesExtendedRegister(MI))
241 return false;
242
243 Register MaskReg = MI.getOperand(i: 0).getReg();
244 Register SrcVecReg = MI.getOperand(i: 1).getReg();
245
246 unsigned MovMskOpc = 0;
247 switch (Opc) {
248 case X86::VPMOVD2MZ128kr:
249 MovMskOpc = X86::VMOVMSKPSrr;
250 break;
251 case X86::VPMOVD2MZ256kr:
252 MovMskOpc = X86::VMOVMSKPSYrr;
253 break;
254 case X86::VPMOVQ2MZ128kr:
255 MovMskOpc = X86::VMOVMSKPDrr;
256 break;
257 case X86::VPMOVQ2MZ256kr:
258 MovMskOpc = X86::VMOVMSKPDYrr;
259 break;
260 case X86::VPMOVB2MZ128kr:
261 MovMskOpc = X86::VPMOVMSKBrr;
262 break;
263 case X86::VPMOVB2MZ256kr:
264 MovMskOpc = X86::VPMOVMSKBYrr;
265 break;
266 default:
267 llvm_unreachable("Unknown VPMOV opcode");
268 }
269
270 MachineInstr *KMovMI = nullptr;
271
272 for (MachineInstr &CurMI : llvm::make_range(
273 x: std::next(x: MachineBasicBlock::iterator(MI)), y: MBB.end())) {
274 if (CurMI.readsRegister(Reg: MaskReg, TRI)) {
275 if (KMovMI)
276 return false; // Fail: Mask has MULTIPLE uses
277
278 unsigned UseOpc = CurMI.getOpcode();
279 bool IsKMOV = UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
280 UseOpc == X86::KMOVDrk;
281 // Only allow non-narrowing KMOV uses of the mask.
282 if (IsKMOV && CurMI.getOperand(i: 1).getReg() == MaskReg &&
283 !isKMovNarrowing(VPMOVOpc: Opc, KMOVOpc: UseOpc)) {
284 KMovMI = &CurMI;
285 // continue scanning to ensure
286 // there are no *other* uses of the mask later in the block.
287 } else {
288 return false;
289 }
290 }
291
292 if (CurMI.modifiesRegister(Reg: MaskReg, TRI)) {
293 if (!KMovMI)
294 return false; // Mask clobbered before use
295 break;
296 }
297
298 if (!KMovMI && CurMI.modifiesRegister(Reg: SrcVecReg, TRI)) {
299 return false; // SrcVecReg modified before it could be used by MOVMSK
300 }
301 }
302
303 if (!KMovMI)
304 return false;
305
306 // Check if MaskReg is used in any other basic blocks
307 for (const MachineOperand &MO : MRI->use_operands(Reg: MaskReg))
308 if (MO.getParent()->getParent() != &MBB)
309 return false;
310
311 // Apply the transformation
312 KMovMI->setDesc(TII->get(Opcode: MovMskOpc));
313 MachineOperand &NewSrc = KMovMI->getOperand(i: 1);
314 NewSrc.setReg(SrcVecReg);
315 // setReg() keeps the mask operand's kill flag; take the source's kill
316 // state from the VPMOV instead.
317 NewSrc.setIsKill(MI.getOperand(i: 1).isKill());
318 KMovMI->setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
319
320 ToErase.push_back(Elt: &MI);
321 return true;
322}
323
324static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
325 const X86Subtarget &ST,
326 SmallVectorImpl<MachineInstr *> &ToErase) {
327 uint64_t TSFlags = MI.getDesc().TSFlags;
328
329 // Check for EVEX instructions only.
330 if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
331 return false;
332
333 // Instructions with mask or 512-bit vector can't be converted to VEX.
334 if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
335 return false;
336
337 // Specialized VPMOVD2M + KMOV -> MOVMSK fold first.
338 if (tryCompressVPMOVPattern(MI, MBB, ST, ToErase))
339 return true;
340
341 auto IsRedundantNewDataDest = [&](unsigned &Opc) {
342 // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
343 // ->
344 // $rbx = ADD64rr $rbx, $rax
345 const MCInstrDesc &Desc = MI.getDesc();
346 Register Reg0 = MI.getOperand(i: 0).getReg();
347 const MachineOperand &Op1 = MI.getOperand(i: 1);
348 if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 ||
349 X86::isCFCMOVCC(Opcode: MI.getOpcode()))
350 return false;
351 Register Reg1 = Op1.getReg();
352 if (Reg1 == Reg0)
353 return true;
354
355 // Op1 and Op2 may be commutable for ND instructions.
356 if (!Desc.isCommutable() || Desc.getNumOperands() < 3 ||
357 !MI.getOperand(i: 2).isReg() || MI.getOperand(i: 2).getReg() != Reg0)
358 return false;
359 // Opcode may change after commute, e.g. SHRD -> SHLD
360 ST.getInstrInfo()->commuteInstruction(MI, NewMI: false, OpIdx1: 1, OpIdx2: 2);
361 Opc = MI.getOpcode();
362 return true;
363 };
364
365 // EVEX_B has several meanings.
366 // AVX512:
367 // register form: rounding control or SAE
368 // memory form: broadcast
369 //
370 // APX:
371 // MAP4: NDD, ZU
372 //
373 // For AVX512 cases, EVEX prefix is needed in order to carry this information
374 // thus preventing the transformation to VEX encoding.
375 bool IsND = X86II::hasNewDataDest(TSFlags);
376 unsigned Opc = MI.getOpcode();
377 bool IsSetZUCCm = Opc == X86::SETZUCCm;
378 if (TSFlags & X86II::EVEX_B && !IsND && !IsSetZUCCm)
379 return false;
380 // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B.
381 bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
382 bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest(Opc) : false;
383
384 auto GetCompressedOpc = [&](unsigned Opc) -> unsigned {
385 ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
386 const auto I = llvm::lower_bound(Range&: Table, Value&: Opc);
387 if (I == Table.end() || I->OldOpc != Opc)
388 return 0;
389
390 if (usesExtendedRegister(MI) || !checkPredicate(Opc: I->NewOpc, Subtarget: &ST) ||
391 !performCustomAdjustments(MI, NewOpc: I->NewOpc))
392 return 0;
393 return I->NewOpc;
394 };
395
396 Register Dst = MI.getOperand(i: 0).getReg();
397 if (IsRedundantNDD) {
398 // Redundant NDD ops cannot be safely compressed if either:
399 // - the legacy op would introduce a partial write that BreakFalseDeps
400 // identified as a potential stall, or
401 // - the op is writing to a subregister of a live register, i.e. the
402 // full (zeroed) result is used.
403 // Both cases are indicated by an implicit def of the superregister.
404 if (Dst &&
405 (X86::GR16RegClass.contains(Reg: Dst) || X86::GR8RegClass.contains(Reg: Dst))) {
406 Register Super = getX86SubSuperRegister(Reg: Dst, Size: 64);
407 if (MI.definesRegister(Reg: Super, /*TRI=*/nullptr))
408 IsRedundantNDD = false;
409 }
410
411 // ADDrm/mr instructions with NDD + relocation had been transformed to the
412 // instructions without NDD in X86SuppressAPXForRelocation pass. That is to
413 // keep backward compatibility with linkers without APX support.
414 if (!X86EnableAPXForRelocation)
415 assert(!isAddMemInstrWithRelocation(MI) &&
416 "Unexpected NDD instruction with relocation!");
417 } else if (Opc == X86::ADD32ri_ND || Opc == X86::ADD64ri32_ND ||
418 Opc == X86::ADD32rr_ND || Opc == X86::ADD64rr_ND) {
419 // Non-redundant NDD ADD can be compressed to LEA when:
420 // - No EGPR register used and
421 // - EFLAGS is dead.
422 if (!usesExtendedRegister(MI) &&
423 MI.registerDefIsDead(Reg: X86::EFLAGS, /*TRI=*/nullptr)) {
424 Register Src1 = MI.getOperand(i: 1).getReg();
425 const MachineOperand &Src2 = MI.getOperand(i: 2);
426 bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND;
427 const MCInstrDesc &NewDesc =
428 ST.getInstrInfo()->get(Opcode: Is32BitReg ? X86::LEA64_32r : X86::LEA64r);
429 if (Is32BitReg)
430 Src1 = getX86SubSuperRegister(Reg: Src1, Size: 64);
431 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: NewDesc, DestReg: Dst)
432 .addReg(RegNo: Src1)
433 .addImm(Val: 1);
434 if (Opc == X86::ADD32ri_ND || Opc == X86::ADD64ri32_ND)
435 MIB.addReg(RegNo: 0).add(MO: Src2);
436 else if (Is32BitReg)
437 MIB.addReg(RegNo: getX86SubSuperRegister(Reg: Src2.getReg(), Size: 64)).addImm(Val: 0);
438 else
439 MIB.add(MO: Src2).addImm(Val: 0);
440 MIB.addReg(RegNo: 0);
441 MI.removeFromParent();
442 return true;
443 }
444 }
445
446 // NonNF -> NF only if it's not a compressible NDD instruction and eflags is
447 // dead.
448 unsigned NewOpc = IsRedundantNDD
449 ? X86::getNonNDVariant(Opc)
450 : ((IsNDLike && ST.hasNF() &&
451 MI.registerDefIsDead(Reg: X86::EFLAGS, /*TRI=*/nullptr))
452 ? X86::getNFVariant(Opc)
453 : GetCompressedOpc(Opc));
454
455 if (!NewOpc)
456 return false;
457 // NF (No Flags) instructions cannot compress to VEX/legacy encoding.
458 // NF_ND can still compress to NF (both remain EVEX).
459 assert((IsND || !(TSFlags & X86II::EVEX_NF)) &&
460 "Unexpected to compress NF instructions without ND.");
461
462 const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(Opcode: NewOpc);
463 MI.setDesc(NewDesc);
464 unsigned AsmComment;
465 switch (NewDesc.TSFlags & X86II::EncodingMask) {
466 case X86II::LEGACY:
467 AsmComment = X86::AC_EVEX_2_LEGACY;
468 break;
469 case X86II::VEX:
470 AsmComment = X86::AC_EVEX_2_VEX;
471 break;
472 case X86II::EVEX:
473 AsmComment = X86::AC_EVEX_2_EVEX;
474 assert(IsND && (NewDesc.TSFlags & X86II::EVEX_NF) &&
475 "Unknown EVEX2EVEX compression");
476 break;
477 default:
478 llvm_unreachable("Unknown EVEX compression");
479 }
480 MI.setAsmPrinterFlag(AsmComment);
481 if (IsRedundantNDD)
482 MI.tieOperands(DefIdx: 0, UseIdx: 1);
483
484 return true;
485}
486
487static bool runOnMF(MachineFunction &MF) {
488 LLVM_DEBUG(dbgs() << "Start X86CompressEVEXPass\n";);
489#ifndef NDEBUG
490 // Make sure the tables are sorted.
491 static std::atomic<bool> TableChecked(false);
492 if (!TableChecked.load(std::memory_order_relaxed)) {
493 assert(llvm::is_sorted(X86CompressEVEXTable) &&
494 "X86CompressEVEXTable is not sorted!");
495 TableChecked.store(true, std::memory_order_relaxed);
496 }
497#endif
498 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
499 if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD() && !ST.hasZU())
500 return false;
501
502 bool Changed = false;
503
504 for (MachineBasicBlock &MBB : MF) {
505 SmallVector<MachineInstr *, 4> ToErase;
506
507 for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
508 Changed |= CompressEVEXImpl(MI, MBB, ST, ToErase);
509 }
510
511 for (MachineInstr *MI : ToErase) {
512 MI->eraseFromParent();
513 }
514 }
515 LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
516 return Changed;
517}
518
519INITIALIZE_PASS(CompressEVEXLegacy, COMP_EVEX_NAME, COMP_EVEX_DESC, false,
520 false)
521
522FunctionPass *llvm::createX86CompressEVEXLegacyPass() {
523 return new CompressEVEXLegacy();
524}
525
526bool CompressEVEXLegacy::runOnMachineFunction(MachineFunction &MF) {
527 return runOnMF(MF);
528}
529
530PreservedAnalyses
531X86CompressEVEXPass::run(MachineFunction &MF,
532 MachineFunctionAnalysisManager &MFAM) {
533 bool Changed = runOnMF(MF);
534 if (!Changed)
535 return PreservedAnalyses::all();
536 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
537 PA.preserveSet<CFGAnalyses>();
538 return PA;
539}
540