X86CompressEVEX.cpp source code [llvm_projects/llvm/lib/Target/X86/X86CompressEVEX.cpp]

1	//===- X86CompressEVEX.cpp ------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
10	// when possible in order to reduce code size or facilitate HW decoding.
11	//
12	// Possible compression:
13	// a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
14	// b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
15	// c. NDD (EVEX) -> non-NDD (legacy)
16	// d. NF_ND (EVEX) -> NF (EVEX)
17	// e. NonNF (EVEX) -> NF (EVEX)
18	// f. SETZUCCm (EVEX) -> SETCCm (legacy)
19	// g. VPMOV*2M (EVEX) + KMOV -> VMOVMSK/VPMOVMSKB (VEX)
20	//
21	// Compression a, b and c can always reduce code size, with some exceptions
22	// such as promoted 16-bit CRC32 which is as long as the legacy version.
23	//
24	// legacy:
25	// crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
26	// promoted:
27	// crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
28	//
29	// From performance perspective, these should be same (same uops and same EXE
30	// ports). From a FMV perspective, an older legacy encoding is preferred b/c it
31	// can execute in more places (broader HW install base). So we will still do
32	// the compression.
33	//
34	// Compression d can help hardware decode (HW may skip reading the NDD
35	// register) although the instruction length remains unchanged.
36	//
37	// Compression e can help hardware skip updating EFLAGS although the instruction
38	// length remains unchanged.
39	//===----------------------------------------------------------------------===//
40
41	#include "MCTargetDesc/X86BaseInfo.h"
42	#include "X86.h"
43	#include "X86InstrInfo.h"
44	#include "X86Subtarget.h"
45	#include "llvm/ADT/SmallVector.h"
46	#include "llvm/ADT/StringRef.h"
47	#include "llvm/CodeGen/MachineFunction.h"
48	#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
49	#include "llvm/CodeGen/MachineFunctionPass.h"
50	#include "llvm/CodeGen/MachineInstr.h"
51	#include "llvm/CodeGen/MachineOperand.h"
52	#include "llvm/CodeGen/MachinePassManager.h"
53	#include "llvm/IR/Analysis.h"
54	#include "llvm/MC/MCInstrDesc.h"
55	#include "llvm/Pass.h"
56	#include <atomic>
57	#include <cassert>
58	#include <cstdint>
59
60	using namespace llvm;
61
62	#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
63	#define COMP_EVEX_NAME "x86-compress-evex"
64
65	#define DEBUG_TYPE COMP_EVEX_NAME
66
67	extern cl::opt<bool> X86EnableAPXForRelocation;
68
69	namespace {
70	// Including the generated EVEX compression tables.
71	#define GET_X86_COMPRESS_EVEX_TABLE
72	#include "X86GenInstrMapping.inc"
73
74	class CompressEVEXLegacy : public MachineFunctionPass {
75	public:
76	static char ID;
77	CompressEVEXLegacy() : MachineFunctionPass (ID) {}
78	StringRef getPassName() const override { return COMP_EVEX_DESC; }
79
80	bool runOnMachineFunction(MachineFunction &MF) override;
81
82	// This pass runs after regalloc and doesn't support VReg operands.
83	MachineFunctionProperties getRequiredProperties() const override {
84	return MachineFunctionProperties ().setNoVRegs();
85	}
86	};
87
88	} // end anonymous namespace
89
90	char CompressEVEXLegacy::ID = `0`;
91
92	static bool usesExtendedRegister(const MachineInstr &MI) {
93	auto isHiRegIdx = [](MCRegister Reg) {
94	// Check for XMM register with indexes between 16 - 31.
95	if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
96	return true;
97	// Check for YMM register with indexes between 16 - 31.
98	if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
99	return true;
100	// Check for GPR with indexes between 16 - 31.
101	if (X86II::isApxExtendedReg(Reg))
102	return true;
103	return false;
104	};
105
106	// Check that operands are not ZMM regs or
107	// XMM/YMM regs with hi indexes between 16 - 31.
108	for (const MachineOperand &MO : MI.explicit_operands()) {
109	if (!MO.isReg())
110	continue;
111
112	MCRegister Reg = MO.getReg().asMCReg();
113	assert(!X86II::isZMMReg(Reg) &&
114	"ZMM instructions should not be in the EVEX->VEX tables");
115	if (isHiRegIdx (Reg))
116	return true;
117	}
118
119	return false;
120	}
121
122	// Do any custom cleanup needed to finalize the conversion.
123	static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
124	(void)NewOpc;
125	unsigned Opc = MI.getOpcode();
126	switch (Opc) {
127	case X86::VALIGNDZ128rri:
128	case X86::VALIGNDZ128rmi:
129	case X86::VALIGNQZ128rri:
130	case X86::VALIGNQZ128rmi: {
131	assert((NewOpc == X86::VPALIGNRrri \|\| NewOpc == X86::VPALIGNRrmi) &&
132	"Unexpected new opcode!");
133	unsigned Scale =
134	(Opc == X86::VALIGNQZ128rri \|\| Opc == X86::VALIGNQZ128rmi) ? `8` : `4`;
135	MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - `1`);
136	Imm.setImm(Imm.getImm() * Scale);
137	break;
138	}
139	case X86::VSHUFF32X4Z256rmi:
140	case X86::VSHUFF32X4Z256rri:
141	case X86::VSHUFF64X2Z256rmi:
142	case X86::VSHUFF64X2Z256rri:
143	case X86::VSHUFI32X4Z256rmi:
144	case X86::VSHUFI32X4Z256rri:
145	case X86::VSHUFI64X2Z256rmi:
146	case X86::VSHUFI64X2Z256rri: {
147	assert((NewOpc == X86::VPERM2F128rri \|\| NewOpc == X86::VPERM2I128rri \|\|
148	NewOpc == X86::VPERM2F128rmi \|\| NewOpc == X86::VPERM2I128rmi) &&
149	"Unexpected new opcode!");
150	MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - `1`);
151	int64_t ImmVal = Imm.getImm();
152	// Set bit 5, move bit 1 to bit 4, copy bit 0.
153	Imm.setImm(`0x20` \| ((ImmVal & `2`) << `3`) \| (ImmVal & `1`));
154	break;
155	}
156	case X86::VRNDSCALEPDZ128rri:
157	case X86::VRNDSCALEPDZ128rmi:
158	case X86::VRNDSCALEPSZ128rri:
159	case X86::VRNDSCALEPSZ128rmi:
160	case X86::VRNDSCALEPDZ256rri:
161	case X86::VRNDSCALEPDZ256rmi:
162	case X86::VRNDSCALEPSZ256rri:
163	case X86::VRNDSCALEPSZ256rmi:
164	case X86::VRNDSCALESDZrri:
165	case X86::VRNDSCALESDZrmi:
166	case X86::VRNDSCALESSZrri:
167	case X86::VRNDSCALESSZrmi:
168	case X86::VRNDSCALESDZrri_Int:
169	case X86::VRNDSCALESDZrmi_Int:
170	case X86::VRNDSCALESSZrri_Int:
171	case X86::VRNDSCALESSZrmi_Int:
172	const MachineOperand &Imm = MI.getOperand(i: MI.getNumExplicitOperands() - `1`);
173	int64_t ImmVal = Imm.getImm();
174	// Ensure that only bits 3:0 of the immediate are used.
175	if ((ImmVal & `0xf`) != ImmVal)
176	return false;
177	break;
178	}
179
180	return true;
181	}
182
183	static bool isKMovNarrowing(unsigned VPMOVOpc, unsigned KMOVOpc) {
184	unsigned VPMOVBits = `0`;
185	switch (VPMOVOpc) {
186	case X86::VPMOVQ2MZ128kr:
187	VPMOVBits = `2`;
188	break;
189	case X86::VPMOVQ2MZ256kr:
190	case X86::VPMOVD2MZ128kr:
191	VPMOVBits = `4`;
192	break;
193	case X86::VPMOVD2MZ256kr:
194	VPMOVBits = `8`;
195	break;
196	case X86::VPMOVB2MZ128kr:
197	VPMOVBits = `16`;
198	break;
199	case X86::VPMOVB2MZ256kr:
200	VPMOVBits = `32`;
201	break;
202	default:
203	llvm_unreachable("Unknown VPMOV opcode");
204	}
205
206	unsigned KMOVSize = `0`;
207	switch (KMOVOpc) {
208	case X86::KMOVBrk:
209	KMOVSize = `8`;
210	break;
211	case X86::KMOVWrk:
212	KMOVSize = `16`;
213	break;
214	case X86::KMOVDrk:
215	KMOVSize = `32`;
216	break;
217	default:
218	llvm_unreachable("Unknown KMOV opcode");
219	}
220
221	return KMOVSize < VPMOVBits;
222	}
223
224	// Try to compress VPMOV2M + KMOV chain patterns:*
225	// vpmov2m %xmm0, %k0 -> (erase this)*
226	// kmov %k0, %eax -> vmovmskp* %xmm0, %eax*
227	static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
228	const X86Subtarget &ST,
229	SmallVectorImpl<MachineInstr *> &ToErase) {
230	const X86InstrInfo *TII = ST.getInstrInfo();
231	const TargetRegisterInfo *TRI = ST.getRegisterInfo();
232	MachineRegisterInfo *MRI = &MBB.getParent()->getRegInfo();
233
234	unsigned Opc = MI.getOpcode();
235	if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr &&
236	Opc != X86::VPMOVQ2MZ128kr && Opc != X86::VPMOVQ2MZ256kr &&
237	Opc != X86::VPMOVB2MZ128kr && Opc != X86::VPMOVB2MZ256kr)
238	return false;
239
240	Register MaskReg = MI.getOperand(i: `0`).getReg();
241	Register SrcVecReg = MI.getOperand(i: `1`).getReg();
242
243	unsigned MovMskOpc = `0`;
244	switch (Opc) {
245	case X86::VPMOVD2MZ128kr:
246	MovMskOpc = X86::VMOVMSKPSrr;
247	break;
248	case X86::VPMOVD2MZ256kr:
249	MovMskOpc = X86::VMOVMSKPSYrr;
250	break;
251	case X86::VPMOVQ2MZ128kr:
252	MovMskOpc = X86::VMOVMSKPDrr;
253	break;
254	case X86::VPMOVQ2MZ256kr:
255	MovMskOpc = X86::VMOVMSKPDYrr;
256	break;
257	case X86::VPMOVB2MZ128kr:
258	MovMskOpc = X86::VPMOVMSKBrr;
259	break;
260	case X86::VPMOVB2MZ256kr:
261	MovMskOpc = X86::VPMOVMSKBYrr;
262	break;
263	default:
264	llvm_unreachable("Unknown VPMOV opcode");
265	}
266
267	MachineInstr KMovMI = nullptr*;
268
269	for (MachineInstr &CurMI : llvm::make_range(
270	x: std::next(x: MachineBasicBlock::iterator (MI)), y: MBB.end())) {
271	if (CurMI.modifiesRegister(Reg: MaskReg, TRI)) {
272	if (!KMovMI)
273	return false; // Mask clobbered before use
274	break;
275	}
276
277	if (CurMI.readsRegister(Reg: MaskReg, TRI)) {
278	if (KMovMI)
279	return false; // Fail: Mask has MULTIPLE uses
280
281	unsigned UseOpc = CurMI.getOpcode();
282	bool IsKMOV = UseOpc == X86::KMOVBrk \|\| UseOpc == X86::KMOVWrk \|\|
283	UseOpc == X86::KMOVDrk;
284	// Only allow non-narrowing KMOV uses of the mask.
285	if (IsKMOV && CurMI.getOperand(i: `1`).getReg() == MaskReg &&
286	!isKMovNarrowing(VPMOVOpc: Opc, KMOVOpc: UseOpc)) {
287	KMovMI = &CurMI;
288	// continue scanning to ensure
289	// there are no other* uses of the mask later in the block.*
290	} else {
291	return false;
292	}
293	}
294
295	if (!KMovMI && CurMI.modifiesRegister(Reg: SrcVecReg, TRI)) {
296	return false; // SrcVecReg modified before it could be used by MOVMSK
297	}
298	}
299
300	if (!KMovMI)
301	return false;
302
303	// Check if MaskReg is used in any other basic blocks
304	for (const MachineOperand &MO : MRI->use_operands(Reg: MaskReg))
305	if (MO.getParent()->getParent() != &MBB)
306	return false;
307
308	// Apply the transformation
309	KMovMI->setDesc(TII->get(Opcode: MovMskOpc));
310	KMovMI->getOperand(i: `1`).setReg(SrcVecReg);
311	KMovMI->setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
312
313	ToErase.push_back(Elt: &MI);
314	return true;
315	}
316
317	static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
318	const X86Subtarget &ST,
319	SmallVectorImpl<MachineInstr *> &ToErase) {
320	uint64_t TSFlags = MI.getDesc().TSFlags;
321
322	// Check for EVEX instructions only.
323	if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
324	return false;
325
326	// Instructions with mask or 512-bit vector can't be converted to VEX.
327	if (TSFlags & (X86II::EVEX_K \| X86II::EVEX_L2))
328	return false;
329
330	// Specialized VPMOVD2M + KMOV -> MOVMSK fold first.
331	if (tryCompressVPMOVPattern(MI, MBB, ST, ToErase))
332	return true;
333
334	auto IsRedundantNewDataDest = [&](unsigned &Opc) {
335	// $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
336	// ->
337	// $rbx = ADD64rr $rbx, $rax
338	const MCInstrDesc &Desc = MI.getDesc();
339	Register Reg0 = MI.getOperand(i: `0`).getReg();
340	const MachineOperand &Op1 = MI.getOperand(i: `1`);
341	if (!Op1.isReg() \|\| X86::getFirstAddrOperandIdx(MI) == `1` \|\|
342	X86::isCFCMOVCC(Opcode: MI.getOpcode()))
343	return false;
344	Register Reg1 = Op1.getReg();
345	if (Reg1 == Reg0)
346	return true;
347
348	// Op1 and Op2 may be commutable for ND instructions.
349	if (!Desc.isCommutable() \|\| Desc.getNumOperands() < `3` \|\|
350	!MI.getOperand(i: `2`).isReg() \|\| MI.getOperand(i: `2`).getReg() != Reg0)
351	return false;
352	// Opcode may change after commute, e.g. SHRD -> SHLD
353	ST.getInstrInfo()->commuteInstruction(MI, NewMI: false, OpIdx1: `1`, OpIdx2: `2`);
354	Opc = MI.getOpcode();
355	return true;
356	};
357
358	// EVEX_B has several meanings.
359	// AVX512:
360	// register form: rounding control or SAE
361	// memory form: broadcast
362	//
363	// APX:
364	// MAP4: NDD, ZU
365	//
366	// For AVX512 cases, EVEX prefix is needed in order to carry this information
367	// thus preventing the transformation to VEX encoding.
368	bool IsND = X86II::hasNewDataDest(TSFlags);
369	unsigned Opc = MI.getOpcode();
370	bool IsSetZUCCm = Opc == X86::SETZUCCm;
371	if (TSFlags & X86II::EVEX_B && !IsND && !IsSetZUCCm)
372	return false;
373	// MOVBErr is special because it has semantic of NDD but not set EVEX_B.*
374	bool IsNDLike = IsND \|\| Opc == X86::MOVBE32rr \|\| Opc == X86::MOVBE64rr;
375	bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest (Opc) : false;
376
377	auto GetCompressedOpc = [&](unsigned Opc) -> unsigned {
378	ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
379	const auto I = llvm::lower_bound(Range&: Table, Value&: Opc);
380	if (I == Table.end() \|\| I->OldOpc != Opc)
381	return `0`;
382
383	if (usesExtendedRegister(MI) \|\| !checkPredicate(Opc: I->NewOpc, Subtarget: &ST) \|\|
384	!performCustomAdjustments(MI, NewOpc: I->NewOpc))
385	return `0`;
386	return I->NewOpc;
387	};
388
389	Register Dst = MI.getOperand(i: `0`).getReg();
390	if (IsRedundantNDD) {
391	// Redundant NDD ops cannot be safely compressed if either:
392	// - the legacy op would introduce a partial write that BreakFalseDeps
393	// identified as a potential stall, or
394	// - the op is writing to a subregister of a live register, i.e. the
395	// full (zeroed) result is used.
396	// Both cases are indicated by an implicit def of the superregister.
397	if (Dst &&
398	(X86::GR16RegClass.contains(Reg: Dst) \|\| X86::GR8RegClass.contains(Reg: Dst))) {
399	Register Super = getX86SubSuperRegister(Reg: Dst, Size: `64`);
400	if (MI.definesRegister(Reg: Super, /TRI=/nullptr))
401	IsRedundantNDD = false;
402	}
403
404	// ADDrm/mr instructions with NDD + relocation had been transformed to the
405	// instructions without NDD in X86SuppressAPXForRelocation pass. That is to
406	// keep backward compatibility with linkers without APX support.
407	if (!X86EnableAPXForRelocation)
408	assert(!isAddMemInstrWithRelocation(MI) &&
409	"Unexpected NDD instruction with relocation!");
410	} else if (Opc == X86::ADD32ri_ND \|\| Opc == X86::ADD64ri32_ND \|\|
411	Opc == X86::ADD32rr_ND \|\| Opc == X86::ADD64rr_ND) {
412	// Non-redundant NDD ADD can be compressed to LEA when:
413	// - No EGPR register used and
414	// - EFLAGS is dead.
415	if (!usesExtendedRegister(MI) &&
416	MI.registerDefIsDead(Reg: X86::EFLAGS, /TRI=/nullptr)) {
417	Register Src1 = MI.getOperand(i: `1`).getReg();
418	const MachineOperand &Src2 = MI.getOperand(i: `2`);
419	bool Is32BitReg = Opc == X86::ADD32ri_ND \|\| Opc == X86::ADD32rr_ND;
420	const MCInstrDesc &NewDesc =
421	ST.getInstrInfo()->get(Opcode: Is32BitReg ? X86::LEA64_32r : X86::LEA64r);
422	if (Is32BitReg)
423	Src1 = getX86SubSuperRegister(Reg: Src1, Size: `64`);
424	MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: NewDesc, DestReg: Dst)
425	.addReg(RegNo: Src1)
426	.addImm(Val: `1`);
427	if (Opc == X86::ADD32ri_ND \|\| Opc == X86::ADD64ri32_ND)
428	MIB.addReg(RegNo: `0`).add(MO: Src2);
429	else if (Is32BitReg)
430	MIB.addReg(RegNo: getX86SubSuperRegister(Reg: Src2.getReg(), Size: `64`)).addImm(Val: `0`);
431	else
432	MIB.add(MO: Src2).addImm(Val: `0`);
433	MIB.addReg(RegNo: `0`);
434	MI.removeFromParent();
435	return true;
436	}
437	}
438
439	// NonNF -> NF only if it's not a compressible NDD instruction and eflags is
440	// dead.
441	unsigned NewOpc = IsRedundantNDD
442	? X86::getNonNDVariant(Opc)
443	: ((IsNDLike && ST.hasNF() &&
444	MI.registerDefIsDead(Reg: X86::EFLAGS, /TRI=/nullptr))
445	? X86::getNFVariant(Opc)
446	: GetCompressedOpc (Opc));
447
448	if (!NewOpc)
449	return false;
450
451	const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(Opcode: NewOpc);
452	MI.setDesc(NewDesc);
453	unsigned AsmComment;
454	switch (NewDesc.TSFlags & X86II::EncodingMask) {
455	case X86II::LEGACY:
456	AsmComment = X86::AC_EVEX_2_LEGACY;
457	break;
458	case X86II::VEX:
459	AsmComment = X86::AC_EVEX_2_VEX;
460	break;
461	case X86II::EVEX:
462	AsmComment = X86::AC_EVEX_2_EVEX;
463	assert(IsND && (NewDesc.TSFlags & X86II::EVEX_NF) &&
464	"Unknown EVEX2EVEX compression");
465	break;
466	default:
467	llvm_unreachable("Unknown EVEX compression");
468	}
469	MI.setAsmPrinterFlag(AsmComment);
470	if (IsRedundantNDD)
471	MI.tieOperands(DefIdx: `0`, UseIdx: `1`);
472
473	return true;
474	}
475
476	static bool runOnMF(MachineFunction &MF) {
477	LLVM_DEBUG(dbgs() << "Start X86CompressEVEXPass\n";);
478	#ifndef NDEBUG
479	// Make sure the tables are sorted.
480	static std::atomic<bool> TableChecked(false);
481	if (!TableChecked.load(std::memory_order_relaxed)) {
482	assert(llvm::is_sorted(X86CompressEVEXTable) &&
483	"X86CompressEVEXTable is not sorted!");
484	TableChecked.store(true, std::memory_order_relaxed);
485	}
486	#endif
487	const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
488	if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD() && !ST.hasZU())
489	return false;
490
491	bool Changed = false;
492
493	for (MachineBasicBlock &MBB : MF) {
494	SmallVector<MachineInstr *, `4`> ToErase;
495
496	for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
497	Changed \|= CompressEVEXImpl(MI, MBB, ST, ToErase);
498	}
499
500	for (MachineInstr *MI : ToErase) {
501	MI->eraseFromParent();
502	}
503	}
504	LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
505	return Changed;
506	}
507
508	INITIALIZE_PASS(CompressEVEXLegacy, COMP_EVEX_NAME, COMP_EVEX_DESC, false,
509	false)
510
511	FunctionPass *llvm::createX86CompressEVEXLegacyPass() {
512	return new CompressEVEXLegacy ();
513	}
514
515	bool CompressEVEXLegacy::runOnMachineFunction(MachineFunction &MF) {
516	return runOnMF(MF);
517	}
518
519	PreservedAnalyses
520	X86CompressEVEXPass::run(MachineFunction &MF,
521	MachineFunctionAnalysisManager &MFAM) {
522	bool Changed = runOnMF(MF);
523	if (!Changed)
524	return PreservedAnalyses::all();
525	PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
526	PA.preserveSet<CFGAnalyses>();
527	return PA;
528	}
529

Browse the source code of llvm_projects/llvm/lib/Target/X86/X86CompressEVEX.cpp