1//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8#include "../Target.h"
9
10#include "../Error.h"
11#include "../MmapUtils.h"
12#include "../ParallelSnippetGenerator.h"
13#include "../SerialSnippetGenerator.h"
14#include "../SnippetGenerator.h"
15#include "../SubprocessMemory.h"
16#include "MCTargetDesc/X86BaseInfo.h"
17#include "MCTargetDesc/X86MCTargetDesc.h"
18#include "X86.h"
19#include "X86Counter.h"
20#include "X86RegisterInfo.h"
21#include "llvm/ADT/Sequence.h"
22#include "llvm/CodeGen/MachineInstrBuilder.h"
23#include "llvm/MC/MCInstBuilder.h"
24#include "llvm/Support/Errc.h"
25#include "llvm/Support/Error.h"
26#include "llvm/Support/ErrorHandling.h"
27#include "llvm/Support/FormatVariadic.h"
28#include "llvm/TargetParser/Host.h"
29
30#include <memory>
31#include <string>
32#include <vector>
33#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
34#include <immintrin.h>
35#include <intrin.h>
36#endif
37#if defined(_MSC_VER) && defined(_M_X64)
38#include <float.h> // For _clearfp in ~X86SavedState().
39#endif
40
41#ifdef __linux__
42#ifdef __x86_64__
43#include <asm/prctl.h>
44#endif // __x86_64__
45#include <sys/mman.h>
46#include <sys/syscall.h>
47#include <unistd.h>
48#ifdef HAVE_LIBPFM
49#include <perfmon/perf_event.h>
50#endif // HAVE_LIBPFM
51#endif
52
53#define GET_AVAILABLE_OPCODE_CHECKER
54#include "X86GenInstrInfo.inc"
55
56namespace llvm {
57namespace exegesis {
58
59// If a positive value is specified, we are going to use the LBR in
60// latency-mode.
61//
62// Note:
63// - A small value is preferred, but too low a value could result in
64// throttling.
65// - A prime number is preferred to avoid always skipping certain blocks.
66//
67static cl::opt<unsigned> LbrSamplingPeriod(
68 "x86-lbr-sample-period",
69 cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
70 cl::cat(BenchmarkOptions), cl::init(Val: 0));
71
72static cl::opt<bool>
73 DisableUpperSSERegisters("x86-disable-upper-sse-registers",
74 cl::desc("Disable XMM8-XMM15 register usage"),
75 cl::cat(BenchmarkOptions), cl::init(Val: false));
76
// FIXME: Validate that repetition-mode is loop if LBR is requested.
78
79// Returns a non-null reason if we cannot handle the memory references in this
80// instruction.
81static const char *isInvalidMemoryInstr(const Instruction &Instr) {
82 switch (Instr.Description.TSFlags & X86II::FormMask) {
83 default:
84 return "Unknown FormMask value";
85 // These have no memory access.
86 case X86II::Pseudo:
87 case X86II::RawFrm:
88 case X86II::AddCCFrm:
89 case X86II::PrefixByte:
90 case X86II::MRMDestReg:
91 case X86II::MRMSrcReg:
92 case X86II::MRMSrcReg4VOp3:
93 case X86II::MRMSrcRegOp4:
94 case X86II::MRMSrcRegCC:
95 case X86II::MRMXrCC:
96 case X86II::MRMr0:
97 case X86II::MRMXr:
98 case X86II::MRM0r:
99 case X86II::MRM1r:
100 case X86II::MRM2r:
101 case X86II::MRM3r:
102 case X86II::MRM4r:
103 case X86II::MRM5r:
104 case X86II::MRM6r:
105 case X86II::MRM7r:
106 case X86II::MRM0X:
107 case X86II::MRM1X:
108 case X86II::MRM2X:
109 case X86II::MRM3X:
110 case X86II::MRM4X:
111 case X86II::MRM5X:
112 case X86II::MRM6X:
113 case X86II::MRM7X:
114 case X86II::MRM_C0:
115 case X86II::MRM_C1:
116 case X86II::MRM_C2:
117 case X86II::MRM_C3:
118 case X86II::MRM_C4:
119 case X86II::MRM_C5:
120 case X86II::MRM_C6:
121 case X86II::MRM_C7:
122 case X86II::MRM_C8:
123 case X86II::MRM_C9:
124 case X86II::MRM_CA:
125 case X86II::MRM_CB:
126 case X86II::MRM_CC:
127 case X86II::MRM_CD:
128 case X86II::MRM_CE:
129 case X86II::MRM_CF:
130 case X86II::MRM_D0:
131 case X86II::MRM_D1:
132 case X86II::MRM_D2:
133 case X86II::MRM_D3:
134 case X86II::MRM_D4:
135 case X86II::MRM_D5:
136 case X86II::MRM_D6:
137 case X86II::MRM_D7:
138 case X86II::MRM_D8:
139 case X86II::MRM_D9:
140 case X86II::MRM_DA:
141 case X86II::MRM_DB:
142 case X86II::MRM_DC:
143 case X86II::MRM_DD:
144 case X86II::MRM_DE:
145 case X86II::MRM_DF:
146 case X86II::MRM_E0:
147 case X86II::MRM_E1:
148 case X86II::MRM_E2:
149 case X86II::MRM_E3:
150 case X86II::MRM_E4:
151 case X86II::MRM_E5:
152 case X86II::MRM_E6:
153 case X86II::MRM_E7:
154 case X86II::MRM_E8:
155 case X86II::MRM_E9:
156 case X86II::MRM_EA:
157 case X86II::MRM_EB:
158 case X86II::MRM_EC:
159 case X86II::MRM_ED:
160 case X86II::MRM_EE:
161 case X86II::MRM_EF:
162 case X86II::MRM_F0:
163 case X86II::MRM_F1:
164 case X86II::MRM_F2:
165 case X86II::MRM_F3:
166 case X86II::MRM_F4:
167 case X86II::MRM_F5:
168 case X86II::MRM_F6:
169 case X86II::MRM_F7:
170 case X86II::MRM_F8:
171 case X86II::MRM_F9:
172 case X86II::MRM_FA:
173 case X86II::MRM_FB:
174 case X86II::MRM_FC:
175 case X86II::MRM_FD:
176 case X86II::MRM_FE:
177 case X86II::MRM_FF:
178 case X86II::RawFrmImm8:
179 return nullptr;
180 case X86II::AddRegFrm:
181 return (Instr.Description.Opcode == X86::POP16r ||
182 Instr.Description.Opcode == X86::POP32r ||
183 Instr.Description.Opcode == X86::PUSH16r ||
184 Instr.Description.Opcode == X86::PUSH32r)
185 ? "unsupported opcode: unsupported memory access"
186 : nullptr;
187 // These access memory and are handled.
188 case X86II::MRMDestMem:
189 case X86II::MRMSrcMem:
190 case X86II::MRMSrcMem4VOp3:
191 case X86II::MRMSrcMemOp4:
192 case X86II::MRMSrcMemCC:
193 case X86II::MRMXmCC:
194 case X86II::MRMXm:
195 case X86II::MRM0m:
196 case X86II::MRM1m:
197 case X86II::MRM2m:
198 case X86II::MRM3m:
199 case X86II::MRM4m:
200 case X86II::MRM5m:
201 case X86II::MRM6m:
202 case X86II::MRM7m:
203 return nullptr;
204 // These access memory and are not handled yet.
205 case X86II::RawFrmImm16:
206 case X86II::RawFrmMemOffs:
207 case X86II::RawFrmSrc:
208 case X86II::RawFrmDst:
209 case X86II::RawFrmDstSrc:
210 return "unsupported opcode: non uniform memory access";
211 }
212}
213
// If the opcode is invalid, returns a pointer to a string literal describing
// the reason. nullptr indicates a valid opcode.
216static const char *isInvalidOpcode(const Instruction &Instr) {
217 const auto OpcodeName = Instr.Name;
218 if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
219 return "unsupported opcode: pseudo instruction";
220 if ((OpcodeName.starts_with(Prefix: "POP") && !OpcodeName.starts_with(Prefix: "POPCNT")) ||
221 OpcodeName.starts_with(Prefix: "PUSH") ||
222 OpcodeName.starts_with(Prefix: "ADJCALLSTACK") || OpcodeName.starts_with(Prefix: "LEAVE"))
223 return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
224 switch (Instr.Description.Opcode) {
225 case X86::LFS16rm:
226 case X86::LFS32rm:
227 case X86::LFS64rm:
228 case X86::LGS16rm:
229 case X86::LGS32rm:
230 case X86::LGS64rm:
231 case X86::LSS16rm:
232 case X86::LSS32rm:
233 case X86::LSS64rm:
234 case X86::SYSENTER:
235 case X86::WRFSBASE:
236 case X86::WRFSBASE64:
237 return "unsupported opcode";
238 default:
239 break;
240 }
241 if (const auto reason = isInvalidMemoryInstr(Instr))
242 return reason;
243 // We do not handle instructions with OPERAND_PCREL.
244 for (const Operand &Op : Instr.Operands)
245 if (Op.isExplicit() &&
246 Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
247 return "unsupported opcode: PC relative operand";
248 // We do not handle second-form X87 instructions. We only handle first-form
249 // ones (_Fp), see comment in X86InstrFPStack.td.
250 for (const Operand &Op : Instr.Operands)
251 if (Op.isReg() && Op.isExplicit() &&
252 Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
253 return "unsupported second-form X87 instruction";
254 return nullptr;
255}
256
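// Extracts the x87 floating-point type bits (X86II::FPTypeMask) from the
// instruction's TSFlags.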
257static unsigned getX86FPFlags(const Instruction &Instr) {
258 return Instr.Description.TSFlags & X86II::FPTypeMask;
259}
260
261// Helper to fill a memory operand with a value.
262static void setMemOp(InstructionTemplate &IT, int OpIdx,
263 const MCOperand &OpVal) {
264 const auto Op = IT.getInstr().Operands[OpIdx];
265 assert(Op.isExplicit() && "invalid memory pattern");
266 IT.getValueFor(Op) = OpVal;
267}
268
// Common (latency, uops) code for LEA templates. `RestrictDestRegs` takes the
// addressing base and index registers and narrows the set of candidate LEA
// destination registers.
271static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
272 const Instruction &Instr, const BitVector &ForbiddenRegisters,
273 const LLVMState &State, const SnippetGenerator::Options &Opts,
274 std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
275 RestrictDestRegs) {
276 assert(Instr.Operands.size() == 6 && "invalid LEA");
277 assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
278 "invalid LEA");
279
280 constexpr const int kDestOp = 0;
281 constexpr const int kBaseOp = 1;
282 constexpr const int kIndexOp = 3;
283 auto PossibleDestRegs =
284 Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
285 remove(A&: PossibleDestRegs, B: ForbiddenRegisters);
286 auto PossibleBaseRegs =
287 Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
288 remove(A&: PossibleBaseRegs, B: ForbiddenRegisters);
289 auto PossibleIndexRegs =
290 Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
291 remove(A&: PossibleIndexRegs, B: ForbiddenRegisters);
292
293 const auto &RegInfo = State.getRegInfo();
294 std::vector<CodeTemplate> Result;
295 for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
296 for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
297 for (int LogScale = 0; LogScale <= 3; ++LogScale) {
298 // FIXME: Add an option for controlling how we explore immediates.
299 for (const int Disp : {0, 42}) {
300 InstructionTemplate IT(&Instr);
301 const int64_t Scale = 1ull << LogScale;
302 setMemOp(IT, OpIdx: 1, OpVal: MCOperand::createReg(Reg: BaseReg));
303 setMemOp(IT, OpIdx: 2, OpVal: MCOperand::createImm(Val: Scale));
304 setMemOp(IT, OpIdx: 3, OpVal: MCOperand::createReg(Reg: IndexReg));
305 setMemOp(IT, OpIdx: 4, OpVal: MCOperand::createImm(Val: Disp));
306 // SegmentReg must be 0 for LEA.
307 setMemOp(IT, OpIdx: 5, OpVal: MCOperand::createReg(Reg: 0));
308
309 // Output reg candidates are selected by the caller.
310 auto PossibleDestRegsNow = PossibleDestRegs;
311 RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
312 assert(PossibleDestRegsNow.set_bits().begin() !=
313 PossibleDestRegsNow.set_bits().end() &&
314 "no remaining registers");
315 setMemOp(
316 IT, OpIdx: 0,
317 OpVal: MCOperand::createReg(Reg: *PossibleDestRegsNow.set_bits().begin()));
318
319 CodeTemplate CT;
320 CT.Instructions.push_back(x: std::move(IT));
321 CT.Config = formatv(Fmt: "{3}(%{0}, %{1}, {2})", Vals: RegInfo.getName(RegNo: BaseReg),
322 Vals: RegInfo.getName(RegNo: IndexReg), Vals: Scale, Vals: Disp)
323 .str();
324 Result.push_back(x: std::move(CT));
325 if (Result.size() >= Opts.MaxConfigsPerOpcode)
326 return std::move(Result);
327 }
328 }
329 }
330 }
331
332 return std::move(Result);
333}
334
335namespace {
336class X86SerialSnippetGenerator : public SerialSnippetGenerator {
337public:
338 using SerialSnippetGenerator::SerialSnippetGenerator;
339
340 Expected<std::vector<CodeTemplate>>
341 generateCodeTemplates(InstructionTemplate Variant,
342 const BitVector &ForbiddenRegisters) const override;
343};
344} // namespace
345
346Expected<std::vector<CodeTemplate>>
347X86SerialSnippetGenerator::generateCodeTemplates(
348 InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
349 const Instruction &Instr = Variant.getInstr();
350
351 if (const auto reason = isInvalidOpcode(Instr))
352 return make_error<Failure>(Args: reason);
353
354 // LEA gets special attention.
355 const auto Opcode = Instr.Description.getOpcode();
356 if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
357 return generateLEATemplatesCommon(
358 Instr, ForbiddenRegisters, State, Opts,
359 RestrictDestRegs: [this](unsigned BaseReg, unsigned IndexReg,
360 BitVector &CandidateDestRegs) {
361 // We just select a destination register that aliases the base
362 // register.
363 CandidateDestRegs &=
364 State.getRATC().getRegister(Reg: BaseReg).aliasedBits();
365 });
366 }
367
368 if (Instr.hasMemoryOperands())
369 return make_error<Failure>(
370 Args: "unsupported memory operand in latency measurements");
371
372 switch (getX86FPFlags(Instr)) {
373 case X86II::NotFP:
374 return SerialSnippetGenerator::generateCodeTemplates(Variant,
375 ForbiddenRegisters);
376 case X86II::ZeroArgFP:
377 case X86II::OneArgFP:
378 case X86II::SpecialFP:
379 case X86II::CompareFP:
380 case X86II::CondMovFP:
381 return make_error<Failure>(Args: "Unsupported x87 Instruction");
382 case X86II::OneArgFPRW:
383 case X86II::TwoArgFP:
384 // These are instructions like
385 // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
386 // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
387 // They are intrinsically serial and do not modify the state of the stack.
388 return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
389 default:
390 llvm_unreachable("Unknown FP Type!");
391 }
392}
393
394namespace {
395class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
396public:
397 using ParallelSnippetGenerator::ParallelSnippetGenerator;
398
399 Expected<std::vector<CodeTemplate>>
400 generateCodeTemplates(InstructionTemplate Variant,
401 const BitVector &ForbiddenRegisters) const override;
402};
403
404} // namespace
405
406Expected<std::vector<CodeTemplate>>
407X86ParallelSnippetGenerator::generateCodeTemplates(
408 InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
409 const Instruction &Instr = Variant.getInstr();
410
411 if (const auto reason = isInvalidOpcode(Instr))
412 return make_error<Failure>(Args: reason);
413
414 // LEA gets special attention.
415 const auto Opcode = Instr.Description.getOpcode();
416 if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
417 return generateLEATemplatesCommon(
418 Instr, ForbiddenRegisters, State, Opts,
419 RestrictDestRegs: [this](unsigned BaseReg, unsigned IndexReg,
420 BitVector &CandidateDestRegs) {
421 // Any destination register that is not used for addressing is fine.
422 remove(A&: CandidateDestRegs,
423 B: State.getRATC().getRegister(Reg: BaseReg).aliasedBits());
424 remove(A&: CandidateDestRegs,
425 B: State.getRATC().getRegister(Reg: IndexReg).aliasedBits());
426 });
427 }
428
429 switch (getX86FPFlags(Instr)) {
430 case X86II::NotFP:
431 return ParallelSnippetGenerator::generateCodeTemplates(Variant,
432 ForbiddenRegisters);
433 case X86II::ZeroArgFP:
434 case X86II::OneArgFP:
435 case X86II::SpecialFP:
436 return make_error<Failure>(Args: "Unsupported x87 Instruction");
437 case X86II::OneArgFPRW:
438 case X86II::TwoArgFP:
439 // These are instructions like
440 // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
441 // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
442 // They are intrinsically serial and do not modify the state of the stack.
443 // We generate the same code for latency and uops.
444 return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
445 case X86II::CompareFP:
446 case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (it either does not touch the stack or pushes as much as it
    // pops).
449 return generateUnconstrainedCodeTemplates(
450 Variant, Msg: "instruction does not grow/shrink the FP stack");
451 default:
452 llvm_unreachable("Unknown FP Type!");
453 }
454}
455
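// Returns the MOVri opcode that loads an immediate into a register of the
// given bit width.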
456static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
457 switch (RegBitWidth) {
458 case 8:
459 return X86::MOV8ri;
460 case 16:
461 return X86::MOV16ri;
462 case 32:
463 return X86::MOV32ri;
464 case 64:
465 return X86::MOV64ri;
466 }
467 llvm_unreachable("Invalid Value Width");
468}
469
// Generates an instruction that loads an immediate value into a register.
471static MCInst loadImmediate(MCRegister Reg, unsigned RegBitWidth,
472 const APInt &Value) {
473 if (Value.getBitWidth() > RegBitWidth)
474 llvm_unreachable("Value must fit in the Register");
475 return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
476 .addReg(Reg)
477 .addImm(Val: Value.getZExtValue());
478}
479
480// Allocates scratch memory on the stack.
481static MCInst allocateStackSpace(unsigned Bytes) {
482 return MCInstBuilder(X86::SUB64ri8)
483 .addReg(Reg: X86::RSP)
484 .addReg(Reg: X86::RSP)
485 .addImm(Val: Bytes);
486}
487
488// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
489static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
490 uint64_t Imm) {
491 return MCInstBuilder(MovOpcode)
      // Address = RSP
493 .addReg(Reg: X86::RSP) // BaseReg
494 .addImm(Val: 1) // ScaleAmt
495 .addReg(Reg: 0) // IndexReg
496 .addImm(Val: OffsetBytes) // Disp
497 .addReg(Reg: 0) // Segment
498 // Immediate.
499 .addImm(Val: Imm);
500}
501
502// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
503static MCInst loadToReg(MCRegister Reg, unsigned RMOpcode) {
504 return MCInstBuilder(RMOpcode)
505 .addReg(Reg)
      // Address = RSP
507 .addReg(Reg: X86::RSP) // BaseReg
508 .addImm(Val: 1) // ScaleAmt
509 .addReg(Reg: 0) // IndexReg
510 .addImm(Val: 0) // Disp
511 .addReg(Reg: 0); // Segment
512}
513
514// Releases scratch memory.
515static MCInst releaseStackSpace(unsigned Bytes) {
516 return MCInstBuilder(X86::ADD64ri8)
517 .addReg(Reg: X86::RSP)
518 .addReg(Reg: X86::RSP)
519 .addImm(Val: Bytes);
520}
521
// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
524namespace {
525struct ConstantInliner {
526 explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}
527
528 std::vector<MCInst> loadAndFinalize(MCRegister Reg, unsigned RegBitWidth,
529 unsigned Opcode);
530
531 std::vector<MCInst> loadX87STAndFinalize(MCRegister Reg);
532
533 std::vector<MCInst> loadX87FPAndFinalize(MCRegister Reg);
534
535 std::vector<MCInst> popFlagAndFinalize();
536
537 std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
538 unsigned Value);
539
540 std::vector<MCInst> loadDirectionFlagAndFinalize();
541
542private:
543 ConstantInliner &add(const MCInst &Inst) {
544 Instructions.push_back(x: Inst);
545 return *this;
546 }
547
548 void initStack(unsigned Bytes);
549
550 static constexpr const unsigned kF80Bytes = 10; // 80 bits.
551
552 APInt Constant_;
553 std::vector<MCInst> Instructions;
554};
555} // namespace
556
557std::vector<MCInst> ConstantInliner::loadAndFinalize(MCRegister Reg,
558 unsigned RegBitWidth,
559 unsigned Opcode) {
560 assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
561 initStack(Bytes: RegBitWidth / 8);
562 add(Inst: loadToReg(Reg, RMOpcode: Opcode));
563 add(Inst: releaseStackSpace(Bytes: RegBitWidth / 8));
564 return std::move(Instructions);
565}
566
567std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(MCRegister Reg) {
568 initStack(Bytes: kF80Bytes);
569 add(Inst: MCInstBuilder(X86::LD_F80m)
          // Address = RSP
571 .addReg(Reg: X86::RSP) // BaseReg
572 .addImm(Val: 1) // ScaleAmt
573 .addReg(Reg: 0) // IndexReg
574 .addImm(Val: 0) // Disp
575 .addReg(Reg: 0)); // Segment
576 if (Reg != X86::ST0)
577 add(Inst: MCInstBuilder(X86::ST_Frr).addReg(Reg));
578 add(Inst: releaseStackSpace(Bytes: kF80Bytes));
579 return std::move(Instructions);
580}
581
582std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(MCRegister Reg) {
583 initStack(Bytes: kF80Bytes);
584 add(Inst: MCInstBuilder(X86::LD_Fp80m)
585 .addReg(Reg)
          // Address = RSP
587 .addReg(Reg: X86::RSP) // BaseReg
588 .addImm(Val: 1) // ScaleAmt
589 .addReg(Reg: 0) // IndexReg
590 .addImm(Val: 0) // Disp
591 .addReg(Reg: 0)); // Segment
592 add(Inst: releaseStackSpace(Bytes: kF80Bytes));
593 return std::move(Instructions);
594}
595
596std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
597 initStack(Bytes: 8);
598 add(Inst: MCInstBuilder(X86::POPF64));
599 return std::move(Instructions);
600}
601
602std::vector<MCInst>
603ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
604 add(Inst: allocateStackSpace(Bytes: 4));
605 add(Inst: fillStackSpace(MovOpcode: X86::MOV32mi, OffsetBytes: 0, Imm: Value)); // Mask all FP exceptions
606 add(Inst: MCInstBuilder(Opcode)
          // Address = RSP
608 .addReg(Reg: X86::RSP) // BaseReg
609 .addImm(Val: 1) // ScaleAmt
610 .addReg(Reg: 0) // IndexReg
611 .addImm(Val: 0) // Disp
612 .addReg(Reg: 0)); // Segment
613 add(Inst: releaseStackSpace(Bytes: 4));
614 return std::move(Instructions);
615}
616
617std::vector<MCInst> ConstantInliner::loadDirectionFlagAndFinalize() {
618 if (Constant_.isZero())
619 add(Inst: MCInstBuilder(X86::CLD));
620 else if (Constant_.isOne())
621 add(Inst: MCInstBuilder(X86::STD));
622
623 return std::move(Instructions);
624}
625
626void ConstantInliner::initStack(unsigned Bytes) {
627 assert(Constant_.getBitWidth() <= Bytes * 8 &&
628 "Value does not have the correct size");
629 const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
630 ? Constant_.sext(width: Bytes * 8)
631 : Constant_;
632 add(Inst: allocateStackSpace(Bytes));
633 size_t ByteOffset = 0;
634 for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
635 add(Inst: fillStackSpace(
636 MovOpcode: X86::MOV32mi, OffsetBytes: ByteOffset,
637 Imm: WideConstant.extractBits(numBits: 32, bitPosition: ByteOffset * 8).getZExtValue()));
638 if (Bytes - ByteOffset >= 2) {
639 add(Inst: fillStackSpace(
640 MovOpcode: X86::MOV16mi, OffsetBytes: ByteOffset,
641 Imm: WideConstant.extractBits(numBits: 16, bitPosition: ByteOffset * 8).getZExtValue()));
642 ByteOffset += 2;
643 }
644 if (Bytes - ByteOffset >= 1)
645 add(Inst: fillStackSpace(
646 MovOpcode: X86::MOV8mi, OffsetBytes: ByteOffset,
647 Imm: WideConstant.extractBits(numBits: 8, bitPosition: ByteOffset * 8).getZExtValue()));
648}
649
650#include "X86GenExegesis.inc"
651
652namespace {
653
654class X86SavedState : public ExegesisTarget::SavedState {
655public:
656 X86SavedState() {
657#if defined(_MSC_VER) && defined(_M_X64)
658 _fxsave64(FPState);
659 Eflags = __readeflags();
660#elif defined(__GNUC__) && defined(__x86_64__)
661 __builtin_ia32_fxsave64(FPState);
662 Eflags = __builtin_ia32_readeflags_u64();
663#else
664 report_fatal_error("X86 exegesis running on unsupported target");
665#endif
666 }
667
668 ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
671#if defined(_MSC_VER) && defined(_M_X64)
672 _clearfp();
673 _fxrstor64(FPState);
674 __writeeflags(Eflags);
675#elif defined(__GNUC__) && defined(__x86_64__)
676 asm volatile("fwait");
677 __builtin_ia32_fxrstor64(FPState);
678 __builtin_ia32_writeeflags_u64(Eflags);
679#else
680 report_fatal_error("X86 exegesis running on unsupported target");
681#endif
682 }
683
684private:
685#if defined(__x86_64__) || defined(_M_X64)
686 alignas(16) char FPState[512];
687 uint64_t Eflags;
688#endif
689};
690
691class ExegesisX86Target : public ExegesisTarget {
692public:
693 ExegesisX86Target()
694 : ExegesisTarget(X86CpuPfmCounters, X86_MC::isOpcodeAvailable) {}
695
696 Expected<std::unique_ptr<pfm::CounterGroup>>
697 createCounter(StringRef CounterName, const LLVMState &State,
698 ArrayRef<const char *> ValidationCounters,
699 const pid_t ProcessID) const override {
700 // If LbrSamplingPeriod was provided, then ignore the
701 // CounterName because we only have one for LBR.
702 if (LbrSamplingPeriod > 0) {
      // LBR cannot be used without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, and
      // __linux__ (for now).
705#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \
706 defined(__linux__)
707 // TODO(boomanaiden154): Add in support for using validation counters when
708 // using LBR counters.
709 if (ValidationCounters.size() > 0)
710 return make_error<StringError>(
711 "Using LBR is not currently supported with validation counters",
712 errc::invalid_argument);
713
714 return std::make_unique<X86LbrCounter>(
715 X86LbrPerfEvent(LbrSamplingPeriod));
716#else
717 return make_error<StringError>(
718 Args: "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
719 "or running on Linux.",
720 Args: errc::invalid_argument);
721#endif
722 }
723 return ExegesisTarget::createCounter(CounterName, State, ValidationCounters,
724 ProcessID);
725 }
726
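  // Registers used to hold the snippet code size and the auxiliary memory
  // file descriptor handed over by the parent process (see
  // moveArgumentRegisters below).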
727 enum ArgumentRegisters { CodeSize = X86::R12, AuxiliaryMemoryFD = X86::R13 };
728
729private:
730 void addTargetSpecificPasses(PassManagerBase &PM) const override;
731
732 MCRegister getScratchMemoryRegister(const Triple &TT) const override;
733
734 MCRegister getDefaultLoopCounterRegister(const Triple &) const override;
735
736 unsigned getMaxMemoryAccessSize() const override { return 64; }
737
738 Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
739 MCOperand &AssignedValue,
740 const BitVector &ForbiddenRegs) const override;
741
742 void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg,
743 unsigned Offset) const override;
744
745 void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
746 MachineBasicBlock &TargetMBB,
747 const MCInstrInfo &MII,
748 MCRegister LoopRegister) const override;
749
750 std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, MCRegister Reg,
751 const APInt &Value) const override;
752
753#ifdef __linux__
754 void generateLowerMunmap(std::vector<MCInst> &GeneratedCode) const override;
755
756 void generateUpperMunmap(std::vector<MCInst> &GeneratedCode) const override;
757
758 std::vector<MCInst> generateExitSyscall(unsigned ExitCode) const override;
759
760 std::vector<MCInst>
761 generateMmap(uintptr_t Address, size_t Length,
762 uintptr_t FileDescriptorAddress) const override;
763
764 void generateMmapAuxMem(std::vector<MCInst> &GeneratedCode) const override;
765
766 void moveArgumentRegisters(std::vector<MCInst> &GeneratedCode) const override;
767
768 std::vector<MCInst> generateMemoryInitialSetup() const override;
769
770 std::vector<MCInst> setStackRegisterToAuxMem() const override;
771
772 uintptr_t getAuxiliaryMemoryStartAddress() const override;
773
774 std::vector<MCInst> configurePerfCounter(long Request, bool SaveRegisters) const override;
775
776 std::vector<MCRegister> getArgumentRegisters() const override;
777
778 std::vector<MCRegister> getRegistersNeedSaving() const override;
779#endif // __linux__
780
781 ArrayRef<MCPhysReg> getUnavailableRegisters() const override {
782 if (DisableUpperSSERegisters)
783 return ArrayRef(kUnavailableRegistersSSE);
784
785 return ArrayRef(kUnavailableRegisters);
786 }
787
788 bool allowAsBackToBack(const Instruction &Instr) const override {
789 const unsigned Opcode = Instr.Description.Opcode;
790 return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
791 Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
792 }
793
794 std::vector<InstructionTemplate>
795 generateInstructionVariants(const Instruction &Instr,
796 unsigned MaxConfigsPerOpcode) const override;
797
798 std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
799 const LLVMState &State,
800 const SnippetGenerator::Options &Opts) const override {
801 return std::make_unique<X86SerialSnippetGenerator>(args: State, args: Opts);
802 }
803
804 std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
805 const LLVMState &State,
806 const SnippetGenerator::Options &Opts) const override {
807 return std::make_unique<X86ParallelSnippetGenerator>(args: State, args: Opts);
808 }
809
810 bool matchesArch(Triple::ArchType Arch) const override {
811 return Arch == Triple::x86_64 || Arch == Triple::x86;
812 }
813
814 Error checkFeatureSupport() const override {
815 // LBR is the only feature we conditionally support now.
816 // So if LBR is not requested, then we should be able to run the benchmarks.
817 if (LbrSamplingPeriod == 0)
818 return Error::success();
819
820#if defined(__linux__) && defined(HAVE_LIBPFM) && \
821 defined(LIBPFM_HAS_FIELD_CYCLES)
822 // FIXME: Fix this.
823 // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine, because
    // the counter uses some Intel-specific magic and it could get
    // confused and think an AMD machine actually has LBR support.
827#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
828 defined(_M_X64)
829 using namespace sys::detail::x86;
830
831 if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
832 // If the kernel supports it, the hardware still may not have it.
833 return X86LbrCounter::checkLbrSupport();
834#else
835 report_fatal_error("Running X86 exegesis on unsupported target");
836#endif
837#endif
838 return make_error<StringError>(
839 Args: "LBR not supported on this kernel and/or platform",
840 Args: errc::not_supported);
841 }
842
843 std::unique_ptr<SavedState> withSavedState() const override {
844 return std::make_unique<X86SavedState>();
845 }
846
847 static const MCPhysReg kUnavailableRegisters[4];
848 static const MCPhysReg kUnavailableRegistersSSE[12];
849};
850
851// We disable a few registers that cannot be encoded on instructions with a REX
852// prefix.
853const MCPhysReg ExegesisX86Target::kUnavailableRegisters[4] = {
854 X86::AH, X86::BH, X86::CH, X86::DH};
855
856// Optionally, also disable the upper (x86_64) SSE registers to reduce frontend
857// decoder load.
858const MCPhysReg ExegesisX86Target::kUnavailableRegistersSSE[12] = {
859 X86::AH, X86::BH, X86::CH, X86::DH, X86::XMM8, X86::XMM9,
860 X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15};
861
// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS implicitly uses EDI and ESI), so they have fewer
// conflicts.
865constexpr const MCPhysReg kDefaultLoopCounterReg = X86::R8;
866
867} // namespace
868
869void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
870 // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
871 PM.add(P: createX86FloatingPointStackifierPass());
872}
873
874MCRegister ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
875 if (!TT.isArch64Bit()) {
876 // FIXME: This would require popping from the stack, so we would have to
877 // add some additional setup code.
878 return MCRegister();
879 }
880 return TT.isOSWindows() ? X86::RCX : X86::RDI;
881}
882
883MCRegister
884ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const {
885 if (!TT.isArch64Bit()) {
886 return MCRegister();
887 }
888 return kDefaultLoopCounterReg;
889}
890
891Error ExegesisX86Target::randomizeTargetMCOperand(
892 const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
893 const BitVector &ForbiddenRegs) const {
894 const Operand &Op = Instr.getPrimaryOperand(Var);
895 switch (Op.getExplicitOperandInfo().OperandType) {
896 case X86::OperandType::OPERAND_COND_CODE:
897 AssignedValue =
898 MCOperand::createImm(Val: randomIndex(Max: X86::CondCode::LAST_VALID_COND));
899 return Error::success();
900 case X86::OperandType::OPERAND_ROUNDING_CONTROL:
901 AssignedValue =
902 MCOperand::createImm(Val: randomIndex(Max: X86::STATIC_ROUNDING::TO_ZERO));
903 return Error::success();
904 default:
905 break;
906 }
907 return make_error<Failure>(
908 Args: Twine("unimplemented operand type ")
909 .concat(Suffix: Twine(Op.getExplicitOperandInfo().OperandType)));
910}
911
912void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
913 MCRegister Reg,
914 unsigned Offset) const {
915 assert(!isInvalidMemoryInstr(IT.getInstr()) &&
916 "fillMemoryOperands requires a valid memory instruction");
917 int MemOpIdx = X86II::getMemoryOperandNo(TSFlags: IT.getInstr().Description.TSFlags);
918 assert(MemOpIdx >= 0 && "invalid memory operand index");
919 // getMemoryOperandNo() ignores tied operands, so we have to add them back.
920 MemOpIdx += X86II::getOperandBias(Desc: IT.getInstr().Description);
921 setMemOp(IT, OpIdx: MemOpIdx + 0, OpVal: MCOperand::createReg(Reg)); // BaseReg
922 setMemOp(IT, OpIdx: MemOpIdx + 1, OpVal: MCOperand::createImm(Val: 1)); // ScaleAmt
923 setMemOp(IT, OpIdx: MemOpIdx + 2, OpVal: MCOperand::createReg(Reg: 0)); // IndexReg
924 setMemOp(IT, OpIdx: MemOpIdx + 3, OpVal: MCOperand::createImm(Val: Offset)); // Disp
925 setMemOp(IT, OpIdx: MemOpIdx + 4, OpVal: MCOperand::createReg(Reg: 0)); // Segment
926}
927
928void ExegesisX86Target::decrementLoopCounterAndJump(
929 MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
930 const MCInstrInfo &MII, MCRegister LoopRegister) const {
931 BuildMI(BB: &MBB, MIMD: DebugLoc(), MCID: MII.get(Opcode: X86::ADD64ri8))
932 .addDef(RegNo: LoopRegister)
933 .addUse(RegNo: LoopRegister)
934 .addImm(Val: -1);
935 BuildMI(BB: &MBB, MIMD: DebugLoc(), MCID: MII.get(Opcode: X86::JCC_1))
936 .addMBB(MBB: &TargetMBB)
937 .addImm(Val: X86::COND_NE);
938}
939
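// Emits a PUSH64r that saves the given register on the stack.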
940void generateRegisterStackPush(unsigned int Register,
941 std::vector<MCInst> &GeneratedCode) {
942 GeneratedCode.push_back(x: MCInstBuilder(X86::PUSH64r).addReg(Reg: Register));
943}
944
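// Emits a POP64r that restores the given register from the stack.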
945void generateRegisterStackPop(unsigned int Register,
946 std::vector<MCInst> &GeneratedCode) {
947 GeneratedCode.push_back(x: MCInstBuilder(X86::POP64r).addReg(Reg: Register));
948}
949
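// Loads the syscall number into RAX and appends a SYSCALL instruction.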
950void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) {
951 GeneratedCode.push_back(
952 x: loadImmediate(Reg: X86::RAX, RegBitWidth: 64, Value: APInt(64, SyscallNumber)));
953 GeneratedCode.push_back(x: MCInstBuilder(X86::SYSCALL));
954}
955
956// The functions below for saving and restoring system call registers are only
957// used when llvm-exegesis is built on Linux.
958#ifdef __linux__
959constexpr std::array<unsigned, 6> SyscallArgumentRegisters{
960 X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9};
961
962static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode,
963 unsigned ArgumentCount) {
  assert(ArgumentCount <= 6 &&
         "System calls on x86-64 Linux take at most six arguments");
966 // Preserve RCX and R11 (Clobbered by the system call).
967 generateRegisterStackPush(Register: X86::RCX, GeneratedCode);
968 generateRegisterStackPush(Register: X86::R11, GeneratedCode);
969 // Preserve RAX (used for the syscall number/return value).
970 generateRegisterStackPush(Register: X86::RAX, GeneratedCode);
971 // Preserve the registers used to pass arguments to the system call.
972 for (unsigned I = 0; I < ArgumentCount; ++I)
973 generateRegisterStackPush(Register: SyscallArgumentRegisters[I], GeneratedCode);
974}
975
976static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode,
977 unsigned ArgumentCount) {
  assert(ArgumentCount <= 6 &&
         "System calls on x86-64 Linux take at most six arguments");
980 // Restore the argument registers, in the opposite order of the way they are
981 // saved.
982 for (unsigned I = ArgumentCount; I > 0; --I) {
983 generateRegisterStackPop(Register: SyscallArgumentRegisters[I - 1], GeneratedCode);
984 }
985 generateRegisterStackPop(Register: X86::RAX, GeneratedCode);
986 generateRegisterStackPop(Register: X86::R11, GeneratedCode);
987 generateRegisterStackPop(Register: X86::RCX, GeneratedCode);
988}
989#endif // __linux__
990
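// Sets the base of the FS or GS segment register to `Value` by issuing an
// arch_prctl system call (only supported on x86-64 Linux).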
991static std::vector<MCInst> loadImmediateSegmentRegister(MCRegister Reg,
992 const APInt &Value) {
993#if defined(__x86_64__) && defined(__linux__)
994 assert(Value.getBitWidth() <= 64 && "Value must fit in the register.");
995 std::vector<MCInst> loadSegmentRegisterCode;
  // Preserve the syscall registers here, as we don't want to make any
  // assumptions about the order in which registers are loaded, and we might
  // already have loaded registers that we are about to clobber here.
1000 saveSyscallRegisters(GeneratedCode&: loadSegmentRegisterCode, ArgumentCount: 2);
  // Generate the instructions that make the arch_prctl system call to set
  // the segment base.
1003 int SyscallCode = 0;
1004 if (Reg == X86::FS)
1005 SyscallCode = ARCH_SET_FS;
1006 else if (Reg == X86::GS)
1007 SyscallCode = ARCH_SET_GS;
1008 else
1009 llvm_unreachable("Only the segment registers GS and FS are supported");
1010 loadSegmentRegisterCode.push_back(
1011 x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, SyscallCode)));
1012 loadSegmentRegisterCode.push_back(x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value));
1013 generateSyscall(SYS_arch_prctl, GeneratedCode&: loadSegmentRegisterCode);
1014 // Restore the registers in reverse order
1015 restoreSyscallRegisters(GeneratedCode&: loadSegmentRegisterCode, ArgumentCount: 2);
1016 return loadSegmentRegisterCode;
1017#else
1018 llvm_unreachable("Loading immediate segment registers is only supported with "
1019 "x86-64 llvm-exegesis");
1020#endif // defined(__x86_64__) && defined(__linux__)
1021}
1022
1023std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
1024 MCRegister Reg,
1025 const APInt &Value) const {
1026 if (X86::SEGMENT_REGRegClass.contains(Reg))
1027 return loadImmediateSegmentRegister(Reg, Value);
1028 if (X86::GR8RegClass.contains(Reg))
1029 return {loadImmediate(Reg, RegBitWidth: 8, Value)};
1030 if (X86::GR16RegClass.contains(Reg))
1031 return {loadImmediate(Reg, RegBitWidth: 16, Value)};
1032 if (X86::GR32RegClass.contains(Reg))
1033 return {loadImmediate(Reg, RegBitWidth: 32, Value)};
1034 if (X86::GR64RegClass.contains(Reg))
1035 return {loadImmediate(Reg, RegBitWidth: 64, Value)};
1036 if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) ||
1037 X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) {
1038 switch (Value.getBitWidth()) {
1039 case 8:
1040 if (STI.getFeatureBits()[X86::FeatureDQI]) {
1041 ConstantInliner CI(Value);
1042 return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVBkm);
1043 }
1044 [[fallthrough]];
1045 case 16:
1046 if (STI.getFeatureBits()[X86::FeatureAVX512]) {
1047 ConstantInliner CI(Value.zextOrTrunc(width: 16));
1048 return CI.loadAndFinalize(Reg, RegBitWidth: 16, Opcode: X86::KMOVWkm);
1049 }
1050 break;
1051 case 32:
1052 if (STI.getFeatureBits()[X86::FeatureBWI]) {
1053 ConstantInliner CI(Value);
1054 return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVDkm);
1055 }
1056 break;
1057 case 64:
1058 if (STI.getFeatureBits()[X86::FeatureBWI]) {
1059 ConstantInliner CI(Value);
1060 return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVQkm);
1061 }
1062 break;
1063 }
1064 }
1065 ConstantInliner CI(Value);
1066 if (X86::VR64RegClass.contains(Reg))
1067 return CI.loadAndFinalize(Reg, RegBitWidth: 64, Opcode: X86::MMX_MOVQ64rm);
1068 if (X86::VR128RegClass.contains(Reg)) {
1069 if (STI.getFeatureBits()[X86::FeatureAVX])
1070 return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::VMOVDQUrm);
1071 return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::MOVDQUrm);
1072 }
1073 if (X86::VR128XRegClass.contains(Reg)) {
1074 if (STI.getFeatureBits()[X86::FeatureAVX512])
1075 return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::VMOVDQU32Z128rm);
1076 }
1077 if (X86::VR256RegClass.contains(Reg)) {
1078 if (STI.getFeatureBits()[X86::FeatureAVX])
1079 return CI.loadAndFinalize(Reg, RegBitWidth: 256, Opcode: X86::VMOVDQUYrm);
1080 }
1081 if (X86::VR256XRegClass.contains(Reg)) {
1082 if (STI.getFeatureBits()[X86::FeatureAVX512])
1083 return CI.loadAndFinalize(Reg, RegBitWidth: 256, Opcode: X86::VMOVDQU32Z256rm);
1084 }
1085 if (X86::VR512RegClass.contains(Reg))
1086 if (STI.getFeatureBits()[X86::FeatureAVX512])
1087 return CI.loadAndFinalize(Reg, RegBitWidth: 512, Opcode: X86::VMOVDQU32Zrm);
1088 if (X86::RSTRegClass.contains(Reg)) {
1089 return CI.loadX87STAndFinalize(Reg);
1090 }
1091 if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
1092 X86::RFP80RegClass.contains(Reg)) {
1093 return CI.loadX87FPAndFinalize(Reg);
1094 }
1095 if (Reg == X86::EFLAGS)
1096 return CI.popFlagAndFinalize();
1097 if (Reg == X86::MXCSR)
1098 return CI.loadImplicitRegAndFinalize(
1099 Opcode: STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
1100 Value: 0x1f80);
1101 if (Reg == X86::FPCW)
1102 return CI.loadImplicitRegAndFinalize(Opcode: X86::FLDCW16m, Value: 0x37f);
1103 if (Reg == X86::DF)
1104 return CI.loadDirectionFlagAndFinalize();
1105 return {}; // Not yet implemented.
1106}
1107
1108#ifdef __linux__
1109
1110#ifdef __arm__
1111static constexpr const uintptr_t VAddressSpaceCeiling = 0xC0000000;
1112#else
1113static constexpr const uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
1114#endif
1115
1116void generateRoundToNearestPage(unsigned int Register,
1117 std::vector<MCInst> &GeneratedCode) {
1118 int PageSizeShift = static_cast<int>(round(x: log2(x: getpagesize())));
  // Round down to the nearest page by clearing the least significant bits
  // that encode the offset within the page: shift right to discard them, then
  // shift back left.
1122 GeneratedCode.push_back(x: MCInstBuilder(X86::SHR64ri)
1123 .addReg(Reg: Register)
1124 .addReg(Reg: Register)
1125 .addImm(Val: PageSizeShift));
1126 GeneratedCode.push_back(x: MCInstBuilder(X86::SHL64ri)
1127 .addReg(Reg: Register)
1128 .addReg(Reg: Register)
1129 .addImm(Val: PageSizeShift));
1130}
1131
1132void generateGetInstructionPointer(unsigned int ResultRegister,
1133 std::vector<MCInst> &GeneratedCode) {
1134 // Use a load effective address to get the current instruction pointer and put
1135 // it into the result register.
1136 GeneratedCode.push_back(x: MCInstBuilder(X86::LEA64r)
1137 .addReg(Reg: ResultRegister)
1138 .addReg(Reg: X86::RIP)
1139 .addImm(Val: 1)
1140 .addReg(Reg: 0)
1141 .addImm(Val: 0)
1142 .addReg(Reg: 0));
1143}
1144
1145void ExegesisX86Target::generateLowerMunmap(
1146 std::vector<MCInst> &GeneratedCode) const {
1147 // Unmap starting at address zero
1148 GeneratedCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, 0)));
1149 // Get the current instruction pointer so we know where to unmap up to.
1150 generateGetInstructionPointer(ResultRegister: X86::RSI, GeneratedCode);
1151 generateRoundToNearestPage(Register: X86::RSI, GeneratedCode);
1152 // Subtract a page from the end of the unmap so we don't unmap the currently
1153 // executing section.
1154 GeneratedCode.push_back(x: MCInstBuilder(X86::SUB64ri32)
1155 .addReg(Reg: X86::RSI)
1156 .addReg(Reg: X86::RSI)
1157 .addImm(Val: getpagesize()));
1158 generateSyscall(SYS_munmap, GeneratedCode);
1159}
1160
1161void ExegesisX86Target::generateUpperMunmap(
1162 std::vector<MCInst> &GeneratedCode) const {
1163 generateGetInstructionPointer(ResultRegister: X86::R8, GeneratedCode);
  // Load the size of the snippet into RDI from the argument register.
1165 GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr)
1166 .addReg(Reg: X86::RDI)
1167 .addReg(Reg: ArgumentRegisters::CodeSize));
  // Add the length of the snippet (in %RDI) to the current instruction pointer
  // (%R8) to get the address at which we should start unmapping.
1170 GeneratedCode.push_back(x: MCInstBuilder(X86::ADD64rr)
1171 .addReg(Reg: X86::RDI)
1172 .addReg(Reg: X86::RDI)
1173 .addReg(Reg: X86::R8));
1174 generateRoundToNearestPage(Register: X86::RDI, GeneratedCode);
  // Add one page to the start address to ensure that we're above the snippet,
  // since the rounding above rounds down.
1177 GeneratedCode.push_back(x: MCInstBuilder(X86::ADD64ri32)
1178 .addReg(Reg: X86::RDI)
1179 .addReg(Reg: X86::RDI)
1180 .addImm(Val: getpagesize()));
1181 // Unmap to just one page under the ceiling of the address space.
1182 GeneratedCode.push_back(x: loadImmediate(
1183 Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, VAddressSpaceCeiling - getpagesize())));
1184 GeneratedCode.push_back(x: MCInstBuilder(X86::SUB64rr)
1185 .addReg(Reg: X86::RSI)
1186 .addReg(Reg: X86::RSI)
1187 .addReg(Reg: X86::RDI));
1188 generateSyscall(SYS_munmap, GeneratedCode);
1189}
1190
1191std::vector<MCInst>
1192ExegesisX86Target::generateExitSyscall(unsigned ExitCode) const {
1193 std::vector<MCInst> ExitCallCode;
1194 ExitCallCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, ExitCode)));
1195 generateSyscall(SYS_exit, GeneratedCode&: ExitCallCode);
1196 return ExitCallCode;
1197}
1198
1199std::vector<MCInst>
1200ExegesisX86Target::generateMmap(uintptr_t Address, size_t Length,
1201 uintptr_t FileDescriptorAddress) const {
1202 std::vector<MCInst> MmapCode;
1203 MmapCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, Address)));
1204 MmapCode.push_back(x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, Length)));
1205 MmapCode.push_back(
1206 x: loadImmediate(Reg: X86::RDX, RegBitWidth: 64, Value: APInt(64, PROT_READ | PROT_WRITE)));
1207 MmapCode.push_back(
1208 x: loadImmediate(Reg: X86::R10, RegBitWidth: 64, Value: APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE)));
1209 // Copy file descriptor location from aux memory into R8
1210 MmapCode.push_back(
1211 x: loadImmediate(Reg: X86::R8, RegBitWidth: 64, Value: APInt(64, FileDescriptorAddress)));
1212 // Dereference file descriptor into FD argument register
1213 MmapCode.push_back(x: MCInstBuilder(X86::MOV32rm)
1214 .addReg(Reg: X86::R8D)
1215 .addReg(Reg: X86::R8)
1216 .addImm(Val: 1)
1217 .addReg(Reg: 0)
1218 .addImm(Val: 0)
1219 .addReg(Reg: 0));
1220 MmapCode.push_back(x: loadImmediate(Reg: X86::R9, RegBitWidth: 64, Value: APInt(64, 0)));
1221 generateSyscall(SYS_mmap, GeneratedCode&: MmapCode);
1222 return MmapCode;
1223}
1224
1225void ExegesisX86Target::generateMmapAuxMem(
1226 std::vector<MCInst> &GeneratedCode) const {
1227 GeneratedCode.push_back(
1228 x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, getAuxiliaryMemoryStartAddress())));
1229 GeneratedCode.push_back(x: loadImmediate(
1230 Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, SubprocessMemory::AuxiliaryMemorySize)));
1231 GeneratedCode.push_back(
1232 x: loadImmediate(Reg: X86::RDX, RegBitWidth: 64, Value: APInt(64, PROT_READ | PROT_WRITE)));
1233 GeneratedCode.push_back(
1234 x: loadImmediate(Reg: X86::R10, RegBitWidth: 64, Value: APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE)));
1235 GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr)
1236 .addReg(Reg: X86::R8)
1237 .addReg(Reg: ArgumentRegisters::AuxiliaryMemoryFD));
1238 GeneratedCode.push_back(x: loadImmediate(Reg: X86::R9, RegBitWidth: 64, Value: APInt(64, 0)));
1239 generateSyscall(SYS_mmap, GeneratedCode);
1240}
1241
1242void ExegesisX86Target::moveArgumentRegisters(
1243 std::vector<MCInst> &GeneratedCode) const {
1244 GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr)
1245 .addReg(Reg: ArgumentRegisters::CodeSize)
1246 .addReg(Reg: X86::RDI));
1247 GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr)
1248 .addReg(Reg: ArgumentRegisters::AuxiliaryMemoryFD)
1249 .addReg(Reg: X86::RSI));
1250}
1251
1252std::vector<MCInst> ExegesisX86Target::generateMemoryInitialSetup() const {
1253 std::vector<MCInst> MemoryInitialSetupCode;
1254 moveArgumentRegisters(GeneratedCode&: MemoryInitialSetupCode);
1255 generateLowerMunmap(GeneratedCode&: MemoryInitialSetupCode);
1256 generateUpperMunmap(GeneratedCode&: MemoryInitialSetupCode);
1257 generateMmapAuxMem(GeneratedCode&: MemoryInitialSetupCode);
1258 return MemoryInitialSetupCode;
1259}
1260
1261std::vector<MCInst> ExegesisX86Target::setStackRegisterToAuxMem() const {
1262 // Moves %rsp to the end of the auxiliary memory
1263 return {MCInstBuilder(X86::MOV64ri)
1264 .addReg(Reg: X86::RSP)
1265 .addImm(Val: getAuxiliaryMemoryStartAddress() +
1266 SubprocessMemory::AuxiliaryMemorySize)};
1267}
1268
1269uintptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const {
  // Return the second-to-last page in the virtual address space to try to
  // prevent interference with memory annotations in the snippet.
1272 return VAddressSpaceCeiling - 2 * getpagesize();
1273}
1274
1275std::vector<MCInst>
1276ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const {
1277 std::vector<MCInst> ConfigurePerfCounterCode;
1278 if (SaveRegisters)
1279 saveSyscallRegisters(GeneratedCode&: ConfigurePerfCounterCode, ArgumentCount: 3);
1280 ConfigurePerfCounterCode.push_back(
1281 x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, getAuxiliaryMemoryStartAddress())));
1282 ConfigurePerfCounterCode.push_back(x: MCInstBuilder(X86::MOV32rm)
1283 .addReg(Reg: X86::EDI)
1284 .addReg(Reg: X86::RDI)
1285 .addImm(Val: 1)
1286 .addReg(Reg: 0)
1287 .addImm(Val: 0)
1288 .addReg(Reg: 0));
1289 ConfigurePerfCounterCode.push_back(
1290 x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, Request)));
1291#ifdef HAVE_LIBPFM
1292 ConfigurePerfCounterCode.push_back(
1293 loadImmediate(X86::RDX, 64, APInt(64, PERF_IOC_FLAG_GROUP)));
1294#endif // HAVE_LIBPFM
1295 generateSyscall(SYS_ioctl, GeneratedCode&: ConfigurePerfCounterCode);
1296 if (SaveRegisters)
1297 restoreSyscallRegisters(GeneratedCode&: ConfigurePerfCounterCode, ArgumentCount: 3);
1298 return ConfigurePerfCounterCode;
1299}
1300
1301std::vector<MCRegister> ExegesisX86Target::getArgumentRegisters() const {
1302 return {X86::RDI, X86::RSI};
1303}
1304
1305std::vector<MCRegister> ExegesisX86Target::getRegistersNeedSaving() const {
1306 return {X86::RAX, X86::RDI, X86::RSI, X86::RCX, X86::R11};
1307}
1308
1309#endif // __linux__
1310
// Instructions can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
1315std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
1316 const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
1317 bool Exploration = false;
1318 SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
1319 VariableChoices.resize(N: Instr.Variables.size());
1320 for (auto I : zip(t: Instr.Variables, u&: VariableChoices)) {
1321 const Variable &Var = std::get<0>(t&: I);
1322 SmallVectorImpl<MCOperand> &Choices = std::get<1>(t&: I);
1323
1324 switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
1325 default:
1326 // We don't wish to explicitly explore this variable.
1327 Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
1328 continue;
1329 case X86::OperandType::OPERAND_COND_CODE: {
1330 Exploration = true;
1331 auto CondCodes = enum_seq_inclusive(Begin: X86::CondCode::COND_O,
1332 End: X86::CondCode::LAST_VALID_COND,
1333 force_iteration_on_noniterable_enum);
1334 Choices.reserve(N: CondCodes.size());
1335 for (int CondCode : CondCodes)
1336 Choices.emplace_back(Args: MCOperand::createImm(Val: CondCode));
1337 break;
1338 }
1339 }
1340 }
1341
1342 // If we don't wish to explore any variables, defer to the baseline method.
1343 if (!Exploration)
1344 return ExegesisTarget::generateInstructionVariants(Instr,
1345 MaxConfigsPerOpcode);
1346
1347 std::vector<InstructionTemplate> Variants;
1348 size_t NumVariants;
1349 CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
1350 VariableChoices);
1351
1352 // How many operand combinations can we produce, within the limit?
1353 NumVariants = std::min(a: G.numCombinations(), b: (size_t)MaxConfigsPerOpcode);
1354 // And actually produce all the wanted operand combinations.
1355 Variants.reserve(n: NumVariants);
1356 G.generate(Callback: [&](ArrayRef<MCOperand> State) -> bool {
1357 Variants.emplace_back(args: &Instr);
1358 Variants.back().setVariableValues(State);
1359 // Did we run out of space for variants?
1360 return Variants.size() >= NumVariants;
1361 });
1362
1363 assert(Variants.size() == NumVariants &&
1364 Variants.size() <= MaxConfigsPerOpcode &&
1365 "Should not produce too many variants");
1366 return Variants;
1367}
1368
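// Returns the singleton ExegesisX86Target instance.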
1369static ExegesisTarget *getTheExegesisX86Target() {
1370 static ExegesisX86Target Target;
1371 return &Target;
1372}
1373
1374void InitializeX86ExegesisTarget() {
1375 ExegesisTarget::registerTarget(T: getTheExegesisX86Target());
1376}
1377
1378} // namespace exegesis
1379} // namespace llvm
1380