//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../MmapUtils.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "../SubprocessMemory.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/TargetParser/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif
#if defined(_MSC_VER) && defined(_M_X64)
#include <float.h> // For _clearfp in ~X86SavedState().
#endif

#ifdef __linux__
#ifdef __x86_64__
#include <asm/prctl.h>
#endif // __x86_64__
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#ifdef HAVE_LIBPFM
#include <perfmon/perf_event.h>
#endif // HAVE_LIBPFM
#endif

#define GET_AVAILABLE_OPCODE_CHECKER
#include "X86GenInstrInfo.inc"

namespace llvm {
namespace exegesis {

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
// - A small value is preferred, but too low a value could result in
//   throttling.
// - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

static cl::opt<bool>
    DisableUpperSSERegisters("x86-disable-upper-sse-registers",
                             cl::desc("Disable XMM8-XMM15 register usage"),
                             cl::cat(BenchmarkOptions), cl::init(false));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.starts_with("POP") && !OpcodeName.starts_with("POPCNT")) ||
      OpcodeName.starts_with("PUSH") ||
      OpcodeName.starts_with("ADJCALLSTACK") ||
      OpcodeName.starts_with("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
  case X86::WRFSBASE:
  case X86::WRFSBASE64:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `GetDestReg` takes the
// addressing base and index registers and returns the LEA destination register.
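// For example, with BaseReg = RSI, IndexReg = RDI, Scale = 4 and Disp = 42,
// the configuration string produced by the formatv() call below is
// "42(%RSI, %RDI, 4)", and one LEA instruction template is emitted per such
// (base, index, scale, displacement) combination, up to
// Opts.MaxConfigsPerOpcode templates in total.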
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }

  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

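// Note: SUB64ri8/ADD64ri8 encode a sign-extended 8-bit immediate, so the
// `Bytes` argument of allocateStackSpace()/releaseStackSpace() below must fit
// in that range. The callers in this file only ever request small scratch
// areas (at most 64 bytes for a 512-bit register, 10 bytes for an x87 value).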
// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
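// A typical use (as in setRegTo() below) looks like:
//   ConstantInliner CI(Value);
//   std::vector<MCInst> Code = CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
// which emits a stack allocation, MOV-immediate stores of the constant, a
// load into the register, and the matching stack release.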
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

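// initStack() materializes the constant on the stack: it allocates `Bytes`
// bytes, then stores the (sign-extended) constant in 4-byte chunks, followed
// by a 2-byte and/or 1-byte chunk for any remainder. For example, the 10-byte
// x87 extended-precision case is written as two 4-byte stores plus one 2-byte
// store.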
void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#if defined(_MSC_VER) && defined(_M_X64)
    _fxsave64(FPState);
    Eflags = __readeflags();
#elif defined(__GNUC__) && defined(__x86_64__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
#else
    report_fatal_error("X86 exegesis running on unsupported target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
#if defined(_MSC_VER) && defined(_M_X64)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
#elif defined(__GNUC__) && defined(__x86_64__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
#else
    report_fatal_error("X86 exegesis running on unsupported target");
#endif
  }

private:
#if defined(__x86_64__) || defined(_M_X64)
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target()
      : ExegesisTarget(X86CpuPfmCounters, X86_MC::isOpcodeAvailable) {}

  Expected<std::unique_ptr<pfm::CounterGroup>>
  createCounter(StringRef CounterName, const LLVMState &State,
                ArrayRef<const char *> ValidationCounters,
                const pid_t ProcessID) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or
      // __linux__ (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \
    defined(__linux__)
      // TODO(boomanaiden154): Add in support for using validation counters
      // when using LBR counters.
      if (ValidationCounters.size() > 0)
        return make_error<StringError>(
            "Using LBR is not currently supported with validation counters",
            errc::invalid_argument);

      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return make_error<StringError>(
          "LBR counter requested without HAVE_LIBPFM, "
          "LIBPFM_HAS_FIELD_CYCLES, or running on Linux.",
          errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State, ValidationCounters,
                                         ProcessID);
  }

  enum ArgumentRegisters { CodeSize = X86::R12, AuxiliaryMemoryFD = X86::R13 };

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getDefaultLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII,
                                   unsigned LoopRegister) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

#ifdef __linux__
  void generateLowerMunmap(std::vector<MCInst> &GeneratedCode) const override;

  void generateUpperMunmap(std::vector<MCInst> &GeneratedCode) const override;

  std::vector<MCInst> generateExitSyscall(unsigned ExitCode) const override;

  std::vector<MCInst>
  generateMmap(intptr_t Address, size_t Length,
               intptr_t FileDescriptorAddress) const override;

  void generateMmapAuxMem(std::vector<MCInst> &GeneratedCode) const override;

  void moveArgumentRegisters(std::vector<MCInst> &GeneratedCode) const override;

  std::vector<MCInst> generateMemoryInitialSetup() const override;

  std::vector<MCInst> setStackRegisterToAuxMem() const override;

  intptr_t getAuxiliaryMemoryStartAddress() const override;

  std::vector<MCInst> configurePerfCounter(long Request,
                                           bool SaveRegisters) const override;

  std::vector<unsigned> getArgumentRegisters() const override;

  std::vector<unsigned> getRegistersNeedSaving() const override;
#endif // __linux__

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    if (DisableUpperSSERegisters)
      return ArrayRef(kUnavailableRegistersSSE);

    return ArrayRef(kUnavailableRegisters);
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now, so if LBR is not
    // requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) && \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine, because the
    // counter uses some Intel-specific magic and it could get confused and
    // think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    report_fatal_error("Running X86 exegesis on unsupported target");
#endif
#endif
    return make_error<StringError>(
        "LBR not supported on this kernel and/or platform",
        errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
  static const unsigned kUnavailableRegistersSSE[12];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// Optionally, also disable the upper (x86_64) SSE registers to reduce frontend
// decoder load.
const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
    X86::AH,    X86::BH,    X86::CH,    X86::DH,    X86::XMM8,  X86::XMM9,
    X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kDefaultLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

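// The scratch memory pointer is handed to the generated snippet in the first
// integer argument register of the platform's C calling convention: RCX on
// Windows x64 and RDI on System V targets.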
unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned
ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kDefaultLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_COND_CODE:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::CondCode::LAST_VALID_COND));
    return Error::success();
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

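// The loop tail built below expands to `add $-1, %LoopRegister; jne TargetMBB`:
// the ADD updates ZF, and the JNE keeps branching back to the loop body until
// the counter reaches zero.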
void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII, unsigned LoopRegister) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(LoopRegister)
      .addUse(LoopRegister)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

void generateRegisterStackPush(unsigned int Register,
                               std::vector<MCInst> &GeneratedCode) {
  GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register));
}

void generateRegisterStackPop(unsigned int Register,
                              std::vector<MCInst> &GeneratedCode) {
  GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register));
}

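// x86-64 Linux syscall ABI refresher: the syscall number goes in RAX, the
// arguments go in RDI, RSI, RDX, R10, R8 and R9, the return value comes back
// in RAX, and the kernel clobbers RCX and R11.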
void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) {
  GeneratedCode.push_back(
      loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber)));
  GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL));
}

// The functions below for saving and restoring system call registers are only
// used when llvm-exegesis is built on Linux.
#ifdef __linux__
constexpr std::array<unsigned, 6> SyscallArgumentRegisters{
    X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9};

static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode,
                                 unsigned ArgumentCount) {
  assert(ArgumentCount <= 6 &&
         "System calls on x86-64 Linux can take at most six arguments");
  // Preserve RCX and R11 (clobbered by the system call).
  generateRegisterStackPush(X86::RCX, GeneratedCode);
  generateRegisterStackPush(X86::R11, GeneratedCode);
  // Preserve RAX (used for the syscall number/return value).
  generateRegisterStackPush(X86::RAX, GeneratedCode);
  // Preserve the registers used to pass arguments to the system call.
  for (unsigned I = 0; I < ArgumentCount; ++I)
    generateRegisterStackPush(SyscallArgumentRegisters[I], GeneratedCode);
}

static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode,
                                    unsigned ArgumentCount) {
  assert(ArgumentCount <= 6 &&
         "System calls on x86-64 Linux can take at most six arguments");
  // Restore the argument registers, in the opposite order of the way they are
  // saved.
  for (unsigned I = ArgumentCount; I > 0; --I) {
    generateRegisterStackPop(SyscallArgumentRegisters[I - 1], GeneratedCode);
  }
  generateRegisterStackPop(X86::RAX, GeneratedCode);
  generateRegisterStackPop(X86::R11, GeneratedCode);
  generateRegisterStackPop(X86::RCX, GeneratedCode);
}
#endif // __linux__

static std::vector<MCInst> loadImmediateSegmentRegister(unsigned Reg,
                                                        const APInt &Value) {
#if defined(__x86_64__) && defined(__linux__)
  assert(Value.getBitWidth() <= 64 && "Value must fit in the register.");
  std::vector<MCInst> loadSegmentRegisterCode;
  // Preserve the syscall registers here as we don't
  // want to make any assumptions about the ordering of what registers are
  // loaded in first, and we might have already loaded in registers that we are
  // going to be clobbering here.
  saveSyscallRegisters(loadSegmentRegisterCode, 2);
  // Generate the instructions to make the arch_prctl system call to set
  // the registers.
  int SyscallCode = 0;
  if (Reg == X86::FS)
    SyscallCode = ARCH_SET_FS;
  else if (Reg == X86::GS)
    SyscallCode = ARCH_SET_GS;
  else
    llvm_unreachable("Only the segment registers GS and FS are supported");
  loadSegmentRegisterCode.push_back(
      loadImmediate(X86::RDI, 64, APInt(64, SyscallCode)));
  loadSegmentRegisterCode.push_back(loadImmediate(X86::RSI, 64, Value));
  generateSyscall(SYS_arch_prctl, loadSegmentRegisterCode);
  // Restore the registers in reverse order.
  restoreSyscallRegisters(loadSegmentRegisterCode, 2);
  return loadSegmentRegisterCode;
#else
  llvm_unreachable("Loading immediate segment registers is only supported "
                   "with x86-64 llvm-exegesis");
#endif // defined(__x86_64__) && defined(__linux__)
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::SEGMENT_REGRegClass.contains(Reg))
    return loadImmediateSegmentRegister(Reg, Value);
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) ||
      X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) {
    switch (Value.getBitWidth()) {
    case 8:
      if (STI.getFeatureBits()[X86::FeatureDQI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVBkm);
      }
      [[fallthrough]];
    case 16:
      if (STI.getFeatureBits()[X86::FeatureAVX512]) {
        ConstantInliner CI(Value.zextOrTrunc(16));
        return CI.loadAndFinalize(Reg, 16, X86::KMOVWkm);
      }
      break;
    case 32:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVDkm);
      }
      break;
    case 64:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVQkm);
      }
      break;
    }
  }
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

#ifdef __linux__

#ifdef __arm__
static constexpr const intptr_t VAddressSpaceCeiling = 0xC0000000;
#else
static constexpr const intptr_t VAddressSpaceCeiling = 0x0000800000000000;
#endif
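// These ceilings match the usual upper bound of user-space virtual addresses:
// 0xC0000000 is the traditional 3 GiB user/kernel split on 32-bit ARM Linux,
// and 0x0000800000000000 (2^47) is the end of the canonical lower half of the
// x86-64 address space with 48-bit virtual addresses.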

void generateRoundToNearestPage(unsigned int Register,
                                std::vector<MCInst> &GeneratedCode) {
  int PageSizeShift = static_cast<int>(round(log2(getpagesize())));
  // Round down to the nearest page by getting rid of the least significant
  // bits representing location in the page. Shift right to get rid of this
  // info and then shift back left.
  GeneratedCode.push_back(MCInstBuilder(X86::SHR64ri)
                              .addReg(Register)
                              .addReg(Register)
                              .addImm(PageSizeShift));
  GeneratedCode.push_back(MCInstBuilder(X86::SHL64ri)
                              .addReg(Register)
                              .addReg(Register)
                              .addImm(PageSizeShift));
}

void generateGetInstructionPointer(unsigned int ResultRegister,
                                   std::vector<MCInst> &GeneratedCode) {
  // Use a load effective address to get the current instruction pointer and
  // put it into the result register.
  GeneratedCode.push_back(MCInstBuilder(X86::LEA64r)
                              .addReg(ResultRegister)
                              .addReg(X86::RIP)
                              .addImm(1)
                              .addReg(0)
                              .addImm(0)
                              .addReg(0));
}

void ExegesisX86Target::generateLowerMunmap(
    std::vector<MCInst> &GeneratedCode) const {
  // Unmap starting at address zero.
  GeneratedCode.push_back(loadImmediate(X86::RDI, 64, APInt(64, 0)));
  // Get the current instruction pointer so we know where to unmap up to.
  generateGetInstructionPointer(X86::RSI, GeneratedCode);
  generateRoundToNearestPage(X86::RSI, GeneratedCode);
  // Subtract a page from the end of the unmap so we don't unmap the currently
  // executing section.
  GeneratedCode.push_back(MCInstBuilder(X86::SUB64ri32)
                              .addReg(X86::RSI)
                              .addReg(X86::RSI)
                              .addImm(getpagesize()));
  generateSyscall(SYS_munmap, GeneratedCode);
}

void ExegesisX86Target::generateUpperMunmap(
    std::vector<MCInst> &GeneratedCode) const {
  generateGetInstructionPointer(X86::R8, GeneratedCode);
  // Load the size of the snippet into RDI from the argument register.
  GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
                              .addReg(X86::RDI)
                              .addReg(ArgumentRegisters::CodeSize));
  // Add the length of the snippet (in %RDI) to the current instruction pointer
  // (%R8) to get the address at which we should start unmapping.
  GeneratedCode.push_back(MCInstBuilder(X86::ADD64rr)
                              .addReg(X86::RDI)
                              .addReg(X86::RDI)
                              .addReg(X86::R8));
  generateRoundToNearestPage(X86::RDI, GeneratedCode);
  // Add one page to the start address to ensure that we're above the snippet,
  // since the function above rounds down.
  GeneratedCode.push_back(MCInstBuilder(X86::ADD64ri32)
                              .addReg(X86::RDI)
                              .addReg(X86::RDI)
                              .addImm(getpagesize()));
  // Unmap to just one page under the ceiling of the address space.
  GeneratedCode.push_back(loadImmediate(
      X86::RSI, 64, APInt(64, VAddressSpaceCeiling - getpagesize())));
  GeneratedCode.push_back(MCInstBuilder(X86::SUB64rr)
                              .addReg(X86::RSI)
                              .addReg(X86::RSI)
                              .addReg(X86::RDI));
  generateSyscall(SYS_munmap, GeneratedCode);
}

std::vector<MCInst>
ExegesisX86Target::generateExitSyscall(unsigned ExitCode) const {
  std::vector<MCInst> ExitCallCode;
  ExitCallCode.push_back(loadImmediate(X86::RDI, 64, APInt(64, ExitCode)));
  generateSyscall(SYS_exit, ExitCallCode);
  return ExitCallCode;
}

std::vector<MCInst>
ExegesisX86Target::generateMmap(intptr_t Address, size_t Length,
                                intptr_t FileDescriptorAddress) const {
  std::vector<MCInst> MmapCode;
  MmapCode.push_back(loadImmediate(X86::RDI, 64, APInt(64, Address)));
  MmapCode.push_back(loadImmediate(X86::RSI, 64, APInt(64, Length)));
  MmapCode.push_back(
      loadImmediate(X86::RDX, 64, APInt(64, PROT_READ | PROT_WRITE)));
  MmapCode.push_back(
      loadImmediate(X86::R10, 64, APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE)));
  // Copy file descriptor location from aux memory into R8
  MmapCode.push_back(
      loadImmediate(X86::R8, 64, APInt(64, FileDescriptorAddress)));
  // Dereference file descriptor into FD argument register
  MmapCode.push_back(MCInstBuilder(X86::MOV32rm)
                         .addReg(X86::R8D)
                         .addReg(X86::R8)
                         .addImm(1)
                         .addReg(0)
                         .addImm(0)
                         .addReg(0));
  MmapCode.push_back(loadImmediate(X86::R9, 64, APInt(64, 0)));
  generateSyscall(SYS_mmap, MmapCode);
  return MmapCode;
}

void ExegesisX86Target::generateMmapAuxMem(
    std::vector<MCInst> &GeneratedCode) const {
  GeneratedCode.push_back(
      loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress())));
  GeneratedCode.push_back(loadImmediate(
      X86::RSI, 64, APInt(64, SubprocessMemory::AuxiliaryMemorySize)));
  GeneratedCode.push_back(
      loadImmediate(X86::RDX, 64, APInt(64, PROT_READ | PROT_WRITE)));
  GeneratedCode.push_back(
      loadImmediate(X86::R10, 64, APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE)));
  GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
                              .addReg(X86::R8)
                              .addReg(ArgumentRegisters::AuxiliaryMemoryFD));
  GeneratedCode.push_back(loadImmediate(X86::R9, 64, APInt(64, 0)));
  generateSyscall(SYS_mmap, GeneratedCode);
}

void ExegesisX86Target::moveArgumentRegisters(
    std::vector<MCInst> &GeneratedCode) const {
  GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
                              .addReg(ArgumentRegisters::CodeSize)
                              .addReg(X86::RDI));
  GeneratedCode.push_back(MCInstBuilder(X86::MOV64rr)
                              .addReg(ArgumentRegisters::AuxiliaryMemoryFD)
                              .addReg(X86::RSI));
}

std::vector<MCInst> ExegesisX86Target::generateMemoryInitialSetup() const {
  std::vector<MCInst> MemoryInitialSetupCode;
  moveArgumentRegisters(MemoryInitialSetupCode);
  generateLowerMunmap(MemoryInitialSetupCode);
  generateUpperMunmap(MemoryInitialSetupCode);
  generateMmapAuxMem(MemoryInitialSetupCode);
  return MemoryInitialSetupCode;
}

std::vector<MCInst> ExegesisX86Target::setStackRegisterToAuxMem() const {
  // Moves %rsp to the end of the auxiliary memory
  return {MCInstBuilder(X86::MOV64ri)
              .addReg(X86::RSP)
              .addImm(getAuxiliaryMemoryStartAddress() +
                      SubprocessMemory::AuxiliaryMemorySize)};
}

intptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const {
  // Return the second to last page in the virtual address space to try and
  // prevent interference with memory annotations in the snippet
  return VAddressSpaceCeiling - 2 * getpagesize();
}

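// configurePerfCounter() below emits the equivalent of
// `ioctl(*(int *)AuxMemStart, Request, PERF_IOC_FLAG_GROUP)`: the perf event
// file descriptor is read from the first word of the auxiliary memory into
// EDI, the request code is loaded into RSI, and (when libpfm is available)
// the group flag is loaded into RDX before issuing the ioctl syscall.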
std::vector<MCInst>
ExegesisX86Target::configurePerfCounter(long Request,
                                        bool SaveRegisters) const {
  std::vector<MCInst> ConfigurePerfCounterCode;
  if (SaveRegisters)
    saveSyscallRegisters(ConfigurePerfCounterCode, 3);
  ConfigurePerfCounterCode.push_back(
      loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress())));
  ConfigurePerfCounterCode.push_back(MCInstBuilder(X86::MOV32rm)
                                         .addReg(X86::EDI)
                                         .addReg(X86::RDI)
                                         .addImm(1)
                                         .addReg(0)
                                         .addImm(0)
                                         .addReg(0));
  ConfigurePerfCounterCode.push_back(
      loadImmediate(X86::RSI, 64, APInt(64, Request)));
#ifdef HAVE_LIBPFM
  ConfigurePerfCounterCode.push_back(
      loadImmediate(X86::RDX, 64, APInt(64, PERF_IOC_FLAG_GROUP)));
#endif // HAVE_LIBPFM
  generateSyscall(SYS_ioctl, ConfigurePerfCounterCode);
  if (SaveRegisters)
    restoreSyscallRegisters(ConfigurePerfCounterCode, 3);
  return ConfigurePerfCounterCode;
}

std::vector<unsigned> ExegesisX86Target::getArgumentRegisters() const {
  return {X86::RDI, X86::RSI};
}

std::vector<unsigned> ExegesisX86Target::getRegistersNeedSaving() const {
  return {X86::RAX, X86::RDI, X86::RSI, X86::RCX, X86::R11};
}

#endif // __linux__

// Instructions can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
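// For example, for an opcode with a condition-code operand (e.g. a
// conditional move), that operand is expanded into one choice per condition
// code (COND_O .. LAST_VALID_COND), so each condition becomes its own
// instruction variant, capped at MaxConfigsPerOpcode variants.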
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm