1 | //===-- Target.cpp ----------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | #include "../Target.h" |
9 | |
10 | #include "../Error.h" |
11 | #include "../MmapUtils.h" |
12 | #include "../ParallelSnippetGenerator.h" |
13 | #include "../SerialSnippetGenerator.h" |
14 | #include "../SnippetGenerator.h" |
15 | #include "../SubprocessMemory.h" |
16 | #include "MCTargetDesc/X86BaseInfo.h" |
17 | #include "MCTargetDesc/X86MCTargetDesc.h" |
18 | #include "X86.h" |
19 | #include "X86Counter.h" |
20 | #include "X86RegisterInfo.h" |
21 | #include "llvm/ADT/Sequence.h" |
22 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
23 | #include "llvm/MC/MCInstBuilder.h" |
24 | #include "llvm/Support/Errc.h" |
25 | #include "llvm/Support/Error.h" |
26 | #include "llvm/Support/ErrorHandling.h" |
27 | #include "llvm/Support/FormatVariadic.h" |
28 | #include "llvm/TargetParser/Host.h" |
29 | |
30 | #include <memory> |
31 | #include <string> |
32 | #include <vector> |
33 | #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) |
34 | #include <immintrin.h> |
35 | #include <intrin.h> |
36 | #endif |
37 | #if defined(_MSC_VER) && defined(_M_X64) |
38 | #include <float.h> // For _clearfp in ~X86SavedState(). |
39 | #endif |
40 | |
41 | #ifdef __linux__ |
42 | #ifdef __x86_64__ |
43 | #include <asm/prctl.h> |
44 | #endif // __x86_64__ |
45 | #include <sys/mman.h> |
46 | #include <sys/syscall.h> |
47 | #include <unistd.h> |
48 | #ifdef HAVE_LIBPFM |
49 | #include <perfmon/perf_event.h> |
50 | #endif // HAVE_LIBPFM |
51 | #endif |
52 | |
53 | #define GET_AVAILABLE_OPCODE_CHECKER |
54 | #include "X86GenInstrInfo.inc" |
55 | |
56 | namespace llvm { |
57 | namespace exegesis { |
58 | |
59 | // If a positive value is specified, we are going to use the LBR in |
60 | // latency-mode. |
61 | // |
62 | // Note: |
63 | // - A small value is preferred, but too low a value could result in |
64 | // throttling. |
65 | // - A prime number is preferred to avoid always skipping certain blocks. |
66 | // |
67 | static cl::opt<unsigned> LbrSamplingPeriod( |
68 | "x86-lbr-sample-period" , |
69 | cl::desc("The sample period (nbranches/sample), used for LBR sampling" ), |
70 | cl::cat(BenchmarkOptions), cl::init(Val: 0)); |
71 | |
static cl::opt<bool> DisableUpperSSERegisters(
    "x86-disable-upper-sse-registers",
    cl::desc("Disable XMM8-XMM15 register usage"),
    cl::cat(BenchmarkOptions), cl::init(false));
76 | |
// FIXME: Validate that repetition-mode is loop if LBR is requested.
78 | |
79 | // Returns a non-null reason if we cannot handle the memory references in this |
80 | // instruction. |
81 | static const char *isInvalidMemoryInstr(const Instruction &Instr) { |
82 | switch (Instr.Description.TSFlags & X86II::FormMask) { |
83 | default: |
84 | return "Unknown FormMask value" ; |
85 | // These have no memory access. |
86 | case X86II::Pseudo: |
87 | case X86II::RawFrm: |
88 | case X86II::AddCCFrm: |
89 | case X86II::PrefixByte: |
90 | case X86II::MRMDestReg: |
91 | case X86II::MRMSrcReg: |
92 | case X86II::MRMSrcReg4VOp3: |
93 | case X86II::MRMSrcRegOp4: |
94 | case X86II::MRMSrcRegCC: |
95 | case X86II::MRMXrCC: |
96 | case X86II::MRMr0: |
97 | case X86II::MRMXr: |
98 | case X86II::MRM0r: |
99 | case X86II::MRM1r: |
100 | case X86II::MRM2r: |
101 | case X86II::MRM3r: |
102 | case X86II::MRM4r: |
103 | case X86II::MRM5r: |
104 | case X86II::MRM6r: |
105 | case X86II::MRM7r: |
106 | case X86II::MRM0X: |
107 | case X86II::MRM1X: |
108 | case X86II::MRM2X: |
109 | case X86II::MRM3X: |
110 | case X86II::MRM4X: |
111 | case X86II::MRM5X: |
112 | case X86II::MRM6X: |
113 | case X86II::MRM7X: |
114 | case X86II::MRM_C0: |
115 | case X86II::MRM_C1: |
116 | case X86II::MRM_C2: |
117 | case X86II::MRM_C3: |
118 | case X86II::MRM_C4: |
119 | case X86II::MRM_C5: |
120 | case X86II::MRM_C6: |
121 | case X86II::MRM_C7: |
122 | case X86II::MRM_C8: |
123 | case X86II::MRM_C9: |
124 | case X86II::MRM_CA: |
125 | case X86II::MRM_CB: |
126 | case X86II::MRM_CC: |
127 | case X86II::MRM_CD: |
128 | case X86II::MRM_CE: |
129 | case X86II::MRM_CF: |
130 | case X86II::MRM_D0: |
131 | case X86II::MRM_D1: |
132 | case X86II::MRM_D2: |
133 | case X86II::MRM_D3: |
134 | case X86II::MRM_D4: |
135 | case X86II::MRM_D5: |
136 | case X86II::MRM_D6: |
137 | case X86II::MRM_D7: |
138 | case X86II::MRM_D8: |
139 | case X86II::MRM_D9: |
140 | case X86II::MRM_DA: |
141 | case X86II::MRM_DB: |
142 | case X86II::MRM_DC: |
143 | case X86II::MRM_DD: |
144 | case X86II::MRM_DE: |
145 | case X86II::MRM_DF: |
146 | case X86II::MRM_E0: |
147 | case X86II::MRM_E1: |
148 | case X86II::MRM_E2: |
149 | case X86II::MRM_E3: |
150 | case X86II::MRM_E4: |
151 | case X86II::MRM_E5: |
152 | case X86II::MRM_E6: |
153 | case X86II::MRM_E7: |
154 | case X86II::MRM_E8: |
155 | case X86II::MRM_E9: |
156 | case X86II::MRM_EA: |
157 | case X86II::MRM_EB: |
158 | case X86II::MRM_EC: |
159 | case X86II::MRM_ED: |
160 | case X86II::MRM_EE: |
161 | case X86II::MRM_EF: |
162 | case X86II::MRM_F0: |
163 | case X86II::MRM_F1: |
164 | case X86II::MRM_F2: |
165 | case X86II::MRM_F3: |
166 | case X86II::MRM_F4: |
167 | case X86II::MRM_F5: |
168 | case X86II::MRM_F6: |
169 | case X86II::MRM_F7: |
170 | case X86II::MRM_F8: |
171 | case X86II::MRM_F9: |
172 | case X86II::MRM_FA: |
173 | case X86II::MRM_FB: |
174 | case X86II::MRM_FC: |
175 | case X86II::MRM_FD: |
176 | case X86II::MRM_FE: |
177 | case X86II::MRM_FF: |
178 | case X86II::RawFrmImm8: |
179 | return nullptr; |
180 | case X86II::AddRegFrm: |
181 | return (Instr.Description.Opcode == X86::POP16r || |
182 | Instr.Description.Opcode == X86::POP32r || |
183 | Instr.Description.Opcode == X86::PUSH16r || |
184 | Instr.Description.Opcode == X86::PUSH32r) |
185 | ? "unsupported opcode: unsupported memory access" |
186 | : nullptr; |
187 | // These access memory and are handled. |
188 | case X86II::MRMDestMem: |
189 | case X86II::MRMSrcMem: |
190 | case X86II::MRMSrcMem4VOp3: |
191 | case X86II::MRMSrcMemOp4: |
192 | case X86II::MRMSrcMemCC: |
193 | case X86II::MRMXmCC: |
194 | case X86II::MRMXm: |
195 | case X86II::MRM0m: |
196 | case X86II::MRM1m: |
197 | case X86II::MRM2m: |
198 | case X86II::MRM3m: |
199 | case X86II::MRM4m: |
200 | case X86II::MRM5m: |
201 | case X86II::MRM6m: |
202 | case X86II::MRM7m: |
203 | return nullptr; |
204 | // These access memory and are not handled yet. |
205 | case X86II::RawFrmImm16: |
206 | case X86II::RawFrmMemOffs: |
207 | case X86II::RawFrmSrc: |
208 | case X86II::RawFrmDst: |
209 | case X86II::RawFrmDstSrc: |
210 | return "unsupported opcode: non uniform memory access" ; |
211 | } |
212 | } |
213 | |
// If the opcode is invalid, returns a pointer to a string literal indicating
// the reason. nullptr indicates a valid opcode.
216 | static const char *isInvalidOpcode(const Instruction &Instr) { |
217 | const auto OpcodeName = Instr.Name; |
218 | if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo) |
219 | return "unsupported opcode: pseudo instruction" ; |
220 | if ((OpcodeName.starts_with(Prefix: "POP" ) && !OpcodeName.starts_with(Prefix: "POPCNT" )) || |
221 | OpcodeName.starts_with(Prefix: "PUSH" ) || |
222 | OpcodeName.starts_with(Prefix: "ADJCALLSTACK" ) || OpcodeName.starts_with(Prefix: "LEAVE" )) |
223 | return "unsupported opcode: Push/Pop/AdjCallStack/Leave" ; |
224 | switch (Instr.Description.Opcode) { |
225 | case X86::LFS16rm: |
226 | case X86::LFS32rm: |
227 | case X86::LFS64rm: |
228 | case X86::LGS16rm: |
229 | case X86::LGS32rm: |
230 | case X86::LGS64rm: |
231 | case X86::LSS16rm: |
232 | case X86::LSS32rm: |
233 | case X86::LSS64rm: |
234 | case X86::SYSENTER: |
235 | case X86::WRFSBASE: |
236 | case X86::WRFSBASE64: |
237 | return "unsupported opcode" ; |
238 | default: |
239 | break; |
240 | } |
241 | if (const auto reason = isInvalidMemoryInstr(Instr)) |
242 | return reason; |
243 | // We do not handle instructions with OPERAND_PCREL. |
244 | for (const Operand &Op : Instr.Operands) |
245 | if (Op.isExplicit() && |
246 | Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL) |
247 | return "unsupported opcode: PC relative operand" ; |
248 | // We do not handle second-form X87 instructions. We only handle first-form |
249 | // ones (_Fp), see comment in X86InstrFPStack.td. |
250 | for (const Operand &Op : Instr.Operands) |
251 | if (Op.isReg() && Op.isExplicit() && |
252 | Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID) |
253 | return "unsupported second-form X87 instruction" ; |
254 | return nullptr; |
255 | } |
256 | |
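// Extracts the x87 floating-point format bits (NotFP, OneArgFP, TwoArgFP,
// etc.) from the instruction's TSFlags.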
257 | static unsigned getX86FPFlags(const Instruction &Instr) { |
258 | return Instr.Description.TSFlags & X86II::FPTypeMask; |
259 | } |
260 | |
261 | // Helper to fill a memory operand with a value. |
262 | static void setMemOp(InstructionTemplate &IT, int OpIdx, |
263 | const MCOperand &OpVal) { |
264 | const auto Op = IT.getInstr().Operands[OpIdx]; |
265 | assert(Op.isExplicit() && "invalid memory pattern" ); |
266 | IT.getValueFor(Op) = OpVal; |
267 | } |
268 | |
// Common (latency, uops) code for LEA templates. `RestrictDestRegs` narrows
// the candidate destination registers given the addressing base and index
// registers.
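// For example, with base %RAX, index %RCX, scale 4 and displacement 42, the
// resulting Config string is "42(%RAX, %RCX, 4)".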
271 | static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon( |
272 | const Instruction &Instr, const BitVector &ForbiddenRegisters, |
273 | const LLVMState &State, const SnippetGenerator::Options &Opts, |
274 | std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)> |
275 | RestrictDestRegs) { |
276 | assert(Instr.Operands.size() == 6 && "invalid LEA" ); |
277 | assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 && |
278 | "invalid LEA" ); |
279 | |
280 | constexpr const int kDestOp = 0; |
281 | constexpr const int kBaseOp = 1; |
282 | constexpr const int kIndexOp = 3; |
283 | auto PossibleDestRegs = |
284 | Instr.Operands[kDestOp].getRegisterAliasing().sourceBits(); |
285 | remove(A&: PossibleDestRegs, B: ForbiddenRegisters); |
286 | auto PossibleBaseRegs = |
287 | Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits(); |
288 | remove(A&: PossibleBaseRegs, B: ForbiddenRegisters); |
289 | auto PossibleIndexRegs = |
290 | Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits(); |
291 | remove(A&: PossibleIndexRegs, B: ForbiddenRegisters); |
292 | |
293 | const auto &RegInfo = State.getRegInfo(); |
294 | std::vector<CodeTemplate> Result; |
295 | for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) { |
296 | for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) { |
297 | for (int LogScale = 0; LogScale <= 3; ++LogScale) { |
298 | // FIXME: Add an option for controlling how we explore immediates. |
299 | for (const int Disp : {0, 42}) { |
300 | InstructionTemplate IT(&Instr); |
301 | const int64_t Scale = 1ull << LogScale; |
302 | setMemOp(IT, OpIdx: 1, OpVal: MCOperand::createReg(Reg: BaseReg)); |
303 | setMemOp(IT, OpIdx: 2, OpVal: MCOperand::createImm(Val: Scale)); |
304 | setMemOp(IT, OpIdx: 3, OpVal: MCOperand::createReg(Reg: IndexReg)); |
305 | setMemOp(IT, OpIdx: 4, OpVal: MCOperand::createImm(Val: Disp)); |
306 | // SegmentReg must be 0 for LEA. |
307 | setMemOp(IT, OpIdx: 5, OpVal: MCOperand::createReg(Reg: 0)); |
308 | |
309 | // Output reg candidates are selected by the caller. |
310 | auto PossibleDestRegsNow = PossibleDestRegs; |
311 | RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow); |
312 | assert(PossibleDestRegsNow.set_bits().begin() != |
313 | PossibleDestRegsNow.set_bits().end() && |
314 | "no remaining registers" ); |
315 | setMemOp( |
316 | IT, OpIdx: 0, |
317 | OpVal: MCOperand::createReg(Reg: *PossibleDestRegsNow.set_bits().begin())); |
318 | |
319 | CodeTemplate CT; |
320 | CT.Instructions.push_back(x: std::move(IT)); |
321 | CT.Config = formatv(Fmt: "{3}(%{0}, %{1}, {2})" , Vals: RegInfo.getName(RegNo: BaseReg), |
322 | Vals: RegInfo.getName(RegNo: IndexReg), Vals: Scale, Vals: Disp) |
323 | .str(); |
324 | Result.push_back(x: std::move(CT)); |
325 | if (Result.size() >= Opts.MaxConfigsPerOpcode) |
326 | return std::move(Result); |
327 | } |
328 | } |
329 | } |
330 | } |
331 | |
332 | return std::move(Result); |
333 | } |
334 | |
335 | namespace { |
336 | class X86SerialSnippetGenerator : public SerialSnippetGenerator { |
337 | public: |
338 | using SerialSnippetGenerator::SerialSnippetGenerator; |
339 | |
340 | Expected<std::vector<CodeTemplate>> |
341 | generateCodeTemplates(InstructionTemplate Variant, |
342 | const BitVector &ForbiddenRegisters) const override; |
343 | }; |
344 | } // namespace |
345 | |
346 | Expected<std::vector<CodeTemplate>> |
347 | X86SerialSnippetGenerator::generateCodeTemplates( |
348 | InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { |
349 | const Instruction &Instr = Variant.getInstr(); |
350 | |
351 | if (const auto reason = isInvalidOpcode(Instr)) |
352 | return make_error<Failure>(Args: reason); |
353 | |
354 | // LEA gets special attention. |
355 | const auto Opcode = Instr.Description.getOpcode(); |
356 | if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) { |
357 | return generateLEATemplatesCommon( |
358 | Instr, ForbiddenRegisters, State, Opts, |
359 | RestrictDestRegs: [this](unsigned BaseReg, unsigned IndexReg, |
360 | BitVector &CandidateDestRegs) { |
361 | // We just select a destination register that aliases the base |
362 | // register. |
363 | CandidateDestRegs &= |
364 | State.getRATC().getRegister(Reg: BaseReg).aliasedBits(); |
365 | }); |
366 | } |
367 | |
368 | if (Instr.hasMemoryOperands()) |
369 | return make_error<Failure>( |
370 | Args: "unsupported memory operand in latency measurements" ); |
371 | |
372 | switch (getX86FPFlags(Instr)) { |
373 | case X86II::NotFP: |
374 | return SerialSnippetGenerator::generateCodeTemplates(Variant, |
375 | ForbiddenRegisters); |
376 | case X86II::ZeroArgFP: |
377 | case X86II::OneArgFP: |
378 | case X86II::SpecialFP: |
379 | case X86II::CompareFP: |
380 | case X86II::CondMovFP: |
381 | return make_error<Failure>(Args: "Unsupported x87 Instruction" ); |
382 | case X86II::OneArgFPRW: |
383 | case X86II::TwoArgFP: |
384 | // These are instructions like |
385 | // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) |
386 | // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) |
387 | // They are intrinsically serial and do not modify the state of the stack. |
388 | return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters); |
389 | default: |
390 | llvm_unreachable("Unknown FP Type!" ); |
391 | } |
392 | } |
393 | |
394 | namespace { |
395 | class X86ParallelSnippetGenerator : public ParallelSnippetGenerator { |
396 | public: |
397 | using ParallelSnippetGenerator::ParallelSnippetGenerator; |
398 | |
399 | Expected<std::vector<CodeTemplate>> |
400 | generateCodeTemplates(InstructionTemplate Variant, |
401 | const BitVector &ForbiddenRegisters) const override; |
402 | }; |
403 | |
404 | } // namespace |
405 | |
406 | Expected<std::vector<CodeTemplate>> |
407 | X86ParallelSnippetGenerator::generateCodeTemplates( |
408 | InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { |
409 | const Instruction &Instr = Variant.getInstr(); |
410 | |
411 | if (const auto reason = isInvalidOpcode(Instr)) |
412 | return make_error<Failure>(Args: reason); |
413 | |
414 | // LEA gets special attention. |
415 | const auto Opcode = Instr.Description.getOpcode(); |
416 | if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) { |
417 | return generateLEATemplatesCommon( |
418 | Instr, ForbiddenRegisters, State, Opts, |
419 | RestrictDestRegs: [this](unsigned BaseReg, unsigned IndexReg, |
420 | BitVector &CandidateDestRegs) { |
421 | // Any destination register that is not used for addressing is fine. |
422 | remove(A&: CandidateDestRegs, |
423 | B: State.getRATC().getRegister(Reg: BaseReg).aliasedBits()); |
424 | remove(A&: CandidateDestRegs, |
425 | B: State.getRATC().getRegister(Reg: IndexReg).aliasedBits()); |
426 | }); |
427 | } |
428 | |
429 | switch (getX86FPFlags(Instr)) { |
430 | case X86II::NotFP: |
431 | return ParallelSnippetGenerator::generateCodeTemplates(Variant, |
432 | ForbiddenRegisters); |
433 | case X86II::ZeroArgFP: |
434 | case X86II::OneArgFP: |
435 | case X86II::SpecialFP: |
436 | return make_error<Failure>(Args: "Unsupported x87 Instruction" ); |
437 | case X86II::OneArgFPRW: |
438 | case X86II::TwoArgFP: |
439 | // These are instructions like |
440 | // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) |
441 | // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) |
442 | // They are intrinsically serial and do not modify the state of the stack. |
443 | // We generate the same code for latency and uops. |
444 | return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters); |
445 | case X86II::CompareFP: |
446 | case X86II::CondMovFP: |
// We can compute uops for any FP instruction that does not grow or shrink
// the stack (it either does not touch the stack or pushes as much as it
// pops).
449 | return generateUnconstrainedCodeTemplates( |
450 | Variant, Msg: "instruction does not grow/shrink the FP stack" ); |
451 | default: |
452 | llvm_unreachable("Unknown FP Type!" ); |
453 | } |
454 | } |
455 | |
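// Returns the MOVri opcode used to load an immediate into a register of the
// given bit width.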
456 | static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) { |
457 | switch (RegBitWidth) { |
458 | case 8: |
459 | return X86::MOV8ri; |
460 | case 16: |
461 | return X86::MOV16ri; |
462 | case 32: |
463 | return X86::MOV32ri; |
464 | case 64: |
465 | return X86::MOV64ri; |
466 | } |
467 | llvm_unreachable("Invalid Value Width" ); |
468 | } |
469 | |
// Generates an instruction to load an immediate value into a register.
471 | static MCInst loadImmediate(MCRegister Reg, unsigned RegBitWidth, |
472 | const APInt &Value) { |
473 | if (Value.getBitWidth() > RegBitWidth) |
474 | llvm_unreachable("Value must fit in the Register" ); |
475 | return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth)) |
476 | .addReg(Reg) |
477 | .addImm(Val: Value.getZExtValue()); |
478 | } |
479 | |
480 | // Allocates scratch memory on the stack. |
481 | static MCInst allocateStackSpace(unsigned Bytes) { |
482 | return MCInstBuilder(X86::SUB64ri8) |
483 | .addReg(Reg: X86::RSP) |
484 | .addReg(Reg: X86::RSP) |
485 | .addImm(Val: Bytes); |
486 | } |
487 | |
488 | // Fills scratch memory at offset `OffsetBytes` with value `Imm`. |
489 | static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes, |
490 | uint64_t Imm) { |
491 | return MCInstBuilder(MovOpcode) |
492 | // Address = ESP |
493 | .addReg(Reg: X86::RSP) // BaseReg |
494 | .addImm(Val: 1) // ScaleAmt |
495 | .addReg(Reg: 0) // IndexReg |
496 | .addImm(Val: OffsetBytes) // Disp |
497 | .addReg(Reg: 0) // Segment |
498 | // Immediate. |
499 | .addImm(Val: Imm); |
500 | } |
501 | |
502 | // Loads scratch memory into register `Reg` using opcode `RMOpcode`. |
503 | static MCInst loadToReg(MCRegister Reg, unsigned RMOpcode) { |
504 | return MCInstBuilder(RMOpcode) |
505 | .addReg(Reg) |
506 | // Address = ESP |
507 | .addReg(Reg: X86::RSP) // BaseReg |
508 | .addImm(Val: 1) // ScaleAmt |
509 | .addReg(Reg: 0) // IndexReg |
510 | .addImm(Val: 0) // Disp |
511 | .addReg(Reg: 0); // Segment |
512 | } |
513 | |
514 | // Releases scratch memory. |
515 | static MCInst releaseStackSpace(unsigned Bytes) { |
516 | return MCInstBuilder(X86::ADD64ri8) |
517 | .addReg(Reg: X86::RSP) |
518 | .addReg(Reg: X86::RSP) |
519 | .addImm(Val: Bytes); |
520 | } |
521 | |
// Reserves some space on the stack, fills it with the content of the
// provided constant, and provides methods to load the stack value into a
// register.
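// For example, materializing a 128-bit constant into %XMM0 boils down to
// roughly:
//   ConstantInliner CI(Value);
//   std::vector<MCInst> Code =
//       CI.loadAndFinalize(X86::XMM0, /*RegBitWidth=*/128, X86::MOVDQUrm);
// which writes the constant to the stack, loads it into the register, and
// frees the stack space again (see setRegTo below).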
524 | namespace { |
525 | struct ConstantInliner { |
526 | explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {} |
527 | |
528 | std::vector<MCInst> loadAndFinalize(MCRegister Reg, unsigned RegBitWidth, |
529 | unsigned Opcode); |
530 | |
531 | std::vector<MCInst> loadX87STAndFinalize(MCRegister Reg); |
532 | |
533 | std::vector<MCInst> loadX87FPAndFinalize(MCRegister Reg); |
534 | |
535 | std::vector<MCInst> popFlagAndFinalize(); |
536 | |
537 | std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode, |
538 | unsigned Value); |
539 | |
540 | std::vector<MCInst> loadDirectionFlagAndFinalize(); |
541 | |
542 | private: |
543 | ConstantInliner &add(const MCInst &Inst) { |
544 | Instructions.push_back(x: Inst); |
545 | return *this; |
546 | } |
547 | |
548 | void initStack(unsigned Bytes); |
549 | |
550 | static constexpr const unsigned kF80Bytes = 10; // 80 bits. |
551 | |
552 | APInt Constant_; |
553 | std::vector<MCInst> Instructions; |
554 | }; |
555 | } // namespace |
556 | |
557 | std::vector<MCInst> ConstantInliner::loadAndFinalize(MCRegister Reg, |
558 | unsigned RegBitWidth, |
559 | unsigned Opcode) { |
560 | assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits" ); |
561 | initStack(Bytes: RegBitWidth / 8); |
562 | add(Inst: loadToReg(Reg, RMOpcode: Opcode)); |
563 | add(Inst: releaseStackSpace(Bytes: RegBitWidth / 8)); |
564 | return std::move(Instructions); |
565 | } |
566 | |
567 | std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(MCRegister Reg) { |
568 | initStack(Bytes: kF80Bytes); |
569 | add(Inst: MCInstBuilder(X86::LD_F80m) |
570 | // Address = ESP |
571 | .addReg(Reg: X86::RSP) // BaseReg |
572 | .addImm(Val: 1) // ScaleAmt |
573 | .addReg(Reg: 0) // IndexReg |
574 | .addImm(Val: 0) // Disp |
575 | .addReg(Reg: 0)); // Segment |
576 | if (Reg != X86::ST0) |
577 | add(Inst: MCInstBuilder(X86::ST_Frr).addReg(Reg)); |
578 | add(Inst: releaseStackSpace(Bytes: kF80Bytes)); |
579 | return std::move(Instructions); |
580 | } |
581 | |
582 | std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(MCRegister Reg) { |
583 | initStack(Bytes: kF80Bytes); |
584 | add(Inst: MCInstBuilder(X86::LD_Fp80m) |
585 | .addReg(Reg) |
586 | // Address = ESP |
587 | .addReg(Reg: X86::RSP) // BaseReg |
588 | .addImm(Val: 1) // ScaleAmt |
589 | .addReg(Reg: 0) // IndexReg |
590 | .addImm(Val: 0) // Disp |
591 | .addReg(Reg: 0)); // Segment |
592 | add(Inst: releaseStackSpace(Bytes: kF80Bytes)); |
593 | return std::move(Instructions); |
594 | } |
595 | |
596 | std::vector<MCInst> ConstantInliner::popFlagAndFinalize() { |
597 | initStack(Bytes: 8); |
598 | add(Inst: MCInstBuilder(X86::POPF64)); |
599 | return std::move(Instructions); |
600 | } |
601 | |
602 | std::vector<MCInst> |
603 | ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) { |
604 | add(Inst: allocateStackSpace(Bytes: 4)); |
605 | add(Inst: fillStackSpace(MovOpcode: X86::MOV32mi, OffsetBytes: 0, Imm: Value)); // Mask all FP exceptions |
606 | add(Inst: MCInstBuilder(Opcode) |
607 | // Address = ESP |
608 | .addReg(Reg: X86::RSP) // BaseReg |
609 | .addImm(Val: 1) // ScaleAmt |
610 | .addReg(Reg: 0) // IndexReg |
611 | .addImm(Val: 0) // Disp |
612 | .addReg(Reg: 0)); // Segment |
613 | add(Inst: releaseStackSpace(Bytes: 4)); |
614 | return std::move(Instructions); |
615 | } |
616 | |
617 | std::vector<MCInst> ConstantInliner::loadDirectionFlagAndFinalize() { |
618 | if (Constant_.isZero()) |
619 | add(Inst: MCInstBuilder(X86::CLD)); |
620 | else if (Constant_.isOne()) |
621 | add(Inst: MCInstBuilder(X86::STD)); |
622 | |
623 | return std::move(Instructions); |
624 | } |
625 | |
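// Allocates `Bytes` of stack space and copies the constant into it, writing
// 4-byte chunks first and then a 2-byte and/or 1-byte tail as needed.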
626 | void ConstantInliner::initStack(unsigned Bytes) { |
627 | assert(Constant_.getBitWidth() <= Bytes * 8 && |
628 | "Value does not have the correct size" ); |
629 | const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8 |
630 | ? Constant_.sext(width: Bytes * 8) |
631 | : Constant_; |
632 | add(Inst: allocateStackSpace(Bytes)); |
633 | size_t ByteOffset = 0; |
634 | for (; Bytes - ByteOffset >= 4; ByteOffset += 4) |
635 | add(Inst: fillStackSpace( |
636 | MovOpcode: X86::MOV32mi, OffsetBytes: ByteOffset, |
637 | Imm: WideConstant.extractBits(numBits: 32, bitPosition: ByteOffset * 8).getZExtValue())); |
638 | if (Bytes - ByteOffset >= 2) { |
639 | add(Inst: fillStackSpace( |
640 | MovOpcode: X86::MOV16mi, OffsetBytes: ByteOffset, |
641 | Imm: WideConstant.extractBits(numBits: 16, bitPosition: ByteOffset * 8).getZExtValue())); |
642 | ByteOffset += 2; |
643 | } |
644 | if (Bytes - ByteOffset >= 1) |
645 | add(Inst: fillStackSpace( |
646 | MovOpcode: X86::MOV8mi, OffsetBytes: ByteOffset, |
647 | Imm: WideConstant.extractBits(numBits: 8, bitPosition: ByteOffset * 8).getZExtValue())); |
648 | } |
649 | |
650 | #include "X86GenExegesis.inc" |
651 | |
652 | namespace { |
653 | |
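// Saves the x87/SSE state (via FXSAVE64) and EFLAGS on construction and
// restores them on destruction.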
654 | class X86SavedState : public ExegesisTarget::SavedState { |
655 | public: |
656 | X86SavedState() { |
657 | #if defined(_MSC_VER) && defined(_M_X64) |
658 | _fxsave64(FPState); |
659 | Eflags = __readeflags(); |
660 | #elif defined(__GNUC__) && defined(__x86_64__) |
661 | __builtin_ia32_fxsave64(FPState); |
662 | Eflags = __builtin_ia32_readeflags_u64(); |
663 | #else |
664 | report_fatal_error("X86 exegesis running on unsupported target" ); |
665 | #endif |
666 | } |
667 | |
668 | ~X86SavedState() { |
// Restoring the X87 state does not flush pending exceptions, so make sure
// these exceptions are flushed now.
671 | #if defined(_MSC_VER) && defined(_M_X64) |
672 | _clearfp(); |
673 | _fxrstor64(FPState); |
674 | __writeeflags(Eflags); |
675 | #elif defined(__GNUC__) && defined(__x86_64__) |
676 | asm volatile("fwait" ); |
677 | __builtin_ia32_fxrstor64(FPState); |
678 | __builtin_ia32_writeeflags_u64(Eflags); |
679 | #else |
680 | report_fatal_error("X86 exegesis running on unsupported target" ); |
681 | #endif |
682 | } |
683 | |
684 | private: |
685 | #if defined(__x86_64__) || defined(_M_X64) |
686 | alignas(16) char FPState[512]; |
687 | uint64_t Eflags; |
688 | #endif |
689 | }; |
690 | |
691 | class ExegesisX86Target : public ExegesisTarget { |
692 | public: |
693 | ExegesisX86Target() |
694 | : ExegesisTarget(X86CpuPfmCounters, X86_MC::isOpcodeAvailable) {} |
695 | |
696 | Expected<std::unique_ptr<pfm::CounterGroup>> |
697 | createCounter(StringRef CounterName, const LLVMState &State, |
698 | ArrayRef<const char *> ValidationCounters, |
699 | const pid_t ProcessID) const override { |
700 | // If LbrSamplingPeriod was provided, then ignore the |
701 | // CounterName because we only have one for LBR. |
702 | if (LbrSamplingPeriod > 0) { |
// Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or
// __linux__ (for now).
705 | #if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \ |
706 | defined(__linux__) |
707 | // TODO(boomanaiden154): Add in support for using validation counters when |
708 | // using LBR counters. |
709 | if (ValidationCounters.size() > 0) |
710 | return make_error<StringError>( |
711 | "Using LBR is not currently supported with validation counters" , |
712 | errc::invalid_argument); |
713 | |
714 | return std::make_unique<X86LbrCounter>( |
715 | X86LbrPerfEvent(LbrSamplingPeriod)); |
716 | #else |
717 | return make_error<StringError>( |
718 | Args: "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, " |
719 | "or running on Linux." , |
720 | Args: errc::invalid_argument); |
721 | #endif |
722 | } |
723 | return ExegesisTarget::createCounter(CounterName, State, ValidationCounters, |
724 | ProcessID); |
725 | } |
726 | |
727 | enum ArgumentRegisters { CodeSize = X86::R12, AuxiliaryMemoryFD = X86::R13 }; |
728 | |
729 | private: |
730 | void addTargetSpecificPasses(PassManagerBase &PM) const override; |
731 | |
732 | MCRegister getScratchMemoryRegister(const Triple &TT) const override; |
733 | |
734 | MCRegister getDefaultLoopCounterRegister(const Triple &) const override; |
735 | |
736 | unsigned getMaxMemoryAccessSize() const override { return 64; } |
737 | |
738 | Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var, |
739 | MCOperand &AssignedValue, |
740 | const BitVector &ForbiddenRegs) const override; |
741 | |
742 | void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg, |
743 | unsigned Offset) const override; |
744 | |
745 | void decrementLoopCounterAndJump(MachineBasicBlock &MBB, |
746 | MachineBasicBlock &TargetMBB, |
747 | const MCInstrInfo &MII, |
748 | MCRegister LoopRegister) const override; |
749 | |
750 | std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, MCRegister Reg, |
751 | const APInt &Value) const override; |
752 | |
753 | #ifdef __linux__ |
754 | void generateLowerMunmap(std::vector<MCInst> &GeneratedCode) const override; |
755 | |
756 | void generateUpperMunmap(std::vector<MCInst> &GeneratedCode) const override; |
757 | |
758 | std::vector<MCInst> generateExitSyscall(unsigned ExitCode) const override; |
759 | |
760 | std::vector<MCInst> |
761 | generateMmap(uintptr_t Address, size_t Length, |
762 | uintptr_t FileDescriptorAddress) const override; |
763 | |
764 | void generateMmapAuxMem(std::vector<MCInst> &GeneratedCode) const override; |
765 | |
766 | void moveArgumentRegisters(std::vector<MCInst> &GeneratedCode) const override; |
767 | |
768 | std::vector<MCInst> generateMemoryInitialSetup() const override; |
769 | |
770 | std::vector<MCInst> setStackRegisterToAuxMem() const override; |
771 | |
772 | uintptr_t getAuxiliaryMemoryStartAddress() const override; |
773 | |
774 | std::vector<MCInst> configurePerfCounter(long Request, bool SaveRegisters) const override; |
775 | |
776 | std::vector<MCRegister> getArgumentRegisters() const override; |
777 | |
778 | std::vector<MCRegister> getRegistersNeedSaving() const override; |
779 | #endif // __linux__ |
780 | |
781 | ArrayRef<MCPhysReg> getUnavailableRegisters() const override { |
782 | if (DisableUpperSSERegisters) |
783 | return ArrayRef(kUnavailableRegistersSSE); |
784 | |
785 | return ArrayRef(kUnavailableRegisters); |
786 | } |
787 | |
788 | bool allowAsBackToBack(const Instruction &Instr) const override { |
789 | const unsigned Opcode = Instr.Description.Opcode; |
790 | return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r && |
791 | Opcode != X86::LEA64_32r && Opcode != X86::LEA16r; |
792 | } |
793 | |
794 | std::vector<InstructionTemplate> |
795 | generateInstructionVariants(const Instruction &Instr, |
796 | unsigned MaxConfigsPerOpcode) const override; |
797 | |
798 | std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator( |
799 | const LLVMState &State, |
800 | const SnippetGenerator::Options &Opts) const override { |
801 | return std::make_unique<X86SerialSnippetGenerator>(args: State, args: Opts); |
802 | } |
803 | |
804 | std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator( |
805 | const LLVMState &State, |
806 | const SnippetGenerator::Options &Opts) const override { |
807 | return std::make_unique<X86ParallelSnippetGenerator>(args: State, args: Opts); |
808 | } |
809 | |
810 | bool matchesArch(Triple::ArchType Arch) const override { |
811 | return Arch == Triple::x86_64 || Arch == Triple::x86; |
812 | } |
813 | |
814 | Error checkFeatureSupport() const override { |
815 | // LBR is the only feature we conditionally support now. |
816 | // So if LBR is not requested, then we should be able to run the benchmarks. |
817 | if (LbrSamplingPeriod == 0) |
818 | return Error::success(); |
819 | |
820 | #if defined(__linux__) && defined(HAVE_LIBPFM) && \ |
821 | defined(LIBPFM_HAS_FIELD_CYCLES) |
822 | // FIXME: Fix this. |
823 | // https://bugs.llvm.org/show_bug.cgi?id=48918 |
// For now, only do the check if we see an Intel machine because
// the counter uses some Intel-specific magic and it could get
// confused and think an AMD machine actually has LBR support.
827 | #if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ |
828 | defined(_M_X64) |
829 | using namespace sys::detail::x86; |
830 | |
831 | if (getVendorSignature() == VendorSignatures::GENUINE_INTEL) |
832 | // If the kernel supports it, the hardware still may not have it. |
833 | return X86LbrCounter::checkLbrSupport(); |
834 | #else |
835 | report_fatal_error("Running X86 exegesis on unsupported target" ); |
836 | #endif |
837 | #endif |
838 | return make_error<StringError>( |
839 | Args: "LBR not supported on this kernel and/or platform" , |
840 | Args: errc::not_supported); |
841 | } |
842 | |
843 | std::unique_ptr<SavedState> withSavedState() const override { |
844 | return std::make_unique<X86SavedState>(); |
845 | } |
846 | |
847 | static const MCPhysReg kUnavailableRegisters[4]; |
static const MCPhysReg kUnavailableRegistersSSE[12];
849 | }; |
850 | |
851 | // We disable a few registers that cannot be encoded on instructions with a REX |
852 | // prefix. |
853 | const MCPhysReg ExegesisX86Target::kUnavailableRegisters[4] = { |
854 | X86::AH, X86::BH, X86::CH, X86::DH}; |
855 | |
856 | // Optionally, also disable the upper (x86_64) SSE registers to reduce frontend |
857 | // decoder load. |
const MCPhysReg ExegesisX86Target::kUnavailableRegistersSSE[12] = {
859 | X86::AH, X86::BH, X86::CH, X86::DH, X86::XMM8, X86::XMM9, |
860 | X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15}; |
861 | |
// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS implicitly uses RSI and RDI), so they have fewer
// conflicts.
865 | constexpr const MCPhysReg kDefaultLoopCounterReg = X86::R8; |
866 | |
867 | } // namespace |
868 | |
869 | void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const { |
870 | // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F. |
871 | PM.add(P: createX86FloatingPointStackifierPass()); |
872 | } |
873 | |
874 | MCRegister ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const { |
875 | if (!TT.isArch64Bit()) { |
876 | // FIXME: This would require popping from the stack, so we would have to |
877 | // add some additional setup code. |
878 | return MCRegister(); |
879 | } |
880 | return TT.isOSWindows() ? X86::RCX : X86::RDI; |
881 | } |
882 | |
883 | MCRegister |
884 | ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const { |
885 | if (!TT.isArch64Bit()) { |
886 | return MCRegister(); |
887 | } |
888 | return kDefaultLoopCounterReg; |
889 | } |
890 | |
891 | Error ExegesisX86Target::randomizeTargetMCOperand( |
892 | const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue, |
893 | const BitVector &ForbiddenRegs) const { |
894 | const Operand &Op = Instr.getPrimaryOperand(Var); |
895 | switch (Op.getExplicitOperandInfo().OperandType) { |
896 | case X86::OperandType::OPERAND_COND_CODE: |
897 | AssignedValue = |
898 | MCOperand::createImm(Val: randomIndex(Max: X86::CondCode::LAST_VALID_COND)); |
899 | return Error::success(); |
900 | case X86::OperandType::OPERAND_ROUNDING_CONTROL: |
901 | AssignedValue = |
902 | MCOperand::createImm(Val: randomIndex(Max: X86::STATIC_ROUNDING::TO_ZERO)); |
903 | return Error::success(); |
904 | default: |
905 | break; |
906 | } |
907 | return make_error<Failure>( |
908 | Args: Twine("unimplemented operand type " ) |
909 | .concat(Suffix: Twine(Op.getExplicitOperandInfo().OperandType))); |
910 | } |
911 | |
912 | void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, |
913 | MCRegister Reg, |
914 | unsigned Offset) const { |
915 | assert(!isInvalidMemoryInstr(IT.getInstr()) && |
916 | "fillMemoryOperands requires a valid memory instruction" ); |
917 | int MemOpIdx = X86II::getMemoryOperandNo(TSFlags: IT.getInstr().Description.TSFlags); |
918 | assert(MemOpIdx >= 0 && "invalid memory operand index" ); |
919 | // getMemoryOperandNo() ignores tied operands, so we have to add them back. |
920 | MemOpIdx += X86II::getOperandBias(Desc: IT.getInstr().Description); |
921 | setMemOp(IT, OpIdx: MemOpIdx + 0, OpVal: MCOperand::createReg(Reg)); // BaseReg |
922 | setMemOp(IT, OpIdx: MemOpIdx + 1, OpVal: MCOperand::createImm(Val: 1)); // ScaleAmt |
923 | setMemOp(IT, OpIdx: MemOpIdx + 2, OpVal: MCOperand::createReg(Reg: 0)); // IndexReg |
924 | setMemOp(IT, OpIdx: MemOpIdx + 3, OpVal: MCOperand::createImm(Val: Offset)); // Disp |
925 | setMemOp(IT, OpIdx: MemOpIdx + 4, OpVal: MCOperand::createReg(Reg: 0)); // Segment |
926 | } |
927 | |
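// Adds -1 to the loop counter register and emits a conditional jump (JNE)
// back to `TargetMBB`, so the loop repeats until the counter reaches zero.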
928 | void ExegesisX86Target::decrementLoopCounterAndJump( |
929 | MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, |
930 | const MCInstrInfo &MII, MCRegister LoopRegister) const { |
931 | BuildMI(BB: &MBB, MIMD: DebugLoc(), MCID: MII.get(Opcode: X86::ADD64ri8)) |
932 | .addDef(RegNo: LoopRegister) |
933 | .addUse(RegNo: LoopRegister) |
934 | .addImm(Val: -1); |
935 | BuildMI(BB: &MBB, MIMD: DebugLoc(), MCID: MII.get(Opcode: X86::JCC_1)) |
936 | .addMBB(MBB: &TargetMBB) |
937 | .addImm(Val: X86::COND_NE); |
938 | } |
939 | |
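// Saves `Register` by pushing it onto the stack.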
940 | void generateRegisterStackPush(unsigned int Register, |
941 | std::vector<MCInst> &GeneratedCode) { |
942 | GeneratedCode.push_back(x: MCInstBuilder(X86::PUSH64r).addReg(Reg: Register)); |
943 | } |
944 | |
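// Restores `Register` by popping it off the stack.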
945 | void generateRegisterStackPop(unsigned int Register, |
946 | std::vector<MCInst> &GeneratedCode) { |
947 | GeneratedCode.push_back(x: MCInstBuilder(X86::POP64r).addReg(Reg: Register)); |
948 | } |
949 | |
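// Loads the system call number into RAX and emits a SYSCALL instruction; the
// arguments are expected to already be in the appropriate registers.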
950 | void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) { |
951 | GeneratedCode.push_back( |
952 | x: loadImmediate(Reg: X86::RAX, RegBitWidth: 64, Value: APInt(64, SyscallNumber))); |
953 | GeneratedCode.push_back(x: MCInstBuilder(X86::SYSCALL)); |
954 | } |
955 | |
956 | // The functions below for saving and restoring system call registers are only |
957 | // used when llvm-exegesis is built on Linux. |
958 | #ifdef __linux__ |
959 | constexpr std::array<unsigned, 6> SyscallArgumentRegisters{ |
960 | X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9}; |
961 | |
962 | static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode, |
963 | unsigned ArgumentCount) { |
assert(ArgumentCount <= 6 &&
       "System calls on X86-64 Linux can only take six arguments");
966 | // Preserve RCX and R11 (Clobbered by the system call). |
967 | generateRegisterStackPush(Register: X86::RCX, GeneratedCode); |
968 | generateRegisterStackPush(Register: X86::R11, GeneratedCode); |
969 | // Preserve RAX (used for the syscall number/return value). |
970 | generateRegisterStackPush(Register: X86::RAX, GeneratedCode); |
971 | // Preserve the registers used to pass arguments to the system call. |
972 | for (unsigned I = 0; I < ArgumentCount; ++I) |
973 | generateRegisterStackPush(Register: SyscallArgumentRegisters[I], GeneratedCode); |
974 | } |
975 | |
976 | static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode, |
977 | unsigned ArgumentCount) { |
assert(ArgumentCount <= 6 &&
       "System calls on X86-64 Linux can only take six arguments");
980 | // Restore the argument registers, in the opposite order of the way they are |
981 | // saved. |
982 | for (unsigned I = ArgumentCount; I > 0; --I) { |
983 | generateRegisterStackPop(Register: SyscallArgumentRegisters[I - 1], GeneratedCode); |
984 | } |
985 | generateRegisterStackPop(Register: X86::RAX, GeneratedCode); |
986 | generateRegisterStackPop(Register: X86::R11, GeneratedCode); |
987 | generateRegisterStackPop(Register: X86::RCX, GeneratedCode); |
988 | } |
989 | #endif // __linux__ |
990 | |
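// Sets the FS or GS segment base to `Value` by issuing an arch_prctl system
// call (x86-64 Linux only). The registers clobbered by the system call are
// saved and restored around it.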
991 | static std::vector<MCInst> loadImmediateSegmentRegister(MCRegister Reg, |
992 | const APInt &Value) { |
993 | #if defined(__x86_64__) && defined(__linux__) |
994 | assert(Value.getBitWidth() <= 64 && "Value must fit in the register." ); |
995 | std::vector<MCInst> loadSegmentRegisterCode; |
// Preserve the syscall registers here, as we don't want to make any
// assumptions about the order in which registers are set up, and we may
// already have loaded registers that the code below would clobber.
1000 | saveSyscallRegisters(GeneratedCode&: loadSegmentRegisterCode, ArgumentCount: 2); |
// Generate the instructions for the arch_prctl system call that sets the
// segment base register.
1003 | int SyscallCode = 0; |
1004 | if (Reg == X86::FS) |
1005 | SyscallCode = ARCH_SET_FS; |
1006 | else if (Reg == X86::GS) |
1007 | SyscallCode = ARCH_SET_GS; |
1008 | else |
1009 | llvm_unreachable("Only the segment registers GS and FS are supported" ); |
1010 | loadSegmentRegisterCode.push_back( |
1011 | x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, SyscallCode))); |
1012 | loadSegmentRegisterCode.push_back(x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value)); |
1013 | generateSyscall(SYS_arch_prctl, GeneratedCode&: loadSegmentRegisterCode); |
1014 | // Restore the registers in reverse order |
1015 | restoreSyscallRegisters(GeneratedCode&: loadSegmentRegisterCode, ArgumentCount: 2); |
1016 | return loadSegmentRegisterCode; |
1017 | #else |
1018 | llvm_unreachable("Loading immediate segment registers is only supported with " |
1019 | "x86-64 llvm-exegesis" ); |
1020 | #endif // defined(__x86_64__) && defined(__linux__) |
1021 | } |
1022 | |
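// Returns the instruction sequence used to set up register `Reg` before
// running a snippet, dispatching on the register class (segment, GPR, mask,
// MMX/SSE/AVX/AVX-512, x87, EFLAGS, MXCSR, FPCW, DF). Returns an empty
// vector for registers that are not handled yet.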
1023 | std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI, |
1024 | MCRegister Reg, |
1025 | const APInt &Value) const { |
1026 | if (X86::SEGMENT_REGRegClass.contains(Reg)) |
1027 | return loadImmediateSegmentRegister(Reg, Value); |
1028 | if (X86::GR8RegClass.contains(Reg)) |
1029 | return {loadImmediate(Reg, RegBitWidth: 8, Value)}; |
1030 | if (X86::GR16RegClass.contains(Reg)) |
1031 | return {loadImmediate(Reg, RegBitWidth: 16, Value)}; |
1032 | if (X86::GR32RegClass.contains(Reg)) |
1033 | return {loadImmediate(Reg, RegBitWidth: 32, Value)}; |
1034 | if (X86::GR64RegClass.contains(Reg)) |
1035 | return {loadImmediate(Reg, RegBitWidth: 64, Value)}; |
1036 | if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || |
1037 | X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) { |
1038 | switch (Value.getBitWidth()) { |
1039 | case 8: |
1040 | if (STI.getFeatureBits()[X86::FeatureDQI]) { |
1041 | ConstantInliner CI(Value); |
1042 | return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVBkm); |
1043 | } |
1044 | [[fallthrough]]; |
1045 | case 16: |
1046 | if (STI.getFeatureBits()[X86::FeatureAVX512]) { |
1047 | ConstantInliner CI(Value.zextOrTrunc(width: 16)); |
1048 | return CI.loadAndFinalize(Reg, RegBitWidth: 16, Opcode: X86::KMOVWkm); |
1049 | } |
1050 | break; |
1051 | case 32: |
1052 | if (STI.getFeatureBits()[X86::FeatureBWI]) { |
1053 | ConstantInliner CI(Value); |
1054 | return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVDkm); |
1055 | } |
1056 | break; |
1057 | case 64: |
1058 | if (STI.getFeatureBits()[X86::FeatureBWI]) { |
1059 | ConstantInliner CI(Value); |
1060 | return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVQkm); |
1061 | } |
1062 | break; |
1063 | } |
1064 | } |
1065 | ConstantInliner CI(Value); |
1066 | if (X86::VR64RegClass.contains(Reg)) |
1067 | return CI.loadAndFinalize(Reg, RegBitWidth: 64, Opcode: X86::MMX_MOVQ64rm); |
1068 | if (X86::VR128RegClass.contains(Reg)) { |
1069 | if (STI.getFeatureBits()[X86::FeatureAVX]) |
1070 | return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::VMOVDQUrm); |
1071 | return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::MOVDQUrm); |
1072 | } |
1073 | if (X86::VR128XRegClass.contains(Reg)) { |
1074 | if (STI.getFeatureBits()[X86::FeatureAVX512]) |
1075 | return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::VMOVDQU32Z128rm); |
1076 | } |
1077 | if (X86::VR256RegClass.contains(Reg)) { |
1078 | if (STI.getFeatureBits()[X86::FeatureAVX]) |
1079 | return CI.loadAndFinalize(Reg, RegBitWidth: 256, Opcode: X86::VMOVDQUYrm); |
1080 | } |
1081 | if (X86::VR256XRegClass.contains(Reg)) { |
1082 | if (STI.getFeatureBits()[X86::FeatureAVX512]) |
1083 | return CI.loadAndFinalize(Reg, RegBitWidth: 256, Opcode: X86::VMOVDQU32Z256rm); |
1084 | } |
1085 | if (X86::VR512RegClass.contains(Reg)) |
1086 | if (STI.getFeatureBits()[X86::FeatureAVX512]) |
1087 | return CI.loadAndFinalize(Reg, RegBitWidth: 512, Opcode: X86::VMOVDQU32Zrm); |
1088 | if (X86::RSTRegClass.contains(Reg)) { |
1089 | return CI.loadX87STAndFinalize(Reg); |
1090 | } |
1091 | if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) || |
1092 | X86::RFP80RegClass.contains(Reg)) { |
1093 | return CI.loadX87FPAndFinalize(Reg); |
1094 | } |
1095 | if (Reg == X86::EFLAGS) |
1096 | return CI.popFlagAndFinalize(); |
1097 | if (Reg == X86::MXCSR) |
1098 | return CI.loadImplicitRegAndFinalize( |
1099 | Opcode: STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR, |
1100 | Value: 0x1f80); |
1101 | if (Reg == X86::FPCW) |
1102 | return CI.loadImplicitRegAndFinalize(Opcode: X86::FLDCW16m, Value: 0x37f); |
1103 | if (Reg == X86::DF) |
1104 | return CI.loadDirectionFlagAndFinalize(); |
1105 | return {}; // Not yet implemented. |
1106 | } |
1107 | |
1108 | #ifdef __linux__ |
1109 | |
1110 | #ifdef __arm__ |
1111 | static constexpr const uintptr_t VAddressSpaceCeiling = 0xC0000000; |
1112 | #else |
1113 | static constexpr const uintptr_t VAddressSpaceCeiling = 0x0000800000000000; |
1114 | #endif |
1115 | |
1116 | void generateRoundToNearestPage(unsigned int Register, |
1117 | std::vector<MCInst> &GeneratedCode) { |
1118 | int PageSizeShift = static_cast<int>(round(x: log2(x: getpagesize()))); |
// Round down to the nearest page boundary by clearing the least significant
// bits that encode the offset within the page: shift right by log2(page
// size) and then shift back left.
1122 | GeneratedCode.push_back(x: MCInstBuilder(X86::SHR64ri) |
1123 | .addReg(Reg: Register) |
1124 | .addReg(Reg: Register) |
1125 | .addImm(Val: PageSizeShift)); |
1126 | GeneratedCode.push_back(x: MCInstBuilder(X86::SHL64ri) |
1127 | .addReg(Reg: Register) |
1128 | .addReg(Reg: Register) |
1129 | .addImm(Val: PageSizeShift)); |
1130 | } |
1131 | |
1132 | void generateGetInstructionPointer(unsigned int ResultRegister, |
1133 | std::vector<MCInst> &GeneratedCode) { |
1134 | // Use a load effective address to get the current instruction pointer and put |
1135 | // it into the result register. |
1136 | GeneratedCode.push_back(x: MCInstBuilder(X86::LEA64r) |
1137 | .addReg(Reg: ResultRegister) |
1138 | .addReg(Reg: X86::RIP) |
1139 | .addImm(Val: 1) |
1140 | .addReg(Reg: 0) |
1141 | .addImm(Val: 0) |
1142 | .addReg(Reg: 0)); |
1143 | } |
1144 | |
1145 | void ExegesisX86Target::generateLowerMunmap( |
1146 | std::vector<MCInst> &GeneratedCode) const { |
1147 | // Unmap starting at address zero |
1148 | GeneratedCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, 0))); |
1149 | // Get the current instruction pointer so we know where to unmap up to. |
1150 | generateGetInstructionPointer(ResultRegister: X86::RSI, GeneratedCode); |
1151 | generateRoundToNearestPage(Register: X86::RSI, GeneratedCode); |
1152 | // Subtract a page from the end of the unmap so we don't unmap the currently |
1153 | // executing section. |
1154 | GeneratedCode.push_back(x: MCInstBuilder(X86::SUB64ri32) |
1155 | .addReg(Reg: X86::RSI) |
1156 | .addReg(Reg: X86::RSI) |
1157 | .addImm(Val: getpagesize())); |
1158 | generateSyscall(SYS_munmap, GeneratedCode); |
1159 | } |
1160 | |
1161 | void ExegesisX86Target::generateUpperMunmap( |
1162 | std::vector<MCInst> &GeneratedCode) const { |
1163 | generateGetInstructionPointer(ResultRegister: X86::R8, GeneratedCode); |
// Load the size of the snippet into RDI from the argument register.
1165 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1166 | .addReg(Reg: X86::RDI) |
1167 | .addReg(Reg: ArgumentRegisters::CodeSize)); |
// Add the length of the snippet (in %RDI) to the current instruction pointer
// (%R8) to get the address at which we should start unmapping.
1170 | GeneratedCode.push_back(x: MCInstBuilder(X86::ADD64rr) |
1171 | .addReg(Reg: X86::RDI) |
1172 | .addReg(Reg: X86::RDI) |
1173 | .addReg(Reg: X86::R8)); |
1174 | generateRoundToNearestPage(Register: X86::RDI, GeneratedCode); |
// Add one page to the start address to ensure that we're above the snippet,
// since the rounding above rounds down.
1177 | GeneratedCode.push_back(x: MCInstBuilder(X86::ADD64ri32) |
1178 | .addReg(Reg: X86::RDI) |
1179 | .addReg(Reg: X86::RDI) |
1180 | .addImm(Val: getpagesize())); |
1181 | // Unmap to just one page under the ceiling of the address space. |
1182 | GeneratedCode.push_back(x: loadImmediate( |
1183 | Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, VAddressSpaceCeiling - getpagesize()))); |
1184 | GeneratedCode.push_back(x: MCInstBuilder(X86::SUB64rr) |
1185 | .addReg(Reg: X86::RSI) |
1186 | .addReg(Reg: X86::RSI) |
1187 | .addReg(Reg: X86::RDI)); |
1188 | generateSyscall(SYS_munmap, GeneratedCode); |
1189 | } |
1190 | |
1191 | std::vector<MCInst> |
1192 | ExegesisX86Target::generateExitSyscall(unsigned ExitCode) const { |
1193 | std::vector<MCInst> ExitCallCode; |
1194 | ExitCallCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, ExitCode))); |
1195 | generateSyscall(SYS_exit, GeneratedCode&: ExitCallCode); |
1196 | return ExitCallCode; |
1197 | } |
1198 | |
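// Generates an mmap system call mapping `Length` bytes at `Address`
// (read/write, shared, MAP_FIXED_NOREPLACE), backed by the file descriptor
// stored at `FileDescriptorAddress`.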
1199 | std::vector<MCInst> |
1200 | ExegesisX86Target::generateMmap(uintptr_t Address, size_t Length, |
1201 | uintptr_t FileDescriptorAddress) const { |
1202 | std::vector<MCInst> MmapCode; |
1203 | MmapCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, Address))); |
1204 | MmapCode.push_back(x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, Length))); |
1205 | MmapCode.push_back( |
1206 | x: loadImmediate(Reg: X86::RDX, RegBitWidth: 64, Value: APInt(64, PROT_READ | PROT_WRITE))); |
1207 | MmapCode.push_back( |
1208 | x: loadImmediate(Reg: X86::R10, RegBitWidth: 64, Value: APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE))); |
1209 | // Copy file descriptor location from aux memory into R8 |
1210 | MmapCode.push_back( |
1211 | x: loadImmediate(Reg: X86::R8, RegBitWidth: 64, Value: APInt(64, FileDescriptorAddress))); |
1212 | // Dereference file descriptor into FD argument register |
1213 | MmapCode.push_back(x: MCInstBuilder(X86::MOV32rm) |
1214 | .addReg(Reg: X86::R8D) |
1215 | .addReg(Reg: X86::R8) |
1216 | .addImm(Val: 1) |
1217 | .addReg(Reg: 0) |
1218 | .addImm(Val: 0) |
1219 | .addReg(Reg: 0)); |
1220 | MmapCode.push_back(x: loadImmediate(Reg: X86::R9, RegBitWidth: 64, Value: APInt(64, 0))); |
1221 | generateSyscall(SYS_mmap, GeneratedCode&: MmapCode); |
1222 | return MmapCode; |
1223 | } |
1224 | |
1225 | void ExegesisX86Target::generateMmapAuxMem( |
1226 | std::vector<MCInst> &GeneratedCode) const { |
1227 | GeneratedCode.push_back( |
1228 | x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, getAuxiliaryMemoryStartAddress()))); |
1229 | GeneratedCode.push_back(x: loadImmediate( |
1230 | Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, SubprocessMemory::AuxiliaryMemorySize))); |
1231 | GeneratedCode.push_back( |
1232 | x: loadImmediate(Reg: X86::RDX, RegBitWidth: 64, Value: APInt(64, PROT_READ | PROT_WRITE))); |
1233 | GeneratedCode.push_back( |
1234 | x: loadImmediate(Reg: X86::R10, RegBitWidth: 64, Value: APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE))); |
1235 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1236 | .addReg(Reg: X86::R8) |
1237 | .addReg(Reg: ArgumentRegisters::AuxiliaryMemoryFD)); |
1238 | GeneratedCode.push_back(x: loadImmediate(Reg: X86::R9, RegBitWidth: 64, Value: APInt(64, 0))); |
1239 | generateSyscall(SYS_mmap, GeneratedCode); |
1240 | } |
1241 | |
1242 | void ExegesisX86Target::moveArgumentRegisters( |
1243 | std::vector<MCInst> &GeneratedCode) const { |
1244 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1245 | .addReg(Reg: ArgumentRegisters::CodeSize) |
1246 | .addReg(Reg: X86::RDI)); |
1247 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1248 | .addReg(Reg: ArgumentRegisters::AuxiliaryMemoryFD) |
1249 | .addReg(Reg: X86::RSI)); |
1250 | } |
1251 | |
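// Stashes the argument registers, unmaps the address space below and above
// the snippet, and maps the auxiliary memory into place.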
1252 | std::vector<MCInst> ExegesisX86Target::generateMemoryInitialSetup() const { |
1253 | std::vector<MCInst> MemoryInitialSetupCode; |
1254 | moveArgumentRegisters(GeneratedCode&: MemoryInitialSetupCode); |
1255 | generateLowerMunmap(GeneratedCode&: MemoryInitialSetupCode); |
1256 | generateUpperMunmap(GeneratedCode&: MemoryInitialSetupCode); |
1257 | generateMmapAuxMem(GeneratedCode&: MemoryInitialSetupCode); |
1258 | return MemoryInitialSetupCode; |
1259 | } |
1260 | |
1261 | std::vector<MCInst> ExegesisX86Target::setStackRegisterToAuxMem() const { |
1262 | // Moves %rsp to the end of the auxiliary memory |
1263 | return {MCInstBuilder(X86::MOV64ri) |
1264 | .addReg(Reg: X86::RSP) |
1265 | .addImm(Val: getAuxiliaryMemoryStartAddress() + |
1266 | SubprocessMemory::AuxiliaryMemorySize)}; |
1267 | } |
1268 | |
1269 | uintptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const { |
// Return the second-to-last page in the virtual address space to try to
// prevent interference with memory annotations in the snippet.
1272 | return VAddressSpaceCeiling - 2 * getpagesize(); |
1273 | } |
1274 | |
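// Emits an ioctl system call on the perf counter file descriptor (read from
// the start of the auxiliary memory) with the given request, optionally
// preserving the registers clobbered by the system call.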
1275 | std::vector<MCInst> |
1276 | ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const { |
1277 | std::vector<MCInst> ConfigurePerfCounterCode; |
1278 | if (SaveRegisters) |
1279 | saveSyscallRegisters(GeneratedCode&: ConfigurePerfCounterCode, ArgumentCount: 3); |
1280 | ConfigurePerfCounterCode.push_back( |
1281 | x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, getAuxiliaryMemoryStartAddress()))); |
1282 | ConfigurePerfCounterCode.push_back(x: MCInstBuilder(X86::MOV32rm) |
1283 | .addReg(Reg: X86::EDI) |
1284 | .addReg(Reg: X86::RDI) |
1285 | .addImm(Val: 1) |
1286 | .addReg(Reg: 0) |
1287 | .addImm(Val: 0) |
1288 | .addReg(Reg: 0)); |
1289 | ConfigurePerfCounterCode.push_back( |
1290 | x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, Request))); |
1291 | #ifdef HAVE_LIBPFM |
1292 | ConfigurePerfCounterCode.push_back( |
1293 | loadImmediate(X86::RDX, 64, APInt(64, PERF_IOC_FLAG_GROUP))); |
1294 | #endif // HAVE_LIBPFM |
1295 | generateSyscall(SYS_ioctl, GeneratedCode&: ConfigurePerfCounterCode); |
1296 | if (SaveRegisters) |
1297 | restoreSyscallRegisters(GeneratedCode&: ConfigurePerfCounterCode, ArgumentCount: 3); |
1298 | return ConfigurePerfCounterCode; |
1299 | } |
1300 | |
1301 | std::vector<MCRegister> ExegesisX86Target::getArgumentRegisters() const { |
1302 | return {X86::RDI, X86::RSI}; |
1303 | } |
1304 | |
1305 | std::vector<MCRegister> ExegesisX86Target::getRegistersNeedSaving() const { |
1306 | return {X86::RAX, X86::RDI, X86::RSI, X86::RCX, X86::R11}; |
1307 | } |
1308 | |
1309 | #endif // __linux__ |
1310 | |
// Instructions can have variable operands, and we may want to see how
// different operand values affect performance. So for each operand position,
// precompute all the possible choices we might care about, and greedily
// generate all possible combinations of those choices.
1315 | std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants( |
1316 | const Instruction &Instr, unsigned MaxConfigsPerOpcode) const { |
1317 | bool Exploration = false; |
1318 | SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices; |
1319 | VariableChoices.resize(N: Instr.Variables.size()); |
1320 | for (auto I : zip(t: Instr.Variables, u&: VariableChoices)) { |
1321 | const Variable &Var = std::get<0>(t&: I); |
1322 | SmallVectorImpl<MCOperand> &Choices = std::get<1>(t&: I); |
1323 | |
1324 | switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) { |
1325 | default: |
1326 | // We don't wish to explicitly explore this variable. |
1327 | Choices.emplace_back(); // But add invalid MCOperand to simplify logic. |
1328 | continue; |
1329 | case X86::OperandType::OPERAND_COND_CODE: { |
1330 | Exploration = true; |
1331 | auto CondCodes = enum_seq_inclusive(Begin: X86::CondCode::COND_O, |
1332 | End: X86::CondCode::LAST_VALID_COND, |
1333 | force_iteration_on_noniterable_enum); |
1334 | Choices.reserve(N: CondCodes.size()); |
1335 | for (int CondCode : CondCodes) |
1336 | Choices.emplace_back(Args: MCOperand::createImm(Val: CondCode)); |
1337 | break; |
1338 | } |
1339 | } |
1340 | } |
1341 | |
1342 | // If we don't wish to explore any variables, defer to the baseline method. |
1343 | if (!Exploration) |
1344 | return ExegesisTarget::generateInstructionVariants(Instr, |
1345 | MaxConfigsPerOpcode); |
1346 | |
1347 | std::vector<InstructionTemplate> Variants; |
1348 | size_t NumVariants; |
1349 | CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G( |
1350 | VariableChoices); |
1351 | |
1352 | // How many operand combinations can we produce, within the limit? |
1353 | NumVariants = std::min(a: G.numCombinations(), b: (size_t)MaxConfigsPerOpcode); |
1354 | // And actually produce all the wanted operand combinations. |
1355 | Variants.reserve(n: NumVariants); |
1356 | G.generate(Callback: [&](ArrayRef<MCOperand> State) -> bool { |
1357 | Variants.emplace_back(args: &Instr); |
1358 | Variants.back().setVariableValues(State); |
1359 | // Did we run out of space for variants? |
1360 | return Variants.size() >= NumVariants; |
1361 | }); |
1362 | |
1363 | assert(Variants.size() == NumVariants && |
1364 | Variants.size() <= MaxConfigsPerOpcode && |
1365 | "Should not produce too many variants" ); |
1366 | return Variants; |
1367 | } |
1368 | |
1369 | static ExegesisTarget *getTheExegesisX86Target() { |
1370 | static ExegesisX86Target Target; |
1371 | return &Target; |
1372 | } |
1373 | |
1374 | void InitializeX86ExegesisTarget() { |
1375 | ExegesisTarget::registerTarget(T: getTheExegesisX86Target()); |
1376 | } |
1377 | |
1378 | } // namespace exegesis |
1379 | } // namespace llvm |
1380 | |