1 | //===-- Target.cpp ----------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | #include "../Target.h" |
9 | |
10 | #include "../Error.h" |
11 | #include "../MmapUtils.h" |
12 | #include "../ParallelSnippetGenerator.h" |
13 | #include "../SerialSnippetGenerator.h" |
14 | #include "../SnippetGenerator.h" |
15 | #include "../SubprocessMemory.h" |
16 | #include "MCTargetDesc/X86BaseInfo.h" |
17 | #include "MCTargetDesc/X86MCTargetDesc.h" |
18 | #include "X86.h" |
19 | #include "X86Counter.h" |
20 | #include "X86RegisterInfo.h" |
21 | #include "llvm/ADT/Sequence.h" |
22 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
23 | #include "llvm/MC/MCInstBuilder.h" |
24 | #include "llvm/Support/Errc.h" |
25 | #include "llvm/Support/Error.h" |
26 | #include "llvm/Support/ErrorHandling.h" |
27 | #include "llvm/Support/FormatVariadic.h" |
28 | #include "llvm/TargetParser/Host.h" |
29 | |
30 | #include <memory> |
31 | #include <string> |
32 | #include <vector> |
33 | #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) |
34 | #include <immintrin.h> |
35 | #include <intrin.h> |
36 | #endif |
37 | #if defined(_MSC_VER) && defined(_M_X64) |
38 | #include <float.h> // For _clearfp in ~X86SavedState(). |
39 | #endif |
40 | |
41 | #ifdef __linux__ |
42 | #ifdef __x86_64__ |
43 | #include <asm/prctl.h> |
44 | #endif // __x86_64__ |
45 | #include <sys/mman.h> |
46 | #include <sys/syscall.h> |
47 | #include <unistd.h> |
48 | #ifdef HAVE_LIBPFM |
49 | #include <perfmon/perf_event.h> |
50 | #endif // HAVE_LIBPFM |
51 | #endif |
52 | |
53 | #define GET_AVAILABLE_OPCODE_CHECKER |
54 | #include "X86GenInstrInfo.inc" |
55 | |
56 | namespace llvm { |
57 | namespace exegesis { |
58 | |
59 | // If a positive value is specified, we are going to use the LBR in |
60 | // latency-mode. |
61 | // |
62 | // Note: |
63 | // - A small value is preferred, but too low a value could result in |
64 | // throttling. |
65 | // - A prime number is preferred to avoid always skipping certain blocks. |
66 | // |
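// For example, an invocation might look like (only -x86-lbr-sample-period is
// defined in this file; the other flags are standard llvm-exegesis options):
//   llvm-exegesis -mode=latency -x86-lbr-sample-period=199 -opcode-name=ADD64rr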
67 | static cl::opt<unsigned> LbrSamplingPeriod( |
68 | "x86-lbr-sample-period" , |
69 | cl::desc("The sample period (nbranches/sample), used for LBR sampling" ), |
70 | cl::cat(BenchmarkOptions), cl::init(Val: 0)); |
71 | |
static cl::opt<bool> DisableUpperSSERegisters(
    "x86-disable-upper-sse-registers",
    cl::desc("Disable XMM8-XMM15 register usage"),
    cl::cat(BenchmarkOptions), cl::init(false));
76 | |
// FIXME: Validate that repetition-mode is loop if LBR is requested.
78 | |
79 | // Returns a non-null reason if we cannot handle the memory references in this |
80 | // instruction. |
81 | static const char *isInvalidMemoryInstr(const Instruction &Instr) { |
82 | switch (Instr.Description.TSFlags & X86II::FormMask) { |
83 | default: |
84 | return "Unknown FormMask value" ; |
85 | // These have no memory access. |
86 | case X86II::Pseudo: |
87 | case X86II::RawFrm: |
88 | case X86II::AddCCFrm: |
89 | case X86II::PrefixByte: |
90 | case X86II::MRMDestReg: |
91 | case X86II::MRMSrcReg: |
92 | case X86II::MRMSrcReg4VOp3: |
93 | case X86II::MRMSrcRegOp4: |
94 | case X86II::MRMSrcRegCC: |
95 | case X86II::MRMXrCC: |
96 | case X86II::MRMr0: |
97 | case X86II::MRMXr: |
98 | case X86II::MRM0r: |
99 | case X86II::MRM1r: |
100 | case X86II::MRM2r: |
101 | case X86II::MRM3r: |
102 | case X86II::MRM4r: |
103 | case X86II::MRM5r: |
104 | case X86II::MRM6r: |
105 | case X86II::MRM7r: |
106 | case X86II::MRM0X: |
107 | case X86II::MRM1X: |
108 | case X86II::MRM2X: |
109 | case X86II::MRM3X: |
110 | case X86II::MRM4X: |
111 | case X86II::MRM5X: |
112 | case X86II::MRM6X: |
113 | case X86II::MRM7X: |
114 | case X86II::MRM_C0: |
115 | case X86II::MRM_C1: |
116 | case X86II::MRM_C2: |
117 | case X86II::MRM_C3: |
118 | case X86II::MRM_C4: |
119 | case X86II::MRM_C5: |
120 | case X86II::MRM_C6: |
121 | case X86II::MRM_C7: |
122 | case X86II::MRM_C8: |
123 | case X86II::MRM_C9: |
124 | case X86II::MRM_CA: |
125 | case X86II::MRM_CB: |
126 | case X86II::MRM_CC: |
127 | case X86II::MRM_CD: |
128 | case X86II::MRM_CE: |
129 | case X86II::MRM_CF: |
130 | case X86II::MRM_D0: |
131 | case X86II::MRM_D1: |
132 | case X86II::MRM_D2: |
133 | case X86II::MRM_D3: |
134 | case X86II::MRM_D4: |
135 | case X86II::MRM_D5: |
136 | case X86II::MRM_D6: |
137 | case X86II::MRM_D7: |
138 | case X86II::MRM_D8: |
139 | case X86II::MRM_D9: |
140 | case X86II::MRM_DA: |
141 | case X86II::MRM_DB: |
142 | case X86II::MRM_DC: |
143 | case X86II::MRM_DD: |
144 | case X86II::MRM_DE: |
145 | case X86II::MRM_DF: |
146 | case X86II::MRM_E0: |
147 | case X86II::MRM_E1: |
148 | case X86II::MRM_E2: |
149 | case X86II::MRM_E3: |
150 | case X86II::MRM_E4: |
151 | case X86II::MRM_E5: |
152 | case X86II::MRM_E6: |
153 | case X86II::MRM_E7: |
154 | case X86II::MRM_E8: |
155 | case X86II::MRM_E9: |
156 | case X86II::MRM_EA: |
157 | case X86II::MRM_EB: |
158 | case X86II::MRM_EC: |
159 | case X86II::MRM_ED: |
160 | case X86II::MRM_EE: |
161 | case X86II::MRM_EF: |
162 | case X86II::MRM_F0: |
163 | case X86II::MRM_F1: |
164 | case X86II::MRM_F2: |
165 | case X86II::MRM_F3: |
166 | case X86II::MRM_F4: |
167 | case X86II::MRM_F5: |
168 | case X86II::MRM_F6: |
169 | case X86II::MRM_F7: |
170 | case X86II::MRM_F8: |
171 | case X86II::MRM_F9: |
172 | case X86II::MRM_FA: |
173 | case X86II::MRM_FB: |
174 | case X86II::MRM_FC: |
175 | case X86II::MRM_FD: |
176 | case X86II::MRM_FE: |
177 | case X86II::MRM_FF: |
178 | case X86II::RawFrmImm8: |
179 | return nullptr; |
180 | case X86II::AddRegFrm: |
181 | return (Instr.Description.Opcode == X86::POP16r || |
182 | Instr.Description.Opcode == X86::POP32r || |
183 | Instr.Description.Opcode == X86::PUSH16r || |
184 | Instr.Description.Opcode == X86::PUSH32r) |
185 | ? "unsupported opcode: unsupported memory access" |
186 | : nullptr; |
187 | // These access memory and are handled. |
188 | case X86II::MRMDestMem: |
189 | case X86II::MRMSrcMem: |
190 | case X86II::MRMSrcMem4VOp3: |
191 | case X86II::MRMSrcMemOp4: |
192 | case X86II::MRMSrcMemCC: |
193 | case X86II::MRMXmCC: |
194 | case X86II::MRMXm: |
195 | case X86II::MRM0m: |
196 | case X86II::MRM1m: |
197 | case X86II::MRM2m: |
198 | case X86II::MRM3m: |
199 | case X86II::MRM4m: |
200 | case X86II::MRM5m: |
201 | case X86II::MRM6m: |
202 | case X86II::MRM7m: |
203 | return nullptr; |
204 | // These access memory and are not handled yet. |
205 | case X86II::RawFrmImm16: |
206 | case X86II::RawFrmMemOffs: |
207 | case X86II::RawFrmSrc: |
208 | case X86II::RawFrmDst: |
209 | case X86II::RawFrmDstSrc: |
210 | return "unsupported opcode: non uniform memory access" ; |
211 | } |
212 | } |
213 | |
// If the opcode is invalid, returns a pointer to a string literal explaining
// the reason. nullptr indicates a valid opcode.
216 | static const char *isInvalidOpcode(const Instruction &Instr) { |
217 | const auto OpcodeName = Instr.Name; |
218 | if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo) |
219 | return "unsupported opcode: pseudo instruction" ; |
220 | if ((OpcodeName.starts_with(Prefix: "POP" ) && !OpcodeName.starts_with(Prefix: "POPCNT" )) || |
221 | OpcodeName.starts_with(Prefix: "PUSH" ) || |
222 | OpcodeName.starts_with(Prefix: "ADJCALLSTACK" ) || OpcodeName.starts_with(Prefix: "LEAVE" )) |
223 | return "unsupported opcode: Push/Pop/AdjCallStack/Leave" ; |
224 | switch (Instr.Description.Opcode) { |
225 | case X86::LFS16rm: |
226 | case X86::LFS32rm: |
227 | case X86::LFS64rm: |
228 | case X86::LGS16rm: |
229 | case X86::LGS32rm: |
230 | case X86::LGS64rm: |
231 | case X86::LSS16rm: |
232 | case X86::LSS32rm: |
233 | case X86::LSS64rm: |
234 | case X86::SYSENTER: |
235 | case X86::WRFSBASE: |
236 | case X86::WRFSBASE64: |
237 | return "unsupported opcode" ; |
238 | default: |
239 | break; |
240 | } |
241 | if (const auto reason = isInvalidMemoryInstr(Instr)) |
242 | return reason; |
243 | // We do not handle instructions with OPERAND_PCREL. |
244 | for (const Operand &Op : Instr.Operands) |
245 | if (Op.isExplicit() && |
246 | Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL) |
247 | return "unsupported opcode: PC relative operand" ; |
248 | // We do not handle second-form X87 instructions. We only handle first-form |
249 | // ones (_Fp), see comment in X86InstrFPStack.td. |
250 | for (const Operand &Op : Instr.Operands) |
251 | if (Op.isReg() && Op.isExplicit() && |
252 | Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID) |
253 | return "unsupported second-form X87 instruction" ; |
254 | return nullptr; |
255 | } |
256 | |
257 | static unsigned getX86FPFlags(const Instruction &Instr) { |
258 | return Instr.Description.TSFlags & X86II::FPTypeMask; |
259 | } |
260 | |
261 | // Helper to fill a memory operand with a value. |
262 | static void setMemOp(InstructionTemplate &IT, int OpIdx, |
263 | const MCOperand &OpVal) { |
264 | const auto Op = IT.getInstr().Operands[OpIdx]; |
265 | assert(Op.isExplicit() && "invalid memory pattern" ); |
266 | IT.getValueFor(Op) = OpVal; |
267 | } |
268 | |
// Common (latency, uops) code for LEA templates. `RestrictDestRegs` is given
// the addressing base and index registers and restricts the set of candidate
// destination registers accordingly.
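// Each returned CodeTemplate holds a single LEA whose memory operand is one
// (base, index, scale, displacement) combination; its Config string looks
// like, e.g., "42(%RAX, %RDI, 2)".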
271 | static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon( |
272 | const Instruction &Instr, const BitVector &ForbiddenRegisters, |
273 | const LLVMState &State, const SnippetGenerator::Options &Opts, |
274 | std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)> |
275 | RestrictDestRegs) { |
276 | assert(Instr.Operands.size() == 6 && "invalid LEA" ); |
277 | assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 && |
278 | "invalid LEA" ); |
279 | |
280 | constexpr const int kDestOp = 0; |
281 | constexpr const int kBaseOp = 1; |
282 | constexpr const int kIndexOp = 3; |
283 | auto PossibleDestRegs = |
284 | Instr.Operands[kDestOp].getRegisterAliasing().sourceBits(); |
285 | remove(A&: PossibleDestRegs, B: ForbiddenRegisters); |
286 | auto PossibleBaseRegs = |
287 | Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits(); |
288 | remove(A&: PossibleBaseRegs, B: ForbiddenRegisters); |
289 | auto PossibleIndexRegs = |
290 | Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits(); |
291 | remove(A&: PossibleIndexRegs, B: ForbiddenRegisters); |
292 | |
293 | const auto &RegInfo = State.getRegInfo(); |
294 | std::vector<CodeTemplate> Result; |
295 | for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) { |
296 | for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) { |
297 | for (int LogScale = 0; LogScale <= 3; ++LogScale) { |
298 | // FIXME: Add an option for controlling how we explore immediates. |
299 | for (const int Disp : {0, 42}) { |
300 | InstructionTemplate IT(&Instr); |
301 | const int64_t Scale = 1ull << LogScale; |
302 | setMemOp(IT, OpIdx: 1, OpVal: MCOperand::createReg(Reg: BaseReg)); |
303 | setMemOp(IT, OpIdx: 2, OpVal: MCOperand::createImm(Val: Scale)); |
304 | setMemOp(IT, OpIdx: 3, OpVal: MCOperand::createReg(Reg: IndexReg)); |
305 | setMemOp(IT, OpIdx: 4, OpVal: MCOperand::createImm(Val: Disp)); |
306 | // SegmentReg must be 0 for LEA. |
307 | setMemOp(IT, OpIdx: 5, OpVal: MCOperand::createReg(Reg: 0)); |
308 | |
309 | // Output reg candidates are selected by the caller. |
310 | auto PossibleDestRegsNow = PossibleDestRegs; |
311 | RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow); |
312 | assert(PossibleDestRegsNow.set_bits().begin() != |
313 | PossibleDestRegsNow.set_bits().end() && |
314 | "no remaining registers" ); |
315 | setMemOp( |
316 | IT, OpIdx: 0, |
317 | OpVal: MCOperand::createReg(Reg: *PossibleDestRegsNow.set_bits().begin())); |
318 | |
319 | CodeTemplate CT; |
320 | CT.Instructions.push_back(x: std::move(IT)); |
321 | CT.Config = formatv(Fmt: "{3}(%{0}, %{1}, {2})" , Vals: RegInfo.getName(RegNo: BaseReg), |
322 | Vals: RegInfo.getName(RegNo: IndexReg), Vals: Scale, Vals: Disp) |
323 | .str(); |
324 | Result.push_back(x: std::move(CT)); |
325 | if (Result.size() >= Opts.MaxConfigsPerOpcode) |
326 | return std::move(Result); |
327 | } |
328 | } |
329 | } |
330 | } |
331 | |
332 | return std::move(Result); |
333 | } |
334 | |
335 | namespace { |
336 | class X86SerialSnippetGenerator : public SerialSnippetGenerator { |
337 | public: |
338 | using SerialSnippetGenerator::SerialSnippetGenerator; |
339 | |
340 | Expected<std::vector<CodeTemplate>> |
341 | generateCodeTemplates(InstructionTemplate Variant, |
342 | const BitVector &ForbiddenRegisters) const override; |
343 | }; |
344 | } // namespace |
345 | |
346 | Expected<std::vector<CodeTemplate>> |
347 | X86SerialSnippetGenerator::generateCodeTemplates( |
348 | InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { |
349 | const Instruction &Instr = Variant.getInstr(); |
350 | |
351 | if (const auto reason = isInvalidOpcode(Instr)) |
352 | return make_error<Failure>(Args: reason); |
353 | |
354 | // LEA gets special attention. |
355 | const auto Opcode = Instr.Description.getOpcode(); |
356 | if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) { |
357 | return generateLEATemplatesCommon( |
358 | Instr, ForbiddenRegisters, State, Opts, |
359 | RestrictDestRegs: [this](unsigned BaseReg, unsigned IndexReg, |
360 | BitVector &CandidateDestRegs) { |
361 | // We just select a destination register that aliases the base |
362 | // register. |
363 | CandidateDestRegs &= |
364 | State.getRATC().getRegister(Reg: BaseReg).aliasedBits(); |
365 | }); |
366 | } |
367 | |
368 | if (Instr.hasMemoryOperands()) |
369 | return make_error<Failure>( |
370 | Args: "unsupported memory operand in latency measurements" ); |
371 | |
372 | switch (getX86FPFlags(Instr)) { |
373 | case X86II::NotFP: |
374 | return SerialSnippetGenerator::generateCodeTemplates(Variant, |
375 | ForbiddenRegisters); |
376 | case X86II::ZeroArgFP: |
377 | case X86II::OneArgFP: |
378 | case X86II::SpecialFP: |
379 | case X86II::CompareFP: |
380 | case X86II::CondMovFP: |
381 | return make_error<Failure>(Args: "Unsupported x87 Instruction" ); |
382 | case X86II::OneArgFPRW: |
383 | case X86II::TwoArgFP: |
384 | // These are instructions like |
385 | // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) |
386 | // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) |
387 | // They are intrinsically serial and do not modify the state of the stack. |
388 | return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters); |
389 | default: |
390 | llvm_unreachable("Unknown FP Type!" ); |
391 | } |
392 | } |
393 | |
394 | namespace { |
395 | class X86ParallelSnippetGenerator : public ParallelSnippetGenerator { |
396 | public: |
397 | using ParallelSnippetGenerator::ParallelSnippetGenerator; |
398 | |
399 | Expected<std::vector<CodeTemplate>> |
400 | generateCodeTemplates(InstructionTemplate Variant, |
401 | const BitVector &ForbiddenRegisters) const override; |
402 | }; |
403 | |
404 | } // namespace |
405 | |
406 | Expected<std::vector<CodeTemplate>> |
407 | X86ParallelSnippetGenerator::generateCodeTemplates( |
408 | InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { |
409 | const Instruction &Instr = Variant.getInstr(); |
410 | |
411 | if (const auto reason = isInvalidOpcode(Instr)) |
412 | return make_error<Failure>(Args: reason); |
413 | |
414 | // LEA gets special attention. |
415 | const auto Opcode = Instr.Description.getOpcode(); |
416 | if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) { |
417 | return generateLEATemplatesCommon( |
418 | Instr, ForbiddenRegisters, State, Opts, |
419 | RestrictDestRegs: [this](unsigned BaseReg, unsigned IndexReg, |
420 | BitVector &CandidateDestRegs) { |
421 | // Any destination register that is not used for addressing is fine. |
422 | remove(A&: CandidateDestRegs, |
423 | B: State.getRATC().getRegister(Reg: BaseReg).aliasedBits()); |
424 | remove(A&: CandidateDestRegs, |
425 | B: State.getRATC().getRegister(Reg: IndexReg).aliasedBits()); |
426 | }); |
427 | } |
428 | |
429 | switch (getX86FPFlags(Instr)) { |
430 | case X86II::NotFP: |
431 | return ParallelSnippetGenerator::generateCodeTemplates(Variant, |
432 | ForbiddenRegisters); |
433 | case X86II::ZeroArgFP: |
434 | case X86II::OneArgFP: |
435 | case X86II::SpecialFP: |
436 | return make_error<Failure>(Args: "Unsupported x87 Instruction" ); |
437 | case X86II::OneArgFPRW: |
438 | case X86II::TwoArgFP: |
439 | // These are instructions like |
440 | // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) |
441 | // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) |
442 | // They are intrinsically serial and do not modify the state of the stack. |
443 | // We generate the same code for latency and uops. |
444 | return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters); |
445 | case X86II::CompareFP: |
446 | case X86II::CondMovFP: |
  // We can compute uops for any FP instruction that does not grow or shrink
  // the stack (i.e. it either does not touch the stack or pushes as much as it
  // pops).
449 | return generateUnconstrainedCodeTemplates( |
450 | Variant, Msg: "instruction does not grow/shrink the FP stack" ); |
451 | default: |
452 | llvm_unreachable("Unknown FP Type!" ); |
453 | } |
454 | } |
455 | |
456 | static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) { |
457 | switch (RegBitWidth) { |
458 | case 8: |
459 | return X86::MOV8ri; |
460 | case 16: |
461 | return X86::MOV16ri; |
462 | case 32: |
463 | return X86::MOV32ri; |
464 | case 64: |
465 | return X86::MOV64ri; |
466 | } |
467 | llvm_unreachable("Invalid Value Width" ); |
468 | } |
469 | |
// Generates an instruction to load an immediate value into a register.
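// For example, loadImmediate(X86::EAX, 32, APInt(32, 42)) builds
// 'MOV32ri EAX, 42', i.e. 'movl $42, %eax'.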
471 | static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth, |
472 | const APInt &Value) { |
473 | if (Value.getBitWidth() > RegBitWidth) |
474 | llvm_unreachable("Value must fit in the Register" ); |
475 | return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth)) |
476 | .addReg(Reg) |
477 | .addImm(Val: Value.getZExtValue()); |
478 | } |
479 | |
480 | // Allocates scratch memory on the stack. |
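// For example, allocateStackSpace(16) builds 'subq $16, %rsp' (SUB64ri8).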
481 | static MCInst allocateStackSpace(unsigned Bytes) { |
482 | return MCInstBuilder(X86::SUB64ri8) |
483 | .addReg(Reg: X86::RSP) |
484 | .addReg(Reg: X86::RSP) |
485 | .addImm(Val: Bytes); |
486 | } |
487 | |
488 | // Fills scratch memory at offset `OffsetBytes` with value `Imm`. |
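// For example, fillStackSpace(X86::MOV32mi, 4, 0xff) builds
// 'movl $0xff, 4(%rsp)'.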
489 | static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes, |
490 | uint64_t Imm) { |
491 | return MCInstBuilder(MovOpcode) |
      // Address = RSP
493 | .addReg(Reg: X86::RSP) // BaseReg |
494 | .addImm(Val: 1) // ScaleAmt |
495 | .addReg(Reg: 0) // IndexReg |
496 | .addImm(Val: OffsetBytes) // Disp |
497 | .addReg(Reg: 0) // Segment |
498 | // Immediate. |
499 | .addImm(Val: Imm); |
500 | } |
501 | |
502 | // Loads scratch memory into register `Reg` using opcode `RMOpcode`. |
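// For example, loadToReg(X86::EAX, X86::MOV32rm) builds 'movl (%rsp), %eax'.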
503 | static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) { |
504 | return MCInstBuilder(RMOpcode) |
505 | .addReg(Reg) |
      // Address = RSP
507 | .addReg(Reg: X86::RSP) // BaseReg |
508 | .addImm(Val: 1) // ScaleAmt |
509 | .addReg(Reg: 0) // IndexReg |
510 | .addImm(Val: 0) // Disp |
511 | .addReg(Reg: 0); // Segment |
512 | } |
513 | |
514 | // Releases scratch memory. |
515 | static MCInst releaseStackSpace(unsigned Bytes) { |
516 | return MCInstBuilder(X86::ADD64ri8) |
517 | .addReg(Reg: X86::RSP) |
518 | .addReg(Reg: X86::RSP) |
519 | .addImm(Val: Bytes); |
520 | } |
521 | |
// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
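// Typical use, as in setRegTo() below, e.g. to materialize a 128-bit constant
// into an XMM register when AVX is available:
//   ConstantInliner CI(Value);
//   std::vector<MCInst> Code = CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
// This reserves stack space, stores the constant there, loads it into Reg, and
// releases the stack space again.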
524 | namespace { |
525 | struct ConstantInliner { |
526 | explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {} |
527 | |
528 | std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth, |
529 | unsigned Opcode); |
530 | |
531 | std::vector<MCInst> loadX87STAndFinalize(unsigned Reg); |
532 | |
533 | std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg); |
534 | |
535 | std::vector<MCInst> popFlagAndFinalize(); |
536 | |
537 | std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode, |
538 | unsigned Value); |
539 | |
540 | private: |
541 | ConstantInliner &add(const MCInst &Inst) { |
542 | Instructions.push_back(x: Inst); |
543 | return *this; |
544 | } |
545 | |
546 | void initStack(unsigned Bytes); |
547 | |
548 | static constexpr const unsigned kF80Bytes = 10; // 80 bits. |
549 | |
550 | APInt Constant_; |
551 | std::vector<MCInst> Instructions; |
552 | }; |
553 | } // namespace |
554 | |
555 | std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg, |
556 | unsigned RegBitWidth, |
557 | unsigned Opcode) { |
558 | assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits" ); |
559 | initStack(Bytes: RegBitWidth / 8); |
560 | add(Inst: loadToReg(Reg, RMOpcode: Opcode)); |
561 | add(Inst: releaseStackSpace(Bytes: RegBitWidth / 8)); |
562 | return std::move(Instructions); |
563 | } |
564 | |
565 | std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) { |
566 | initStack(Bytes: kF80Bytes); |
567 | add(Inst: MCInstBuilder(X86::LD_F80m) |
          // Address = RSP
569 | .addReg(Reg: X86::RSP) // BaseReg |
570 | .addImm(Val: 1) // ScaleAmt |
571 | .addReg(Reg: 0) // IndexReg |
572 | .addImm(Val: 0) // Disp |
573 | .addReg(Reg: 0)); // Segment |
574 | if (Reg != X86::ST0) |
575 | add(Inst: MCInstBuilder(X86::ST_Frr).addReg(Reg)); |
576 | add(Inst: releaseStackSpace(Bytes: kF80Bytes)); |
577 | return std::move(Instructions); |
578 | } |
579 | |
580 | std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) { |
581 | initStack(Bytes: kF80Bytes); |
582 | add(Inst: MCInstBuilder(X86::LD_Fp80m) |
583 | .addReg(Reg) |
          // Address = RSP
585 | .addReg(Reg: X86::RSP) // BaseReg |
586 | .addImm(Val: 1) // ScaleAmt |
587 | .addReg(Reg: 0) // IndexReg |
588 | .addImm(Val: 0) // Disp |
589 | .addReg(Reg: 0)); // Segment |
590 | add(Inst: releaseStackSpace(Bytes: kF80Bytes)); |
591 | return std::move(Instructions); |
592 | } |
593 | |
594 | std::vector<MCInst> ConstantInliner::popFlagAndFinalize() { |
595 | initStack(Bytes: 8); |
596 | add(Inst: MCInstBuilder(X86::POPF64)); |
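  // POPF64 both loads RFLAGS from the stack and pops the 8 bytes reserved by
  // initStack(), so no explicit releaseStackSpace() is needed here.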
597 | return std::move(Instructions); |
598 | } |
599 | |
600 | std::vector<MCInst> |
601 | ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) { |
602 | add(Inst: allocateStackSpace(Bytes: 4)); |
603 | add(Inst: fillStackSpace(MovOpcode: X86::MOV32mi, OffsetBytes: 0, Imm: Value)); // Mask all FP exceptions |
604 | add(Inst: MCInstBuilder(Opcode) |
          // Address = RSP
606 | .addReg(Reg: X86::RSP) // BaseReg |
607 | .addImm(Val: 1) // ScaleAmt |
608 | .addReg(Reg: 0) // IndexReg |
609 | .addImm(Val: 0) // Disp |
610 | .addReg(Reg: 0)); // Segment |
611 | add(Inst: releaseStackSpace(Bytes: 4)); |
612 | return std::move(Instructions); |
613 | } |
614 | |
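// Reserves `Bytes` bytes on the stack and fills them with the constant
// (sign-extended when narrower than the reserved area), using the widest
// stores available: 4-byte chunks first, then a 2-byte and a 1-byte tail. For
// example, the 10-byte x87 constant used by loadX87STAndFinalize() is written
// as two 4-byte stores followed by one 2-byte store.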
615 | void ConstantInliner::initStack(unsigned Bytes) { |
616 | assert(Constant_.getBitWidth() <= Bytes * 8 && |
617 | "Value does not have the correct size" ); |
618 | const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8 |
619 | ? Constant_.sext(width: Bytes * 8) |
620 | : Constant_; |
621 | add(Inst: allocateStackSpace(Bytes)); |
622 | size_t ByteOffset = 0; |
623 | for (; Bytes - ByteOffset >= 4; ByteOffset += 4) |
624 | add(Inst: fillStackSpace( |
625 | MovOpcode: X86::MOV32mi, OffsetBytes: ByteOffset, |
626 | Imm: WideConstant.extractBits(numBits: 32, bitPosition: ByteOffset * 8).getZExtValue())); |
627 | if (Bytes - ByteOffset >= 2) { |
628 | add(Inst: fillStackSpace( |
629 | MovOpcode: X86::MOV16mi, OffsetBytes: ByteOffset, |
630 | Imm: WideConstant.extractBits(numBits: 16, bitPosition: ByteOffset * 8).getZExtValue())); |
631 | ByteOffset += 2; |
632 | } |
633 | if (Bytes - ByteOffset >= 1) |
634 | add(Inst: fillStackSpace( |
635 | MovOpcode: X86::MOV8mi, OffsetBytes: ByteOffset, |
636 | Imm: WideConstant.extractBits(numBits: 8, bitPosition: ByteOffset * 8).getZExtValue())); |
637 | } |
638 | |
639 | #include "X86GenExegesis.inc" |
640 | |
641 | namespace { |
642 | |
643 | class X86SavedState : public ExegesisTarget::SavedState { |
644 | public: |
645 | X86SavedState() { |
646 | #if defined(_MSC_VER) && defined(_M_X64) |
647 | _fxsave64(FPState); |
648 | Eflags = __readeflags(); |
649 | #elif defined(__GNUC__) && defined(__x86_64__) |
650 | __builtin_ia32_fxsave64(FPState); |
651 | Eflags = __builtin_ia32_readeflags_u64(); |
652 | #else |
653 | report_fatal_error("X86 exegesis running on unsupported target" ); |
654 | #endif |
655 | } |
656 | |
657 | ~X86SavedState() { |
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
660 | #if defined(_MSC_VER) && defined(_M_X64) |
661 | _clearfp(); |
662 | _fxrstor64(FPState); |
663 | __writeeflags(Eflags); |
664 | #elif defined(__GNUC__) && defined(__x86_64__) |
665 | asm volatile("fwait" ); |
666 | __builtin_ia32_fxrstor64(FPState); |
667 | __builtin_ia32_writeeflags_u64(Eflags); |
668 | #else |
669 | report_fatal_error("X86 exegesis running on unsupported target" ); |
670 | #endif |
671 | } |
672 | |
673 | private: |
674 | #if defined(__x86_64__) || defined(_M_X64) |
675 | alignas(16) char FPState[512]; |
676 | uint64_t Eflags; |
677 | #endif |
678 | }; |
679 | |
680 | class ExegesisX86Target : public ExegesisTarget { |
681 | public: |
682 | ExegesisX86Target() |
683 | : ExegesisTarget(X86CpuPfmCounters, X86_MC::isOpcodeAvailable) {} |
684 | |
685 | Expected<std::unique_ptr<pfm::CounterGroup>> |
686 | createCounter(StringRef CounterName, const LLVMState &State, |
687 | ArrayRef<const char *> ValidationCounters, |
688 | const pid_t ProcessID) const override { |
689 | // If LbrSamplingPeriod was provided, then ignore the |
690 | // CounterName because we only have one for LBR. |
691 | if (LbrSamplingPeriod > 0) { |
    // LBR can't be used without HAVE_LIBPFM and LIBPFM_HAS_FIELD_CYCLES, or
    // outside of __linux__ (for now).
694 | #if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \ |
695 | defined(__linux__) |
696 | // TODO(boomanaiden154): Add in support for using validation counters when |
697 | // using LBR counters. |
698 | if (ValidationCounters.size() > 0) |
699 | return make_error<StringError>( |
700 | "Using LBR is not currently supported with validation counters" , |
701 | errc::invalid_argument); |
702 | |
703 | return std::make_unique<X86LbrCounter>( |
704 | X86LbrPerfEvent(LbrSamplingPeriod)); |
705 | #else |
706 | return make_error<StringError>( |
707 | Args: "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, " |
708 | "or running on Linux." , |
709 | Args: errc::invalid_argument); |
710 | #endif |
711 | } |
712 | return ExegesisTarget::createCounter(CounterName, State, ValidationCounters, |
713 | ProcessID); |
714 | } |
715 | |
716 | enum ArgumentRegisters { CodeSize = X86::R12, AuxiliaryMemoryFD = X86::R13 }; |
717 | |
718 | private: |
719 | void addTargetSpecificPasses(PassManagerBase &PM) const override; |
720 | |
721 | unsigned getScratchMemoryRegister(const Triple &TT) const override; |
722 | |
723 | unsigned getDefaultLoopCounterRegister(const Triple &) const override; |
724 | |
725 | unsigned getMaxMemoryAccessSize() const override { return 64; } |
726 | |
727 | Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var, |
728 | MCOperand &AssignedValue, |
729 | const BitVector &ForbiddenRegs) const override; |
730 | |
731 | void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, |
732 | unsigned Offset) const override; |
733 | |
734 | void decrementLoopCounterAndJump(MachineBasicBlock &MBB, |
735 | MachineBasicBlock &TargetMBB, |
736 | const MCInstrInfo &MII, |
737 | unsigned LoopRegister) const override; |
738 | |
739 | std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg, |
740 | const APInt &Value) const override; |
741 | |
742 | #ifdef __linux__ |
743 | void generateLowerMunmap(std::vector<MCInst> &GeneratedCode) const override; |
744 | |
745 | void generateUpperMunmap(std::vector<MCInst> &GeneratedCode) const override; |
746 | |
747 | std::vector<MCInst> generateExitSyscall(unsigned ExitCode) const override; |
748 | |
749 | std::vector<MCInst> |
750 | generateMmap(intptr_t Address, size_t Length, |
751 | intptr_t FileDescriptorAddress) const override; |
752 | |
753 | void generateMmapAuxMem(std::vector<MCInst> &GeneratedCode) const override; |
754 | |
755 | void moveArgumentRegisters(std::vector<MCInst> &GeneratedCode) const override; |
756 | |
757 | std::vector<MCInst> generateMemoryInitialSetup() const override; |
758 | |
759 | std::vector<MCInst> setStackRegisterToAuxMem() const override; |
760 | |
761 | intptr_t getAuxiliaryMemoryStartAddress() const override; |
762 | |
763 | std::vector<MCInst> configurePerfCounter(long Request, bool SaveRegisters) const override; |
764 | |
765 | std::vector<unsigned> getArgumentRegisters() const override; |
766 | |
767 | std::vector<unsigned> getRegistersNeedSaving() const override; |
768 | #endif // __linux__ |
769 | |
770 | ArrayRef<unsigned> getUnavailableRegisters() const override { |
771 | if (DisableUpperSSERegisters) |
772 | return ArrayRef(kUnavailableRegistersSSE); |
773 | |
774 | return ArrayRef(kUnavailableRegisters); |
775 | } |
776 | |
777 | bool allowAsBackToBack(const Instruction &Instr) const override { |
778 | const unsigned Opcode = Instr.Description.Opcode; |
779 | return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r && |
780 | Opcode != X86::LEA64_32r && Opcode != X86::LEA16r; |
781 | } |
782 | |
783 | std::vector<InstructionTemplate> |
784 | generateInstructionVariants(const Instruction &Instr, |
785 | unsigned MaxConfigsPerOpcode) const override; |
786 | |
787 | std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator( |
788 | const LLVMState &State, |
789 | const SnippetGenerator::Options &Opts) const override { |
790 | return std::make_unique<X86SerialSnippetGenerator>(args: State, args: Opts); |
791 | } |
792 | |
793 | std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator( |
794 | const LLVMState &State, |
795 | const SnippetGenerator::Options &Opts) const override { |
796 | return std::make_unique<X86ParallelSnippetGenerator>(args: State, args: Opts); |
797 | } |
798 | |
799 | bool matchesArch(Triple::ArchType Arch) const override { |
800 | return Arch == Triple::x86_64 || Arch == Triple::x86; |
801 | } |
802 | |
803 | Error checkFeatureSupport() const override { |
804 | // LBR is the only feature we conditionally support now. |
805 | // So if LBR is not requested, then we should be able to run the benchmarks. |
806 | if (LbrSamplingPeriod == 0) |
807 | return Error::success(); |
808 | |
809 | #if defined(__linux__) && defined(HAVE_LIBPFM) && \ |
810 | defined(LIBPFM_HAS_FIELD_CYCLES) |
811 | // FIXME: Fix this. |
812 | // https://bugs.llvm.org/show_bug.cgi?id=48918 |
  // For now, only do the check if we see an Intel machine because the counter
  // uses some Intel-specific magic and it could get confused and think an AMD
  // machine actually has LBR support.
816 | #if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ |
817 | defined(_M_X64) |
818 | using namespace sys::detail::x86; |
819 | |
820 | if (getVendorSignature() == VendorSignatures::GENUINE_INTEL) |
821 | // If the kernel supports it, the hardware still may not have it. |
822 | return X86LbrCounter::checkLbrSupport(); |
823 | #else |
824 | report_fatal_error("Running X86 exegesis on unsupported target" ); |
825 | #endif |
826 | #endif |
827 | return make_error<StringError>( |
828 | Args: "LBR not supported on this kernel and/or platform" , |
829 | Args: errc::not_supported); |
830 | } |
831 | |
832 | std::unique_ptr<SavedState> withSavedState() const override { |
833 | return std::make_unique<X86SavedState>(); |
834 | } |
835 | |
836 | static const unsigned kUnavailableRegisters[4]; |
  static const unsigned kUnavailableRegistersSSE[12];
838 | }; |
839 | |
840 | // We disable a few registers that cannot be encoded on instructions with a REX |
841 | // prefix. |
842 | const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH, |
843 | X86::CH, X86::DH}; |
844 | |
845 | // Optionally, also disable the upper (x86_64) SSE registers to reduce frontend |
846 | // decoder load. |
const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
848 | X86::AH, X86::BH, X86::CH, X86::DH, X86::XMM8, X86::XMM9, |
849 | X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15}; |
850 | |
// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
854 | constexpr const unsigned kDefaultLoopCounterReg = X86::R8; |
855 | |
856 | } // namespace |
857 | |
858 | void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const { |
859 | // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F. |
860 | PM.add(P: createX86FloatingPointStackifierPass()); |
861 | } |
862 | |
863 | unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const { |
864 | if (!TT.isArch64Bit()) { |
865 | // FIXME: This would require popping from the stack, so we would have to |
866 | // add some additional setup code. |
867 | return 0; |
868 | } |
869 | return TT.isOSWindows() ? X86::RCX : X86::RDI; |
870 | } |
871 | |
872 | unsigned |
873 | ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const { |
874 | if (!TT.isArch64Bit()) { |
875 | return 0; |
876 | } |
877 | return kDefaultLoopCounterReg; |
878 | } |
879 | |
880 | Error ExegesisX86Target::randomizeTargetMCOperand( |
881 | const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue, |
882 | const BitVector &ForbiddenRegs) const { |
883 | const Operand &Op = Instr.getPrimaryOperand(Var); |
884 | switch (Op.getExplicitOperandInfo().OperandType) { |
885 | case X86::OperandType::OPERAND_COND_CODE: |
886 | AssignedValue = |
887 | MCOperand::createImm(Val: randomIndex(Max: X86::CondCode::LAST_VALID_COND)); |
888 | return Error::success(); |
889 | case X86::OperandType::OPERAND_ROUNDING_CONTROL: |
890 | AssignedValue = |
891 | MCOperand::createImm(Val: randomIndex(Max: X86::STATIC_ROUNDING::TO_ZERO)); |
892 | return Error::success(); |
893 | default: |
894 | break; |
895 | } |
896 | return make_error<Failure>( |
897 | Args: Twine("unimplemented operand type " ) |
898 | .concat(Suffix: Twine(Op.getExplicitOperandInfo().OperandType))); |
899 | } |
900 | |
901 | void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, |
902 | unsigned Reg, |
903 | unsigned Offset) const { |
904 | assert(!isInvalidMemoryInstr(IT.getInstr()) && |
905 | "fillMemoryOperands requires a valid memory instruction" ); |
906 | int MemOpIdx = X86II::getMemoryOperandNo(TSFlags: IT.getInstr().Description.TSFlags); |
907 | assert(MemOpIdx >= 0 && "invalid memory operand index" ); |
908 | // getMemoryOperandNo() ignores tied operands, so we have to add them back. |
909 | MemOpIdx += X86II::getOperandBias(Desc: IT.getInstr().Description); |
910 | setMemOp(IT, OpIdx: MemOpIdx + 0, OpVal: MCOperand::createReg(Reg)); // BaseReg |
911 | setMemOp(IT, OpIdx: MemOpIdx + 1, OpVal: MCOperand::createImm(Val: 1)); // ScaleAmt |
912 | setMemOp(IT, OpIdx: MemOpIdx + 2, OpVal: MCOperand::createReg(Reg: 0)); // IndexReg |
913 | setMemOp(IT, OpIdx: MemOpIdx + 3, OpVal: MCOperand::createImm(Val: Offset)); // Disp |
914 | setMemOp(IT, OpIdx: MemOpIdx + 4, OpVal: MCOperand::createReg(Reg: 0)); // Segment |
915 | } |
916 | |
917 | void ExegesisX86Target::decrementLoopCounterAndJump( |
918 | MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, |
919 | const MCInstrInfo &MII, unsigned LoopRegister) const { |
920 | BuildMI(BB: &MBB, MIMD: DebugLoc(), MCID: MII.get(Opcode: X86::ADD64ri8)) |
921 | .addDef(RegNo: LoopRegister) |
922 | .addUse(RegNo: LoopRegister) |
923 | .addImm(Val: -1); |
924 | BuildMI(BB: &MBB, MIMD: DebugLoc(), MCID: MII.get(Opcode: X86::JCC_1)) |
925 | .addMBB(MBB: &TargetMBB) |
926 | .addImm(Val: X86::COND_NE); |
927 | } |
928 | |
929 | void generateRegisterStackPush(unsigned int Register, |
930 | std::vector<MCInst> &GeneratedCode) { |
931 | GeneratedCode.push_back(x: MCInstBuilder(X86::PUSH64r).addReg(Reg: Register)); |
932 | } |
933 | |
934 | void generateRegisterStackPop(unsigned int Register, |
935 | std::vector<MCInst> &GeneratedCode) { |
936 | GeneratedCode.push_back(x: MCInstBuilder(X86::POP64r).addReg(Reg: Register)); |
937 | } |
938 | |
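// Emits 'mov $SyscallNumber, %rax; syscall'. On x86-64 Linux the SYSCALL
// instruction clobbers RCX and R11, which is why callers save those registers
// around this sequence.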
939 | void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) { |
940 | GeneratedCode.push_back( |
941 | x: loadImmediate(Reg: X86::RAX, RegBitWidth: 64, Value: APInt(64, SyscallNumber))); |
942 | GeneratedCode.push_back(x: MCInstBuilder(X86::SYSCALL)); |
943 | } |
944 | |
945 | // The functions below for saving and restoring system call registers are only |
946 | // used when llvm-exegesis is built on Linux. |
947 | #ifdef __linux__ |
948 | constexpr std::array<unsigned, 6> SyscallArgumentRegisters{ |
949 | X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9}; |
950 | |
951 | static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode, |
952 | unsigned ArgumentCount) { |
  assert(ArgumentCount <= 6 &&
         "System calls on X86-64 Linux can only take six arguments");
955 | // Preserve RCX and R11 (Clobbered by the system call). |
956 | generateRegisterStackPush(Register: X86::RCX, GeneratedCode); |
957 | generateRegisterStackPush(Register: X86::R11, GeneratedCode); |
958 | // Preserve RAX (used for the syscall number/return value). |
959 | generateRegisterStackPush(Register: X86::RAX, GeneratedCode); |
960 | // Preserve the registers used to pass arguments to the system call. |
961 | for (unsigned I = 0; I < ArgumentCount; ++I) |
962 | generateRegisterStackPush(Register: SyscallArgumentRegisters[I], GeneratedCode); |
963 | } |
964 | |
965 | static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode, |
966 | unsigned ArgumentCount) { |
  assert(ArgumentCount <= 6 &&
         "System calls on X86-64 Linux can only take six arguments");
969 | // Restore the argument registers, in the opposite order of the way they are |
970 | // saved. |
971 | for (unsigned I = ArgumentCount; I > 0; --I) { |
972 | generateRegisterStackPop(Register: SyscallArgumentRegisters[I - 1], GeneratedCode); |
973 | } |
974 | generateRegisterStackPop(Register: X86::RAX, GeneratedCode); |
975 | generateRegisterStackPop(Register: X86::R11, GeneratedCode); |
976 | generateRegisterStackPop(Register: X86::RCX, GeneratedCode); |
977 | } |
978 | #endif // __linux__ |
979 | |
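// Sets the FS or GS segment base by issuing the arch_prctl system call with
// ARCH_SET_FS / ARCH_SET_GS, preserving the syscall-clobbered and argument
// registers around it.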
980 | static std::vector<MCInst> loadImmediateSegmentRegister(unsigned Reg, |
981 | const APInt &Value) { |
982 | #if defined(__x86_64__) && defined(__linux__) |
983 | assert(Value.getBitWidth() <= 64 && "Value must fit in the register." ); |
984 | std::vector<MCInst> loadSegmentRegisterCode; |
  // Preserve the syscall registers here, as we don't want to make any
  // assumptions about the order in which registers are set up: registers that
  // we are about to clobber here may already hold their final values.
989 | saveSyscallRegisters(GeneratedCode&: loadSegmentRegisterCode, ArgumentCount: 2); |
990 | // Generate the instructions to make the arch_prctl system call to set |
991 | // the registers. |
992 | int SyscallCode = 0; |
993 | if (Reg == X86::FS) |
994 | SyscallCode = ARCH_SET_FS; |
995 | else if (Reg == X86::GS) |
996 | SyscallCode = ARCH_SET_GS; |
997 | else |
998 | llvm_unreachable("Only the segment registers GS and FS are supported" ); |
999 | loadSegmentRegisterCode.push_back( |
1000 | x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, SyscallCode))); |
1001 | loadSegmentRegisterCode.push_back(x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value)); |
1002 | generateSyscall(SYS_arch_prctl, GeneratedCode&: loadSegmentRegisterCode); |
1003 | // Restore the registers in reverse order |
1004 | restoreSyscallRegisters(GeneratedCode&: loadSegmentRegisterCode, ArgumentCount: 2); |
1005 | return loadSegmentRegisterCode; |
1006 | #else |
1007 | llvm_unreachable("Loading immediate segment registers is only supported with " |
1008 | "x86-64 llvm-exegesis" ); |
1009 | #endif // defined(__x86_64__) && defined(__linux__) |
1010 | } |
1011 | |
1012 | std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI, |
1013 | unsigned Reg, |
1014 | const APInt &Value) const { |
1015 | if (X86::SEGMENT_REGRegClass.contains(Reg)) |
1016 | return loadImmediateSegmentRegister(Reg, Value); |
1017 | if (X86::GR8RegClass.contains(Reg)) |
1018 | return {loadImmediate(Reg, RegBitWidth: 8, Value)}; |
1019 | if (X86::GR16RegClass.contains(Reg)) |
1020 | return {loadImmediate(Reg, RegBitWidth: 16, Value)}; |
1021 | if (X86::GR32RegClass.contains(Reg)) |
1022 | return {loadImmediate(Reg, RegBitWidth: 32, Value)}; |
1023 | if (X86::GR64RegClass.contains(Reg)) |
1024 | return {loadImmediate(Reg, RegBitWidth: 64, Value)}; |
1025 | if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || |
1026 | X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) { |
1027 | switch (Value.getBitWidth()) { |
1028 | case 8: |
1029 | if (STI.getFeatureBits()[X86::FeatureDQI]) { |
1030 | ConstantInliner CI(Value); |
1031 | return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVBkm); |
1032 | } |
1033 | [[fallthrough]]; |
1034 | case 16: |
1035 | if (STI.getFeatureBits()[X86::FeatureAVX512]) { |
1036 | ConstantInliner CI(Value.zextOrTrunc(width: 16)); |
1037 | return CI.loadAndFinalize(Reg, RegBitWidth: 16, Opcode: X86::KMOVWkm); |
1038 | } |
1039 | break; |
1040 | case 32: |
1041 | if (STI.getFeatureBits()[X86::FeatureBWI]) { |
1042 | ConstantInliner CI(Value); |
1043 | return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVDkm); |
1044 | } |
1045 | break; |
1046 | case 64: |
1047 | if (STI.getFeatureBits()[X86::FeatureBWI]) { |
1048 | ConstantInliner CI(Value); |
1049 | return CI.loadAndFinalize(Reg, RegBitWidth: Value.getBitWidth(), Opcode: X86::KMOVQkm); |
1050 | } |
1051 | break; |
1052 | } |
1053 | } |
1054 | ConstantInliner CI(Value); |
1055 | if (X86::VR64RegClass.contains(Reg)) |
1056 | return CI.loadAndFinalize(Reg, RegBitWidth: 64, Opcode: X86::MMX_MOVQ64rm); |
1057 | if (X86::VR128XRegClass.contains(Reg)) { |
1058 | if (STI.getFeatureBits()[X86::FeatureAVX512]) |
1059 | return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::VMOVDQU32Z128rm); |
1060 | if (STI.getFeatureBits()[X86::FeatureAVX]) |
1061 | return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::VMOVDQUrm); |
1062 | return CI.loadAndFinalize(Reg, RegBitWidth: 128, Opcode: X86::MOVDQUrm); |
1063 | } |
1064 | if (X86::VR256XRegClass.contains(Reg)) { |
1065 | if (STI.getFeatureBits()[X86::FeatureAVX512]) |
1066 | return CI.loadAndFinalize(Reg, RegBitWidth: 256, Opcode: X86::VMOVDQU32Z256rm); |
1067 | if (STI.getFeatureBits()[X86::FeatureAVX]) |
1068 | return CI.loadAndFinalize(Reg, RegBitWidth: 256, Opcode: X86::VMOVDQUYrm); |
1069 | } |
1070 | if (X86::VR512RegClass.contains(Reg)) |
1071 | if (STI.getFeatureBits()[X86::FeatureAVX512]) |
1072 | return CI.loadAndFinalize(Reg, RegBitWidth: 512, Opcode: X86::VMOVDQU32Zrm); |
1073 | if (X86::RSTRegClass.contains(Reg)) { |
1074 | return CI.loadX87STAndFinalize(Reg); |
1075 | } |
1076 | if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) || |
1077 | X86::RFP80RegClass.contains(Reg)) { |
1078 | return CI.loadX87FPAndFinalize(Reg); |
1079 | } |
1080 | if (Reg == X86::EFLAGS) |
1081 | return CI.popFlagAndFinalize(); |
1082 | if (Reg == X86::MXCSR) |
1083 | return CI.loadImplicitRegAndFinalize( |
1084 | Opcode: STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR, |
1085 | Value: 0x1f80); |
1086 | if (Reg == X86::FPCW) |
1087 | return CI.loadImplicitRegAndFinalize(Opcode: X86::FLDCW16m, Value: 0x37f); |
1088 | return {}; // Not yet implemented. |
1089 | } |
1090 | |
1091 | #ifdef __linux__ |
1092 | |
1093 | #ifdef __arm__ |
1094 | static constexpr const intptr_t VAddressSpaceCeiling = 0xC0000000; |
1095 | #else |
1096 | static constexpr const intptr_t VAddressSpaceCeiling = 0x0000800000000000; |
1097 | #endif |
1098 | |
1099 | void generateRoundToNearestPage(unsigned int Register, |
1100 | std::vector<MCInst> &GeneratedCode) { |
1101 | int PageSizeShift = static_cast<int>(round(x: log2(x: getpagesize()))); |
  // Round down to the nearest page boundary by clearing the least significant
  // bits that encode the offset within the page: shift the address right to
  // drop them, then shift it back left.
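  // For example, with 4 KiB pages (PageSizeShift == 12), an address such as
  // 0x00007f0012345abc becomes 0x00007f0012345000.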
1105 | GeneratedCode.push_back(x: MCInstBuilder(X86::SHR64ri) |
1106 | .addReg(Reg: Register) |
1107 | .addReg(Reg: Register) |
1108 | .addImm(Val: PageSizeShift)); |
1109 | GeneratedCode.push_back(x: MCInstBuilder(X86::SHL64ri) |
1110 | .addReg(Reg: Register) |
1111 | .addReg(Reg: Register) |
1112 | .addImm(Val: PageSizeShift)); |
1113 | } |
1114 | |
1115 | void generateGetInstructionPointer(unsigned int ResultRegister, |
1116 | std::vector<MCInst> &GeneratedCode) { |
1117 | // Use a load effective address to get the current instruction pointer and put |
1118 | // it into the result register. |
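  // The emitted instruction has the form 'leaq 0(%rip), %<ResultRegister>';
  // since RIP-relative addressing uses the address of the next instruction,
  // the value obtained points just past the LEA itself.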
1119 | GeneratedCode.push_back(x: MCInstBuilder(X86::LEA64r) |
1120 | .addReg(Reg: ResultRegister) |
1121 | .addReg(Reg: X86::RIP) |
1122 | .addImm(Val: 1) |
1123 | .addReg(Reg: 0) |
1124 | .addImm(Val: 0) |
1125 | .addReg(Reg: 0)); |
1126 | } |
1127 | |
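// Taken together, the code below performs roughly
// munmap(0, round_down_to_page(rip) - pagesize): everything below the
// currently executing code, minus one page of slack, is unmapped.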
1128 | void ExegesisX86Target::generateLowerMunmap( |
1129 | std::vector<MCInst> &GeneratedCode) const { |
1130 | // Unmap starting at address zero |
1131 | GeneratedCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, 0))); |
1132 | // Get the current instruction pointer so we know where to unmap up to. |
1133 | generateGetInstructionPointer(ResultRegister: X86::RSI, GeneratedCode); |
1134 | generateRoundToNearestPage(Register: X86::RSI, GeneratedCode); |
1135 | // Subtract a page from the end of the unmap so we don't unmap the currently |
1136 | // executing section. |
1137 | GeneratedCode.push_back(x: MCInstBuilder(X86::SUB64ri32) |
1138 | .addReg(Reg: X86::RSI) |
1139 | .addReg(Reg: X86::RSI) |
1140 | .addImm(Val: getpagesize())); |
1141 | generateSyscall(SYS_munmap, GeneratedCode); |
1142 | } |
1143 | |
1144 | void ExegesisX86Target::generateUpperMunmap( |
1145 | std::vector<MCInst> &GeneratedCode) const { |
1146 | generateGetInstructionPointer(ResultRegister: X86::R8, GeneratedCode); |
  // Load the size of the snippet into RDI from the argument register.
1148 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1149 | .addReg(Reg: X86::RDI) |
1150 | .addReg(Reg: ArgumentRegisters::CodeSize)); |
  // Add the length of the snippet (in %RDI) to the current instruction pointer
  // (%R8) to get the address where we should start unmapping.
1153 | GeneratedCode.push_back(x: MCInstBuilder(X86::ADD64rr) |
1154 | .addReg(Reg: X86::RDI) |
1155 | .addReg(Reg: X86::RDI) |
1156 | .addReg(Reg: X86::R8)); |
1157 | generateRoundToNearestPage(Register: X86::RDI, GeneratedCode); |
  // Add one page to the start address to ensure that we're above the snippet,
  // since the above function rounds down.
1160 | GeneratedCode.push_back(x: MCInstBuilder(X86::ADD64ri32) |
1161 | .addReg(Reg: X86::RDI) |
1162 | .addReg(Reg: X86::RDI) |
1163 | .addImm(Val: getpagesize())); |
1164 | // Unmap to just one page under the ceiling of the address space. |
1165 | GeneratedCode.push_back(x: loadImmediate( |
1166 | Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, VAddressSpaceCeiling - getpagesize()))); |
1167 | GeneratedCode.push_back(x: MCInstBuilder(X86::SUB64rr) |
1168 | .addReg(Reg: X86::RSI) |
1169 | .addReg(Reg: X86::RSI) |
1170 | .addReg(Reg: X86::RDI)); |
1171 | generateSyscall(SYS_munmap, GeneratedCode); |
1172 | } |
1173 | |
1174 | std::vector<MCInst> |
1175 | ExegesisX86Target::generateExitSyscall(unsigned ExitCode) const { |
1176 | std::vector<MCInst> ExitCallCode; |
1177 | ExitCallCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, ExitCode))); |
1178 | generateSyscall(SYS_exit, GeneratedCode&: ExitCallCode); |
1179 | return ExitCallCode; |
1180 | } |
1181 | |
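// Roughly equivalent to the C call
//   mmap((void *)Address, Length, PROT_READ | PROT_WRITE,
//        MAP_SHARED | MAP_FIXED_NOREPLACE, *(int *)FileDescriptorAddress, 0);
// where the file descriptor is read out of the auxiliary memory region.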
1182 | std::vector<MCInst> |
1183 | ExegesisX86Target::generateMmap(intptr_t Address, size_t Length, |
1184 | intptr_t FileDescriptorAddress) const { |
1185 | std::vector<MCInst> MmapCode; |
1186 | MmapCode.push_back(x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, Address))); |
1187 | MmapCode.push_back(x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, Length))); |
1188 | MmapCode.push_back( |
1189 | x: loadImmediate(Reg: X86::RDX, RegBitWidth: 64, Value: APInt(64, PROT_READ | PROT_WRITE))); |
1190 | MmapCode.push_back( |
1191 | x: loadImmediate(Reg: X86::R10, RegBitWidth: 64, Value: APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE))); |
1192 | // Copy file descriptor location from aux memory into R8 |
1193 | MmapCode.push_back( |
1194 | x: loadImmediate(Reg: X86::R8, RegBitWidth: 64, Value: APInt(64, FileDescriptorAddress))); |
1195 | // Dereference file descriptor into FD argument register |
1196 | MmapCode.push_back(x: MCInstBuilder(X86::MOV32rm) |
1197 | .addReg(Reg: X86::R8D) |
1198 | .addReg(Reg: X86::R8) |
1199 | .addImm(Val: 1) |
1200 | .addReg(Reg: 0) |
1201 | .addImm(Val: 0) |
1202 | .addReg(Reg: 0)); |
1203 | MmapCode.push_back(x: loadImmediate(Reg: X86::R9, RegBitWidth: 64, Value: APInt(64, 0))); |
1204 | generateSyscall(SYS_mmap, GeneratedCode&: MmapCode); |
1205 | return MmapCode; |
1206 | } |
1207 | |
1208 | void ExegesisX86Target::generateMmapAuxMem( |
1209 | std::vector<MCInst> &GeneratedCode) const { |
1210 | GeneratedCode.push_back( |
1211 | x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, getAuxiliaryMemoryStartAddress()))); |
1212 | GeneratedCode.push_back(x: loadImmediate( |
1213 | Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, SubprocessMemory::AuxiliaryMemorySize))); |
1214 | GeneratedCode.push_back( |
1215 | x: loadImmediate(Reg: X86::RDX, RegBitWidth: 64, Value: APInt(64, PROT_READ | PROT_WRITE))); |
1216 | GeneratedCode.push_back( |
1217 | x: loadImmediate(Reg: X86::R10, RegBitWidth: 64, Value: APInt(64, MAP_SHARED | MAP_FIXED_NOREPLACE))); |
1218 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1219 | .addReg(Reg: X86::R8) |
1220 | .addReg(Reg: ArgumentRegisters::AuxiliaryMemoryFD)); |
1221 | GeneratedCode.push_back(x: loadImmediate(Reg: X86::R9, RegBitWidth: 64, Value: APInt(64, 0))); |
1222 | generateSyscall(SYS_mmap, GeneratedCode); |
1223 | } |
1224 | |
1225 | void ExegesisX86Target::moveArgumentRegisters( |
1226 | std::vector<MCInst> &GeneratedCode) const { |
1227 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1228 | .addReg(Reg: ArgumentRegisters::CodeSize) |
1229 | .addReg(Reg: X86::RDI)); |
1230 | GeneratedCode.push_back(x: MCInstBuilder(X86::MOV64rr) |
1231 | .addReg(Reg: ArgumentRegisters::AuxiliaryMemoryFD) |
1232 | .addReg(Reg: X86::RSI)); |
1233 | } |
1234 | |
1235 | std::vector<MCInst> ExegesisX86Target::generateMemoryInitialSetup() const { |
1236 | std::vector<MCInst> MemoryInitialSetupCode; |
1237 | moveArgumentRegisters(GeneratedCode&: MemoryInitialSetupCode); |
1238 | generateLowerMunmap(GeneratedCode&: MemoryInitialSetupCode); |
1239 | generateUpperMunmap(GeneratedCode&: MemoryInitialSetupCode); |
1240 | generateMmapAuxMem(GeneratedCode&: MemoryInitialSetupCode); |
1241 | return MemoryInitialSetupCode; |
1242 | } |
1243 | |
1244 | std::vector<MCInst> ExegesisX86Target::setStackRegisterToAuxMem() const { |
1245 | // Moves %rsp to the end of the auxiliary memory |
1246 | return {MCInstBuilder(X86::MOV64ri) |
1247 | .addReg(Reg: X86::RSP) |
1248 | .addImm(Val: getAuxiliaryMemoryStartAddress() + |
1249 | SubprocessMemory::AuxiliaryMemorySize)}; |
1250 | } |
1251 | |
1252 | intptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const { |
  // Return the second-to-last page in the virtual address space to try to
  // prevent interference with memory annotations in the snippet.
1255 | return VAddressSpaceCeiling - 2 * getpagesize(); |
1256 | } |
1257 | |
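// Emits an ioctl on the perf counter file descriptor stored at the start of
// the auxiliary memory, roughly:
//   ioctl(*(int *)getAuxiliaryMemoryStartAddress(), Request,
//         PERF_IOC_FLAG_GROUP);
// (the PERF_IOC_FLAG_GROUP argument is only set up when built with libpfm).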
1258 | std::vector<MCInst> |
1259 | ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const { |
1260 | std::vector<MCInst> ConfigurePerfCounterCode; |
1261 | if (SaveRegisters) |
1262 | saveSyscallRegisters(GeneratedCode&: ConfigurePerfCounterCode, ArgumentCount: 3); |
1263 | ConfigurePerfCounterCode.push_back( |
1264 | x: loadImmediate(Reg: X86::RDI, RegBitWidth: 64, Value: APInt(64, getAuxiliaryMemoryStartAddress()))); |
1265 | ConfigurePerfCounterCode.push_back(x: MCInstBuilder(X86::MOV32rm) |
1266 | .addReg(Reg: X86::EDI) |
1267 | .addReg(Reg: X86::RDI) |
1268 | .addImm(Val: 1) |
1269 | .addReg(Reg: 0) |
1270 | .addImm(Val: 0) |
1271 | .addReg(Reg: 0)); |
1272 | ConfigurePerfCounterCode.push_back( |
1273 | x: loadImmediate(Reg: X86::RSI, RegBitWidth: 64, Value: APInt(64, Request))); |
1274 | #ifdef HAVE_LIBPFM |
1275 | ConfigurePerfCounterCode.push_back( |
1276 | loadImmediate(X86::RDX, 64, APInt(64, PERF_IOC_FLAG_GROUP))); |
1277 | #endif // HAVE_LIBPFM |
1278 | generateSyscall(SYS_ioctl, GeneratedCode&: ConfigurePerfCounterCode); |
1279 | if (SaveRegisters) |
1280 | restoreSyscallRegisters(GeneratedCode&: ConfigurePerfCounterCode, ArgumentCount: 3); |
1281 | return ConfigurePerfCounterCode; |
1282 | } |
1283 | |
1284 | std::vector<unsigned> ExegesisX86Target::getArgumentRegisters() const { |
1285 | return {X86::RDI, X86::RSI}; |
1286 | } |
1287 | |
1288 | std::vector<unsigned> ExegesisX86Target::getRegistersNeedSaving() const { |
1289 | return {X86::RAX, X86::RDI, X86::RSI, X86::RCX, X86::R11}; |
1290 | } |
1291 | |
1292 | #endif // __linux__ |
1293 | |
// Instructions can have some variable operands, and we may want to see how
// different operand values affect performance. So for each variable operand
// position, precompute all the possible choices we might care about, and
// greedily generate all the possible combinations of choices.
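// For example, for a CMOVcc opcode the condition-code operand is expanded into
// one InstructionTemplate per valid X86::CondCode (up to MaxConfigsPerOpcode).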
1298 | std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants( |
1299 | const Instruction &Instr, unsigned MaxConfigsPerOpcode) const { |
1300 | bool Exploration = false; |
1301 | SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices; |
1302 | VariableChoices.resize(N: Instr.Variables.size()); |
1303 | for (auto I : zip(t: Instr.Variables, u&: VariableChoices)) { |
1304 | const Variable &Var = std::get<0>(t&: I); |
1305 | SmallVectorImpl<MCOperand> &Choices = std::get<1>(t&: I); |
1306 | |
1307 | switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) { |
1308 | default: |
1309 | // We don't wish to explicitly explore this variable. |
1310 | Choices.emplace_back(); // But add invalid MCOperand to simplify logic. |
1311 | continue; |
1312 | case X86::OperandType::OPERAND_COND_CODE: { |
1313 | Exploration = true; |
1314 | auto CondCodes = enum_seq_inclusive(Begin: X86::CondCode::COND_O, |
1315 | End: X86::CondCode::LAST_VALID_COND, |
1316 | force_iteration_on_noniterable_enum); |
1317 | Choices.reserve(N: CondCodes.size()); |
1318 | for (int CondCode : CondCodes) |
1319 | Choices.emplace_back(Args: MCOperand::createImm(Val: CondCode)); |
1320 | break; |
1321 | } |
1322 | } |
1323 | } |
1324 | |
1325 | // If we don't wish to explore any variables, defer to the baseline method. |
1326 | if (!Exploration) |
1327 | return ExegesisTarget::generateInstructionVariants(Instr, |
1328 | MaxConfigsPerOpcode); |
1329 | |
1330 | std::vector<InstructionTemplate> Variants; |
1331 | size_t NumVariants; |
1332 | CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G( |
1333 | VariableChoices); |
1334 | |
1335 | // How many operand combinations can we produce, within the limit? |
1336 | NumVariants = std::min(a: G.numCombinations(), b: (size_t)MaxConfigsPerOpcode); |
1337 | // And actually produce all the wanted operand combinations. |
1338 | Variants.reserve(n: NumVariants); |
1339 | G.generate(Callback: [&](ArrayRef<MCOperand> State) -> bool { |
1340 | Variants.emplace_back(args: &Instr); |
1341 | Variants.back().setVariableValues(State); |
1342 | // Did we run out of space for variants? |
1343 | return Variants.size() >= NumVariants; |
1344 | }); |
1345 | |
1346 | assert(Variants.size() == NumVariants && |
1347 | Variants.size() <= MaxConfigsPerOpcode && |
1348 | "Should not produce too many variants" ); |
1349 | return Variants; |
1350 | } |
1351 | |
1352 | static ExegesisTarget *getTheExegesisX86Target() { |
1353 | static ExegesisX86Target Target; |
1354 | return &Target; |
1355 | } |
1356 | |
1357 | void InitializeX86ExegesisTarget() { |
1358 | ExegesisTarget::registerTarget(T: getTheExegesisX86Target()); |
1359 | } |
1360 | |
1361 | } // namespace exegesis |
1362 | } // namespace llvm |
1363 | |