1//===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "BenchmarkRunner.h"
10#include "Assembler.h"
11#include "DisassemblerHelper.h"
12#include "Error.h"
13#include "MCInstrDescView.h"
14#include "MmapUtils.h"
15#include "PerfHelper.h"
16#include "SubprocessMemory.h"
17#include "Target.h"
18#include "llvm/ADT/ScopeExit.h"
19#include "llvm/ADT/StringExtras.h"
20#include "llvm/ADT/StringRef.h"
21#include "llvm/ADT/Twine.h"
22#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
23#include "llvm/Support/CrashRecoveryContext.h"
24#include "llvm/Support/Debug.h"
25#include "llvm/Support/Error.h"
26#include "llvm/Support/FileSystem.h"
27#include "llvm/Support/MemoryBuffer.h"
28#include "llvm/Support/Program.h"
29#include "llvm/Support/Signals.h"
30#include <cmath>
31#include <memory>
32#include <string>
33
34#ifdef __linux__
35#ifdef HAVE_LIBPFM
36#include <perfmon/perf_event.h>
37#endif
38#include <sys/mman.h>
39#include <sys/ptrace.h>
40#include <sys/resource.h>
41#include <sys/socket.h>
42#include <sys/syscall.h>
43#include <sys/wait.h>
44#include <unistd.h>
45
46#if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && defined(HAVE_BUILTIN_THREAD_POINTER)
47#include <sys/rseq.h>
48#if defined(RSEQ_SIG) && defined(SYS_rseq)
49#define GLIBC_INITS_RSEQ
50#endif
51#endif
52#endif // __linux__
53
54namespace llvm {
55namespace exegesis {
56
// Constructs a runner for the given LLVMState. Mode, BenchmarkPhaseSelector,
// ExecutionMode and ValCounters are stored in the corresponding members, and
// a fresh ScratchSpace owned by the runner is allocated.
BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode,
 BenchmarkPhaseSelectorE BenchmarkPhaseSelector,
 ExecutionModeE ExecutionMode,
 ArrayRef<ValidationEvent> ValCounters)
 : State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector),
 ExecutionMode(ExecutionMode), ValidationCounters(ValCounters),
 Scratch(std::make_unique<ScratchSpace>()) {}
64
// Defaulted destructor, defined out of line in this translation unit.
BenchmarkRunner::~BenchmarkRunner() = default;
66
67void BenchmarkRunner::FunctionExecutor::accumulateCounterValues(
68 const SmallVectorImpl<int64_t> &NewValues,
69 SmallVectorImpl<int64_t> *Result) {
70 const size_t NumValues = std::max(a: NewValues.size(), b: Result->size());
71 if (NumValues > Result->size())
72 Result->resize(N: NumValues, NV: 0);
73 for (size_t I = 0, End = NewValues.size(); I < End; ++I)
74 (*Result)[I] += NewValues[I];
75}
76
77Expected<SmallVector<int64_t, 4>>
78BenchmarkRunner::FunctionExecutor::runAndSample(
79 const char *Counters, ArrayRef<const char *> ValidationCounters,
80 SmallVectorImpl<int64_t> &ValidationCounterValues) const {
81 // We sum counts when there are several counters for a single ProcRes
82 // (e.g. P23 on SandyBridge).
83 SmallVector<int64_t, 4> CounterValues;
84 SmallVector<StringRef, 2> CounterNames;
85 StringRef(Counters).split(A&: CounterNames, Separator: '+');
86 for (auto &CounterName : CounterNames) {
87 CounterName = CounterName.trim();
88 Expected<SmallVector<int64_t, 4>> ValueOrError = runWithCounter(
89 CounterName, ValidationCounters, ValidationCounterValues);
90 if (!ValueOrError)
91 return ValueOrError.takeError();
92 accumulateCounterValues(NewValues: ValueOrError.get(), Result: &CounterValues);
93 }
94 return CounterValues;
95}
96
97namespace {
98class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
99public:
100 static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
101 create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
102 BenchmarkRunner::ScratchSpace *Scratch,
103 std::optional<int> BenchmarkProcessCPU) {
104 Expected<ExecutableFunction> EF =
105 ExecutableFunction::create(TM: State.createTargetMachine(), ObjectFileHolder: std::move(Obj));
106
107 if (!EF)
108 return EF.takeError();
109
110 return std::unique_ptr<InProcessFunctionExecutorImpl>(
111 new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
112 }
113
114private:
115 InProcessFunctionExecutorImpl(const LLVMState &State,
116 ExecutableFunction Function,
117 BenchmarkRunner::ScratchSpace *Scratch)
118 : State(State), Function(std::move(Function)), Scratch(Scratch) {}
119
120 static void accumulateCounterValues(const SmallVector<int64_t, 4> &NewValues,
121 SmallVector<int64_t, 4> *Result) {
122 const size_t NumValues = std::max(a: NewValues.size(), b: Result->size());
123 if (NumValues > Result->size())
124 Result->resize(N: NumValues, NV: 0);
125 for (size_t I = 0, End = NewValues.size(); I < End; ++I)
126 (*Result)[I] += NewValues[I];
127 }
128
129 Expected<SmallVector<int64_t, 4>> runWithCounter(
130 StringRef CounterName, ArrayRef<const char *> ValidationCounters,
131 SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
132 const ExegesisTarget &ET = State.getExegesisTarget();
133 char *const ScratchPtr = Scratch->ptr();
134 auto CounterOrError =
135 ET.createCounter(CounterName, State, ValidationCounters);
136
137 if (!CounterOrError)
138 return CounterOrError.takeError();
139
140 pfm::CounterGroup *Counter = CounterOrError.get().get();
141 Scratch->clear();
142 {
143 auto PS = ET.withSavedState();
144 CrashRecoveryContext CRC;
145 CrashRecoveryContext::Enable();
146 const bool Crashed = !CRC.RunSafely(Fn: [this, Counter, ScratchPtr]() {
147 Counter->start();
148 this->Function(ScratchPtr);
149 Counter->stop();
150 });
151 CrashRecoveryContext::Disable();
152 PS.reset();
153 if (Crashed) {
154#ifdef LLVM_ON_UNIX
155 // See "Exit Status for Commands":
156 // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html
157 constexpr int kSigOffset = 128;
158 return make_error<SnippetSignal>(Args: CRC.RetCode - kSigOffset);
159#else
160 // The exit code of the process on windows is not meaningful as a
161 // signal, so simply pass in -1 as the signal into the error.
162 return make_error<SnippetSignal>(-1);
163#endif // LLVM_ON_UNIX
164 }
165 }
166
167 auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
168 if (!ValidationValuesOrErr)
169 return ValidationValuesOrErr.takeError();
170
171 ArrayRef RealValidationValues = *ValidationValuesOrErr;
172 for (size_t I = 0; I < RealValidationValues.size(); ++I)
173 ValidationCounterValues[I] = RealValidationValues[I];
174
175 return Counter->readOrError(FunctionBytes: Function.getFunctionBytes());
176 }
177
178 const LLVMState &State;
179 const ExecutableFunction Function;
180 BenchmarkRunner::ScratchSpace *const Scratch;
181};
182
183#ifdef __linux__
184// The following class implements a function executor that executes the
185// benchmark code within a subprocess rather than within the main llvm-exegesis
186// process. This allows for much more control over the execution context of the
187// snippet, particularly with regard to memory. This class performs all the
188// necessary functions to create the subprocess, execute the snippet in the
189// subprocess, and report results/handle errors.
190class SubProcessFunctionExecutorImpl
191 : public BenchmarkRunner::FunctionExecutor {
192public:
193 static Expected<std::unique_ptr<SubProcessFunctionExecutorImpl>>
194 create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
195 const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) {
196 Expected<ExecutableFunction> EF =
197 ExecutableFunction::create(TM: State.createTargetMachine(), ObjectFileHolder: std::move(Obj));
198 if (!EF)
199 return EF.takeError();
200
201 return std::unique_ptr<SubProcessFunctionExecutorImpl>(
202 new SubProcessFunctionExecutorImpl(State, std::move(*EF), Key,
203 BenchmarkProcessCPU));
204 }
205
206private:
 // Private constructor; instances are built through create(). State and Key
 // are kept as references (not copied), Function is moved into the executor,
 // and BenchmarkCPU is stored as the optional CPU to pin the child to.
 SubProcessFunctionExecutorImpl(const LLVMState &State,
 ExecutableFunction Function,
 const BenchmarkKey &Key,
 std::optional<int> BenchmarkCPU)
 : State(State), Function(std::move(Function)), Key(Key),
 BenchmarkProcessCPU(BenchmarkCPU) {}
213
 // Exit codes used by the benchmarking child process to tell the parent which
 // setup step failed. Values start at 1; the child exits with 0 on success.
 // childProcessExitCodeToString() maps each code to a message.
 enum ChildProcessExitCodeE {
 CounterFDReadFailed = 1, // Receiving the counter file descriptor failed.
 RSeqDisableFailed, // Unregistering the rseq area failed.
 FunctionDataMappingFailed, // Mapping memory for the snippet failed.
 AuxiliaryMemorySetupFailed, // Setting up auxiliary memory failed.
 SetCPUAffinityFailed // Pinning the child to the requested CPU failed.
 };
221
222 StringRef childProcessExitCodeToString(int ExitCode) const {
223 switch (ExitCode) {
224 case ChildProcessExitCodeE::CounterFDReadFailed:
225 return "Counter file descriptor read failed";
226 case ChildProcessExitCodeE::RSeqDisableFailed:
227 return "Disabling restartable sequences failed";
228 case ChildProcessExitCodeE::FunctionDataMappingFailed:
229 return "Failed to map memory for assembled snippet";
230 case ChildProcessExitCodeE::AuxiliaryMemorySetupFailed:
231 return "Failed to setup auxiliary memory";
232 case ChildProcessExitCodeE::SetCPUAffinityFailed:
233 return "Failed to set CPU affinity of the benchmarking process";
234 default:
235 return "Child process returned with unknown exit code";
236 }
237 }
238
239 Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const {
240 struct msghdr Message = {};
241 char Buffer[CMSG_SPACE(sizeof(FD))];
242 memset(s: Buffer, c: 0, n: sizeof(Buffer));
243 Message.msg_control = Buffer;
244 Message.msg_controllen = sizeof(Buffer);
245
246 struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
247 ControlMessage->cmsg_level = SOL_SOCKET;
248 ControlMessage->cmsg_type = SCM_RIGHTS;
249 ControlMessage->cmsg_len = CMSG_LEN(sizeof(FD));
250
251 memcpy(CMSG_DATA(ControlMessage), src: &FD, n: sizeof(FD));
252
253 Message.msg_controllen = CMSG_SPACE(sizeof(FD));
254
255 ssize_t BytesWritten = sendmsg(fd: SocketFD, message: &Message, flags: 0);
256
257 if (BytesWritten < 0)
258 return make_error<Failure>(Args: "Failed to write FD to socket: " +
259 Twine(strerror(errno)));
260
261 return Error::success();
262 }
263
264 Expected<int> getFileDescriptorFromSocket(int SocketFD) const {
265 struct msghdr Message = {};
266
267 char ControlBuffer[256];
268 Message.msg_control = ControlBuffer;
269 Message.msg_controllen = sizeof(ControlBuffer);
270
271 ssize_t BytesRead = recvmsg(fd: SocketFD, message: &Message, flags: 0);
272
273 if (BytesRead < 0)
274 return make_error<Failure>(Args: "Failed to read FD from socket: " +
275 Twine(strerror(errno)));
276
277 struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
278
279 int FD;
280
281 if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD)))
282 return make_error<Failure>(Args: "Failed to get correct number of bytes for "
283 "file descriptor from socket.");
284
285 memcpy(dest: &FD, CMSG_DATA(ControlMessage), n: sizeof(FD));
286
287 return FD;
288 }
289
290 Error
291 runParentProcess(pid_t ChildPID, int WriteFD, StringRef CounterName,
292 SmallVectorImpl<int64_t> &CounterValues,
293 ArrayRef<const char *> ValidationCounters,
294 SmallVectorImpl<int64_t> &ValidationCounterValues) const {
295 scope_exit WriteFDClose([WriteFD]() { close(fd: WriteFD); });
296 const ExegesisTarget &ET = State.getExegesisTarget();
297 auto CounterOrError =
298 ET.createCounter(CounterName, State, ValidationCounters, ProcessID: ChildPID);
299
300 if (!CounterOrError)
301 return CounterOrError.takeError();
302
303 pfm::CounterGroup *Counter = CounterOrError.get().get();
304
305 // Make sure to attach to the process (and wait for the sigstop to be
306 // delivered and for the process to continue) before we write to the counter
307 // file descriptor. Attaching to the process before writing to the socket
308 // ensures that the subprocess at most has blocked on the read call. If we
309 // attach afterwards, the subprocess might exit before we get to the attach
310 // call due to effects like scheduler contention, introducing transient
311 // failures.
312 if (ptrace(request: PTRACE_ATTACH, ChildPID, NULL, NULL) != 0)
313 return make_error<Failure>(Args: "Failed to attach to the child process: " +
314 Twine(strerror(errno)));
315
316 if (waitpid(pid: ChildPID, NULL, options: 0) == -1) {
317 return make_error<Failure>(
318 Args: "Failed to wait for child process to stop after attaching: " +
319 Twine(strerror(errno)));
320 }
321
322 if (ptrace(request: PTRACE_CONT, ChildPID, NULL, NULL) != 0)
323 return make_error<Failure>(
324 Args: "Failed to continue execution of the child process: " +
325 Twine(strerror(errno)));
326
327 int CounterFileDescriptor = Counter->getFileDescriptor();
328 Error SendError =
329 sendFileDescriptorThroughSocket(SocketFD: WriteFD, FD: CounterFileDescriptor);
330
331 if (SendError)
332 return SendError;
333
334 int ChildStatus;
335 if (waitpid(pid: ChildPID, stat_loc: &ChildStatus, options: 0) == -1) {
336 return make_error<Failure>(
337 Args: "Waiting for the child process to complete failed: " +
338 Twine(strerror(errno)));
339 }
340
341 if (WIFEXITED(ChildStatus)) {
342 int ChildExitCode = WEXITSTATUS(ChildStatus);
343 if (ChildExitCode == 0) {
344 // The child exited succesfully, read counter values and return
345 // success.
346 auto CounterValueOrErr = Counter->readOrError();
347 if (!CounterValueOrErr)
348 return CounterValueOrErr.takeError();
349 CounterValues = std::move(*CounterValueOrErr);
350
351 auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
352 if (!ValidationValuesOrErr)
353 return ValidationValuesOrErr.takeError();
354
355 ArrayRef RealValidationValues = *ValidationValuesOrErr;
356 for (size_t I = 0; I < RealValidationValues.size(); ++I)
357 ValidationCounterValues[I] = RealValidationValues[I];
358
359 return Error::success();
360 }
361 // The child exited, but not successfully.
362 return make_error<Failure>(
363 Args: "Child benchmarking process exited with non-zero exit code: " +
364 childProcessExitCodeToString(ExitCode: ChildExitCode));
365 }
366
367 // An error was encountered running the snippet, process it
368 siginfo_t ChildSignalInfo;
369 if (ptrace(request: PTRACE_GETSIGINFO, ChildPID, NULL, &ChildSignalInfo) == -1) {
370 return make_error<Failure>(Args: "Getting signal info from the child failed: " +
371 Twine(strerror(errno)));
372 }
373
374 // Send SIGKILL rather than SIGTERM as the child process has no SIGTERM
375 // handlers to run, and calling SIGTERM would mean that ptrace will force
376 // it to block in the signal-delivery-stop for the SIGSEGV/other signals,
377 // and upon exit.
378 if (kill(pid: ChildPID, SIGKILL) == -1)
379 return make_error<Failure>(Args: "Failed to kill child benchmarking proces: " +
380 Twine(strerror(errno)));
381
382 // Wait for the process to exit so that there are no zombie processes left
383 // around.
384 if (waitpid(pid: ChildPID, NULL, options: 0) == -1)
385 return make_error<Failure>(Args: "Failed to wait for process to die: " +
386 Twine(strerror(errno)));
387
388 if (ChildSignalInfo.si_signo == SIGSEGV)
389 return make_error<SnippetSegmentationFault>(
390 Args: reinterpret_cast<uintptr_t>(ChildSignalInfo.si_addr));
391
392 return make_error<SnippetSignal>(Args&: ChildSignalInfo.si_signo);
393 }
394
395 static void setCPUAffinityIfRequested(int CPUToUse) {
396// Special case this function for x86_64 for now as certain more esoteric
397// platforms have different definitions for some of the libc functions that
398// cause buildtime failures. Additionally, the subprocess executor mode (the
399// sole mode where this is supported) currently only supports x86_64.
400
401// Also check that we have the SYS_getcpu macro defined, meaning the syscall
402// actually exists within the build environment. We manually use the syscall
403// rather than the libc wrapper given the wrapper for getcpu is only available
404// in glibc 2.29 and later.
405#if defined(__x86_64__) && defined(SYS_getcpu)
406 // Set the CPU affinity for the child process, so that we ensure that if
407 // the user specified a CPU the process should run on, the benchmarking
408 // process is running on that CPU.
409 cpu_set_t CPUMask;
410 CPU_ZERO(&CPUMask);
411 CPU_SET(CPUToUse, &CPUMask);
412 // TODO(boomanaiden154): Rewrite this to use LLVM primitives once they
413 // are available.
414 int SetAffinityReturn = sched_setaffinity(pid: 0, cpusetsize: sizeof(CPUMask), cpuset: &CPUMask);
415 if (SetAffinityReturn == -1) {
416 exit(status: ChildProcessExitCodeE::SetCPUAffinityFailed);
417 }
418
419 // Check (if assertions are enabled) that we are actually running on the
420 // CPU that was specified by the user.
421 [[maybe_unused]] unsigned int CurrentCPU;
422 assert(syscall(SYS_getcpu, &CurrentCPU, nullptr) == 0 &&
423 "Expected getcpu call to succeed.");
424 assert(static_cast<int>(CurrentCPU) == CPUToUse &&
425 "Expected current CPU to equal the CPU requested by the user");
426#else
427 exit(ChildProcessExitCodeE::SetCPUAffinityFailed);
428#endif // defined(__x86_64__) && defined(SYS_getcpu)
429 }
430
431 Error createSubProcessAndRunBenchmark(
432 StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues,
433 ArrayRef<const char *> ValidationCounters,
434 SmallVectorImpl<int64_t> &ValidationCounterValues) const {
435 int PipeFiles[2];
436 int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, protocol: 0, fds: PipeFiles);
437 if (PipeSuccessOrErr != 0) {
438 return make_error<Failure>(
439 Args: "Failed to create a pipe for interprocess communication between "
440 "llvm-exegesis and the benchmarking subprocess: " +
441 Twine(strerror(errno)));
442 }
443
444 SubprocessMemory SPMemory;
445 Error MemoryInitError = SPMemory.initializeSubprocessMemory(ProcessID: getpid());
446 if (MemoryInitError)
447 return MemoryInitError;
448
449 Error AddMemDefError =
450 SPMemory.addMemoryDefinition(MemoryDefinitions: Key.MemoryValues, ProcessID: getpid());
451 if (AddMemDefError)
452 return AddMemDefError;
453
454 long ParentTID = SubprocessMemory::getCurrentTID();
455 pid_t ParentOrChildPID = fork();
456
457 if (ParentOrChildPID == -1) {
458 return make_error<Failure>(Args: "Failed to create child process: " +
459 Twine(strerror(errno)));
460 }
461
462 if (ParentOrChildPID == 0) {
463 if (BenchmarkProcessCPU.has_value()) {
464 setCPUAffinityIfRequested(*BenchmarkProcessCPU);
465 }
466
467 // We are in the child process, close the write end of the pipe.
468 close(fd: PipeFiles[1]);
469 // Unregister handlers, signal handling is now handled through ptrace in
470 // the host process.
471 sys::unregisterHandlers();
472 runChildSubprocess(Pipe: PipeFiles[0], Key, ParentTID);
473 // The child process terminates in the above function, so we should never
474 // get to this point.
475 llvm_unreachable("Child process didn't exit when expected.");
476 }
477
478 // Close the read end of the pipe as we only need to write to the subprocess
479 // from the parent process.
480 close(fd: PipeFiles[0]);
481 return runParentProcess(ChildPID: ParentOrChildPID, WriteFD: PipeFiles[1], CounterName,
482 CounterValues, ValidationCounters,
483 ValidationCounterValues);
484 }
485
486 void disableCoreDumps() const {
487 struct rlimit rlim;
488
489 rlim.rlim_cur = 0;
490 setrlimit(RLIMIT_CORE, rlimits: &rlim);
491 }
492
493 [[noreturn]] void runChildSubprocess(int Pipe, const BenchmarkKey &Key,
494 long ParentTID) const {
495 // Disable core dumps in the child process as otherwise everytime we
496 // encounter an execution failure like a segmentation fault, we will create
497 // a core dump. We report the information directly rather than require the
498 // user inspect a core dump.
499 disableCoreDumps();
500
501 // The following occurs within the benchmarking subprocess.
502 pid_t ParentPID = getppid();
503
504 Expected<int> CounterFileDescriptorOrError =
505 getFileDescriptorFromSocket(SocketFD: Pipe);
506
507 if (!CounterFileDescriptorOrError)
508 exit(status: ChildProcessExitCodeE::CounterFDReadFailed);
509
510 int CounterFileDescriptor = *CounterFileDescriptorOrError;
511
512// Glibc versions greater than 2.35 automatically call rseq during
513// initialization. Unmapping the region that glibc sets up for this causes
514// segfaults in the program. Unregister the rseq region so that we can safely
515// unmap it later
516#ifdef GLIBC_INITS_RSEQ
517 unsigned int RseqStructSize = __rseq_size;
518
519 // Glibc v2.40 (the change is also expected to be backported to v2.35)
520 // changes the definition of __rseq_size to be the usable area of the struct
521 // rather than the actual size of the struct. v2.35 uses only 20 bytes of
522 // the 32 byte struct. For now, it should be safe to assume that if the
523 // usable size is less than 32, the actual size of the struct will be 32
524 // bytes given alignment requirements.
525 if (__rseq_size < 32)
526 RseqStructSize = 32;
527
528 long RseqDisableOutput = syscall(
529 SYS_rseq,
530 reinterpret_cast<uintptr_t>(__builtin_thread_pointer()) + __rseq_offset,
531 RseqStructSize, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
532 if (RseqDisableOutput != 0)
533 exit(status: ChildProcessExitCodeE::RSeqDisableFailed);
534#endif // GLIBC_INITS_RSEQ
535
536 // The frontend that generates the memory annotation structures should
537 // validate that the address to map the snippet in at is a multiple of
538 // the page size. Assert that this is true here.
539 assert(Key.SnippetAddress % getpagesize() == 0 &&
540 "The snippet address needs to be aligned to a page boundary.");
541
542 size_t FunctionDataCopySize = this->Function.FunctionBytes.size();
543 void *MapAddress = NULL;
544 int MapFlags = MAP_PRIVATE | MAP_ANONYMOUS;
545
546 if (Key.SnippetAddress != 0) {
547 MapAddress = reinterpret_cast<void *>(Key.SnippetAddress);
548 MapFlags |= MAP_FIXED_NOREPLACE;
549 }
550
551 char *FunctionDataCopy =
552 (char *)mmap(addr: MapAddress, len: FunctionDataCopySize, PROT_READ | PROT_WRITE,
553 flags: MapFlags, fd: 0, offset: 0);
554 if (reinterpret_cast<intptr_t>(FunctionDataCopy) == -1)
555 exit(status: ChildProcessExitCodeE::FunctionDataMappingFailed);
556
557 memcpy(dest: FunctionDataCopy, src: this->Function.FunctionBytes.data(),
558 n: this->Function.FunctionBytes.size());
559 mprotect(addr: FunctionDataCopy, len: FunctionDataCopySize, PROT_READ | PROT_EXEC);
560
561 Expected<int> AuxMemFDOrError =
562 SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
563 MemoryDefinitions: Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
564 if (!AuxMemFDOrError)
565 exit(status: ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);
566
567 ((void (*)(size_t, int))(uintptr_t)FunctionDataCopy)(FunctionDataCopySize,
568 *AuxMemFDOrError);
569
570 exit(status: 0);
571 }
572
573 Expected<SmallVector<int64_t, 4>> runWithCounter(
574 StringRef CounterName, ArrayRef<const char *> ValidationCounters,
575 SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
576 SmallVector<int64_t, 4> Value(1, 0);
577 Error PossibleBenchmarkError = createSubProcessAndRunBenchmark(
578 CounterName, CounterValues&: Value, ValidationCounters, ValidationCounterValues);
579
580 if (PossibleBenchmarkError)
581 return std::move(PossibleBenchmarkError);
582
583 return Value;
584 }
585
586 const LLVMState &State;
587 const ExecutableFunction Function;
588 const BenchmarkKey &Key;
589 const std::optional<int> BenchmarkProcessCPU;
590};
591#endif // __linux__
592
// Structure to hold instruction information for assembly printing
struct InstructionInfo {
 // Printed form of the instruction, or "<decode error>" when decoding failed.
 std::string Text;
 // Byte offset of the instruction from the start of the function bytes.
 uint64_t Address;
 // Hex dump of the instruction's bytes (empty for a decode error).
 std::string HexBytes;
};
599
600#ifndef NDEBUG
601// Helper function to print generated assembly snippets
602void printInstructions(const std::vector<InstructionInfo> &Instructions,
603 int InitialLinesCount, int LastLinesCount) {
604 int N = Instructions.size();
605 dbgs() << "Generated assembly snippet:\n```\n";
606
607 // Print initial lines
608 for (int i = 0; i < InitialLinesCount; ++i)
609 dbgs() << format_hex_no_prefix(Instructions[i].Address, 0) << ":\t"
610 << Instructions[i].HexBytes << Instructions[i].Text << '\n';
611
612 // Show truncation message if needed
613 int SkippedInstructions = N - InitialLinesCount - LastLinesCount;
614 if (SkippedInstructions > 0)
615 dbgs() << "...\t(" << SkippedInstructions << " more instructions)\n";
616
617 // Print last min(PreviewLast, N - PreviewFirst) lines
618 int LastLinesToPrint = std::min(
619 LastLinesCount, N > InitialLinesCount ? N - InitialLinesCount : 0);
620 for (int i = N - LastLinesToPrint; i < N; ++i)
621 dbgs() << format_hex_no_prefix(Instructions[i].Address, 0) << ":\t"
622 << Instructions[i].HexBytes << Instructions[i].Text << '\n';
623 dbgs() << "```\n";
624}
625#endif // NDEBUG
626
627// Function to extract and print assembly from snippet
628Error printAssembledSnippet(const LLVMState &State,
629 const SmallString<0> &Snippet) {
630 // Extract the actual function bytes from the object file
631 std::vector<uint8_t> FunctionBytes;
632 if (auto Err = getBenchmarkFunctionBytes(InputData: Snippet, Bytes&: FunctionBytes))
633 return make_error<Failure>(Args: "Failed to extract function bytes: " +
634 toString(E: std::move(Err)));
635
636 // Decode all instructions first
637 DisassemblerHelper DisHelper(State);
638 uint64_t Address = 0;
639 std::vector<InstructionInfo> Instructions;
640 const size_t FunctionBytesSize = FunctionBytes.size();
641
642 while (Address < FunctionBytesSize) {
643 MCInst Inst;
644 uint64_t Size;
645 ArrayRef<uint8_t> Bytes(FunctionBytes.data() + Address,
646 FunctionBytesSize - Address);
647
648 if (!DisHelper.decodeInst(MI&: Inst, MISize&: Size, Bytes)) {
649 Instructions.push_back(x: {.Text: "<decode error>", .Address: Address, .HexBytes: ""});
650 break;
651 }
652
653 // Format instruction text
654 std::string InstStr;
655 raw_string_ostream OS(InstStr);
656 DisHelper.printInst(MI: &Inst, OS);
657
658 // Create hex string for this instruction (big-endian order)
659 std::string HexStr;
660 raw_string_ostream HexOS(HexStr);
661 for (int i = Size - 1; i >= 0; --i)
662 HexOS << format_hex_no_prefix(N: Bytes[i], Width: 2);
663
664 Instructions.push_back(x: {.Text: OS.str(), .Address: Address, .HexBytes: HexOS.str()});
665 Address += Size;
666 }
667
668#undef DEBUG_TYPE
669#define DEBUG_TYPE "preview-gen-assembly"
670 LLVM_DEBUG(printInstructions(Instructions, 10, 3));
671#undef DEBUG_TYPE
672#define DEBUG_TYPE "print-gen-assembly"
673 LLVM_DEBUG(printInstructions(Instructions, Instructions.size(), 0));
674#undef DEBUG_TYPE
675 return Error::success();
676}
677} // namespace
678
679Expected<SmallString<0>> BenchmarkRunner::assembleSnippet(
680 const BenchmarkCode &BC, const SnippetRepetitor &Repetitor,
681 unsigned MinInstructions, unsigned LoopBodySize,
682 bool GenerateMemoryInstructions) const {
683 const std::vector<MCInst> &Instructions = BC.Key.Instructions;
684 SmallString<0> Buffer;
685 raw_svector_ostream OS(Buffer);
686 if (Error E = assembleToStream(
687 ET: State.getExegesisTarget(), TM: State.createTargetMachine(), LiveIns: BC.LiveIns,
688 Fill: Repetitor.Repeat(Instructions, MinInstructions, LoopBodySize,
689 CleanupMemory: GenerateMemoryInstructions),
690 AsmStreamm&: OS, Key: BC.Key, GenerateMemoryInstructions)) {
691 return std::move(E);
692 }
693 return Buffer;
694}
695
696Expected<BenchmarkRunner::RunnableConfiguration>
697BenchmarkRunner::getRunnableConfiguration(
698 const BenchmarkCode &BC, unsigned MinInstructions, unsigned LoopBodySize,
699 const SnippetRepetitor &Repetitor) const {
700 RunnableConfiguration RC;
701
702 Benchmark &BenchmarkResult = RC.BenchmarkResult;
703 BenchmarkResult.Mode = Mode;
704 BenchmarkResult.CpuName =
705 std::string(State.getTargetMachine().getTargetCPU());
706 BenchmarkResult.LLVMTriple =
707 State.getTargetMachine().getTargetTriple().normalize();
708 BenchmarkResult.MinInstructions = MinInstructions;
709 BenchmarkResult.Info = BC.Info;
710
711 const std::vector<MCInst> &Instructions = BC.Key.Instructions;
712
713 bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess;
714
715 BenchmarkResult.Key = BC.Key;
716
717 // Assemble at least kMinInstructionsForSnippet instructions by repeating
718 // the snippet for debug/analysis. This is so that the user clearly
719 // understands that the inside instructions are repeated.
720 if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) {
721 const int MinInstructionsForSnippet = 4 * Instructions.size();
722 const int LoopBodySizeForSnippet = 2 * Instructions.size();
723 auto Snippet =
724 assembleSnippet(BC, Repetitor, MinInstructions: MinInstructionsForSnippet,
725 LoopBodySize: LoopBodySizeForSnippet, GenerateMemoryInstructions);
726 if (Error E = Snippet.takeError())
727 return std::move(E);
728
729 if (auto Err = getBenchmarkFunctionBytes(InputData: *Snippet,
730 Bytes&: BenchmarkResult.AssembledSnippet))
731 return std::move(Err);
732 }
733
734 // Assemble enough repetitions of the snippet so we have at least
735 // MinInstructions instructions.
736 if (BenchmarkPhaseSelector >
737 BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
738 auto Snippet =
739 assembleSnippet(BC, Repetitor, MinInstructions: BenchmarkResult.MinInstructions,
740 LoopBodySize, GenerateMemoryInstructions);
741 if (Error E = Snippet.takeError())
742 return std::move(E);
743 RC.ObjectFile = getObjectFromBuffer(Buffer: *Snippet);
744
745 // Print the assembled snippet by disassembling the binary data
746 if (Error E = printAssembledSnippet(State, Snippet: *Snippet))
747 return std::move(E);
748 }
749
750 return std::move(RC);
751}
752
753Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>>
754BenchmarkRunner::createFunctionExecutor(
755 object::OwningBinary<object::ObjectFile> ObjectFile,
756 const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) const {
757 switch (ExecutionMode) {
758 case ExecutionModeE::InProcess: {
759 if (BenchmarkProcessCPU.has_value())
760 return make_error<Failure>(Args: "The inprocess execution mode does not "
761 "support benchmark core pinning.");
762
763 auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create(
764 State, Obj: std::move(ObjectFile), Scratch: Scratch.get(), BenchmarkProcessCPU);
765 if (!InProcessExecutorOrErr)
766 return InProcessExecutorOrErr.takeError();
767
768 return std::move(*InProcessExecutorOrErr);
769 }
770 case ExecutionModeE::SubProcess: {
771#ifdef __linux__
772 auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create(
773 State, Obj: std::move(ObjectFile), Key, BenchmarkProcessCPU);
774 if (!SubProcessExecutorOrErr)
775 return SubProcessExecutorOrErr.takeError();
776
777 return std::move(*SubProcessExecutorOrErr);
778#else
779 return make_error<Failure>(
780 "The subprocess execution mode is only supported on Linux");
781#endif
782 }
783 }
784 llvm_unreachable("ExecutionMode is outside expected range");
785}
786
787std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
788 RunnableConfiguration &&RC, const std::optional<StringRef> &DumpFile,
789 std::optional<int> BenchmarkProcessCPU) const {
790 Benchmark &BenchmarkResult = RC.BenchmarkResult;
791 object::OwningBinary<object::ObjectFile> &ObjectFile = RC.ObjectFile;
792
793 if (DumpFile && BenchmarkPhaseSelector >
794 BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
795 auto ObjectFilePath =
796 writeObjectFile(Buffer: ObjectFile.getBinary()->getData(), FileName: *DumpFile);
797 if (Error E = ObjectFilePath.takeError()) {
798 return {std::move(E), std::move(BenchmarkResult)};
799 }
800 outs() << "Check generated assembly with: /usr/bin/objdump -d "
801 << *ObjectFilePath << "\n";
802 }
803
804 if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
805 BenchmarkResult.Error = "actual measurements skipped.";
806 return {Error::success(), std::move(BenchmarkResult)};
807 }
808
809 Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor =
810 createFunctionExecutor(ObjectFile: std::move(ObjectFile), Key: RC.BenchmarkResult.Key,
811 BenchmarkProcessCPU);
812 if (!Executor)
813 return {Executor.takeError(), std::move(BenchmarkResult)};
814 auto NewMeasurements = runMeasurements(Executor: **Executor);
815
816 if (Error E = NewMeasurements.takeError()) {
817 return {std::move(E), std::move(BenchmarkResult)};
818 }
819 assert(BenchmarkResult.MinInstructions > 0 && "invalid MinInstructions");
820 for (BenchmarkMeasure &BM : *NewMeasurements) {
821 // Scale the measurements by the number of instructions.
822 BM.PerInstructionValue /= BenchmarkResult.MinInstructions;
823 // Scale the measurements by the number of times the entire snippet is
824 // repeated.
825 BM.PerSnippetValue /=
826 std::ceil(x: BenchmarkResult.MinInstructions /
827 static_cast<double>(BenchmarkResult.Key.Instructions.size()));
828 }
829 BenchmarkResult.Measurements = std::move(*NewMeasurements);
830
831 return {Error::success(), std::move(BenchmarkResult)};
832}
833
834Expected<std::string>
835BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const {
836 int ResultFD = 0;
837 SmallString<256> ResultPath = FileName;
838 if (Error E = errorCodeToError(
839 EC: FileName.empty() ? sys::fs::createTemporaryFile(Prefix: "snippet", Suffix: "o",
840 ResultFD, ResultPath)
841 : sys::fs::openFileForReadWrite(
842 Name: FileName, ResultFD, Disp: sys::fs::CD_CreateAlways,
843 Flags: sys::fs::OF_None)))
844 return std::move(E);
845 raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/);
846 OFS.write(Ptr: Buffer.data(), Size: Buffer.size());
847 OFS.flush();
848 return std::string(ResultPath);
849}
850
851static bool EventLessThan(const std::pair<ValidationEvent, const char *> LHS,
852 const ValidationEvent RHS) {
853 return static_cast<int>(LHS.first) < static_cast<int>(RHS);
854}
855
856Error BenchmarkRunner::getValidationCountersToRun(
857 SmallVector<const char *> &ValCountersToRun) const {
858 const PfmCountersInfo &PCI = State.getPfmCounters();
859 ValCountersToRun.reserve(N: ValidationCounters.size());
860
861 ValCountersToRun.reserve(N: ValidationCounters.size());
862 ArrayRef TargetValidationEvents(PCI.ValidationEvents,
863 PCI.NumValidationEvents);
864 for (const ValidationEvent RequestedValEvent : ValidationCounters) {
865 auto ValCounterIt =
866 lower_bound(Range&: TargetValidationEvents, Value: RequestedValEvent, C: EventLessThan);
867 if (ValCounterIt == TargetValidationEvents.end() ||
868 ValCounterIt->first != RequestedValEvent)
869 return make_error<Failure>(Args: "Cannot create validation counter");
870
871 assert(ValCounterIt->first == RequestedValEvent &&
872 "The array of validation events from the target should be sorted");
873 ValCountersToRun.push_back(Elt: ValCounterIt->second);
874 }
875
876 return Error::success();
877}
878
// Defaulted destructor, defined out of line in this translation unit.
BenchmarkRunner::FunctionExecutor::~FunctionExecutor() = default;
880
881} // namespace exegesis
882} // namespace llvm
883