| 1 | //===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "BenchmarkRunner.h" |
| 10 | #include "Assembler.h" |
| 11 | #include "DisassemblerHelper.h" |
| 12 | #include "Error.h" |
| 13 | #include "MCInstrDescView.h" |
| 14 | #include "MmapUtils.h" |
| 15 | #include "PerfHelper.h" |
| 16 | #include "SubprocessMemory.h" |
| 17 | #include "Target.h" |
| 18 | #include "llvm/ADT/ScopeExit.h" |
| 19 | #include "llvm/ADT/StringExtras.h" |
| 20 | #include "llvm/ADT/StringRef.h" |
| 21 | #include "llvm/ADT/Twine.h" |
| 22 | #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX |
| 23 | #include "llvm/Support/CrashRecoveryContext.h" |
| 24 | #include "llvm/Support/Debug.h" |
| 25 | #include "llvm/Support/Error.h" |
| 26 | #include "llvm/Support/FileSystem.h" |
| 27 | #include "llvm/Support/MemoryBuffer.h" |
| 28 | #include "llvm/Support/Program.h" |
| 29 | #include "llvm/Support/Signals.h" |
| 30 | #include <cmath> |
| 31 | #include <memory> |
| 32 | #include <string> |
| 33 | |
| 34 | #ifdef __linux__ |
| 35 | #ifdef HAVE_LIBPFM |
| 36 | #include <perfmon/perf_event.h> |
| 37 | #endif |
| 38 | #include <sys/mman.h> |
| 39 | #include <sys/ptrace.h> |
| 40 | #include <sys/resource.h> |
| 41 | #include <sys/socket.h> |
| 42 | #include <sys/syscall.h> |
| 43 | #include <sys/wait.h> |
| 44 | #include <unistd.h> |
| 45 | |
| 46 | #if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && defined(HAVE_BUILTIN_THREAD_POINTER) |
| 47 | #include <sys/rseq.h> |
| 48 | #if defined(RSEQ_SIG) && defined(SYS_rseq) |
| 49 | #define GLIBC_INITS_RSEQ |
| 50 | #endif |
| 51 | #endif |
| 52 | #endif // __linux__ |
| 53 | |
| 54 | namespace llvm { |
| 55 | namespace exegesis { |
| 56 | |
| 57 | BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode, |
| 58 | BenchmarkPhaseSelectorE BenchmarkPhaseSelector, |
| 59 | ExecutionModeE ExecutionMode, |
| 60 | ArrayRef<ValidationEvent> ValCounters) |
| 61 | : State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector), |
| 62 | ExecutionMode(ExecutionMode), ValidationCounters(ValCounters), |
| 63 | Scratch(std::make_unique<ScratchSpace>()) {} |
| 64 | |
| 65 | BenchmarkRunner::~BenchmarkRunner() = default; |
| 66 | |
| 67 | void BenchmarkRunner::FunctionExecutor::accumulateCounterValues( |
| 68 | const SmallVectorImpl<int64_t> &NewValues, |
| 69 | SmallVectorImpl<int64_t> *Result) { |
| 70 | const size_t NumValues = std::max(a: NewValues.size(), b: Result->size()); |
| 71 | if (NumValues > Result->size()) |
| 72 | Result->resize(N: NumValues, NV: 0); |
| 73 | for (size_t I = 0, End = NewValues.size(); I < End; ++I) |
| 74 | (*Result)[I] += NewValues[I]; |
| 75 | } |
| 76 | |
| 77 | Expected<SmallVector<int64_t, 4>> |
| 78 | BenchmarkRunner::FunctionExecutor::runAndSample( |
| 79 | const char *Counters, ArrayRef<const char *> ValidationCounters, |
| 80 | SmallVectorImpl<int64_t> &ValidationCounterValues) const { |
| 81 | // We sum counts when there are several counters for a single ProcRes |
| 82 | // (e.g. P23 on SandyBridge). |
| 83 | SmallVector<int64_t, 4> CounterValues; |
| 84 | SmallVector<StringRef, 2> CounterNames; |
| 85 | StringRef(Counters).split(A&: CounterNames, Separator: '+'); |
| 86 | for (auto &CounterName : CounterNames) { |
| 87 | CounterName = CounterName.trim(); |
| 88 | Expected<SmallVector<int64_t, 4>> ValueOrError = runWithCounter( |
| 89 | CounterName, ValidationCounters, ValidationCounterValues); |
| 90 | if (!ValueOrError) |
| 91 | return ValueOrError.takeError(); |
| 92 | accumulateCounterValues(NewValues: ValueOrError.get(), Result: &CounterValues); |
| 93 | } |
| 94 | return CounterValues; |
| 95 | } |
| 96 | |
| 97 | namespace { |
| 98 | class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { |
| 99 | public: |
| 100 | static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>> |
| 101 | create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj, |
| 102 | BenchmarkRunner::ScratchSpace *Scratch, |
| 103 | std::optional<int> BenchmarkProcessCPU) { |
| 104 | Expected<ExecutableFunction> EF = |
| 105 | ExecutableFunction::create(TM: State.createTargetMachine(), ObjectFileHolder: std::move(Obj)); |
| 106 | |
| 107 | if (!EF) |
| 108 | return EF.takeError(); |
| 109 | |
| 110 | return std::unique_ptr<InProcessFunctionExecutorImpl>( |
| 111 | new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch)); |
| 112 | } |
| 113 | |
| 114 | private: |
| 115 | InProcessFunctionExecutorImpl(const LLVMState &State, |
| 116 | ExecutableFunction Function, |
| 117 | BenchmarkRunner::ScratchSpace *Scratch) |
| 118 | : State(State), Function(std::move(Function)), Scratch(Scratch) {} |
| 119 | |
| 120 | static void accumulateCounterValues(const SmallVector<int64_t, 4> &NewValues, |
| 121 | SmallVector<int64_t, 4> *Result) { |
| 122 | const size_t NumValues = std::max(a: NewValues.size(), b: Result->size()); |
| 123 | if (NumValues > Result->size()) |
| 124 | Result->resize(N: NumValues, NV: 0); |
| 125 | for (size_t I = 0, End = NewValues.size(); I < End; ++I) |
| 126 | (*Result)[I] += NewValues[I]; |
| 127 | } |
| 128 | |
| 129 | Expected<SmallVector<int64_t, 4>> runWithCounter( |
| 130 | StringRef CounterName, ArrayRef<const char *> ValidationCounters, |
| 131 | SmallVectorImpl<int64_t> &ValidationCounterValues) const override { |
| 132 | const ExegesisTarget &ET = State.getExegesisTarget(); |
| 133 | char *const ScratchPtr = Scratch->ptr(); |
| 134 | auto CounterOrError = |
| 135 | ET.createCounter(CounterName, State, ValidationCounters); |
| 136 | |
| 137 | if (!CounterOrError) |
| 138 | return CounterOrError.takeError(); |
| 139 | |
| 140 | pfm::CounterGroup *Counter = CounterOrError.get().get(); |
| 141 | Scratch->clear(); |
| 142 | { |
| 143 | auto PS = ET.withSavedState(); |
| 144 | CrashRecoveryContext CRC; |
| 145 | CrashRecoveryContext::Enable(); |
| 146 | const bool Crashed = !CRC.RunSafely(Fn: [this, Counter, ScratchPtr]() { |
| 147 | Counter->start(); |
| 148 | this->Function(ScratchPtr); |
| 149 | Counter->stop(); |
| 150 | }); |
| 151 | CrashRecoveryContext::Disable(); |
| 152 | PS.reset(); |
| 153 | if (Crashed) { |
| 154 | #ifdef LLVM_ON_UNIX |
| 155 | // See "Exit Status for Commands": |
| 156 | // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html |
| 157 | constexpr int kSigOffset = 128; |
| 158 | return make_error<SnippetSignal>(Args: CRC.RetCode - kSigOffset); |
| 159 | #else |
| 160 | // The exit code of the process on windows is not meaningful as a |
| 161 | // signal, so simply pass in -1 as the signal into the error. |
| 162 | return make_error<SnippetSignal>(-1); |
| 163 | #endif // LLVM_ON_UNIX |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | auto ValidationValuesOrErr = Counter->readValidationCountersOrError(); |
| 168 | if (!ValidationValuesOrErr) |
| 169 | return ValidationValuesOrErr.takeError(); |
| 170 | |
| 171 | ArrayRef RealValidationValues = *ValidationValuesOrErr; |
| 172 | for (size_t I = 0; I < RealValidationValues.size(); ++I) |
| 173 | ValidationCounterValues[I] = RealValidationValues[I]; |
| 174 | |
| 175 | return Counter->readOrError(FunctionBytes: Function.getFunctionBytes()); |
| 176 | } |
| 177 | |
| 178 | const LLVMState &State; |
| 179 | const ExecutableFunction Function; |
| 180 | BenchmarkRunner::ScratchSpace *const Scratch; |
| 181 | }; |
| 182 | |
| 183 | #ifdef __linux__ |
| 184 | // The following class implements a function executor that executes the |
| 185 | // benchmark code within a subprocess rather than within the main llvm-exegesis |
| 186 | // process. This allows for much more control over the execution context of the |
| 187 | // snippet, particularly with regard to memory. This class performs all the |
| 188 | // necessary functions to create the subprocess, execute the snippet in the |
| 189 | // subprocess, and report results/handle errors. |
| 190 | class SubProcessFunctionExecutorImpl |
| 191 | : public BenchmarkRunner::FunctionExecutor { |
| 192 | public: |
| 193 | static Expected<std::unique_ptr<SubProcessFunctionExecutorImpl>> |
| 194 | create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj, |
| 195 | const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) { |
| 196 | Expected<ExecutableFunction> EF = |
| 197 | ExecutableFunction::create(TM: State.createTargetMachine(), ObjectFileHolder: std::move(Obj)); |
| 198 | if (!EF) |
| 199 | return EF.takeError(); |
| 200 | |
| 201 | return std::unique_ptr<SubProcessFunctionExecutorImpl>( |
| 202 | new SubProcessFunctionExecutorImpl(State, std::move(*EF), Key, |
| 203 | BenchmarkProcessCPU)); |
| 204 | } |
| 205 | |
| 206 | private: |
| 207 | SubProcessFunctionExecutorImpl(const LLVMState &State, |
| 208 | ExecutableFunction Function, |
| 209 | const BenchmarkKey &Key, |
| 210 | std::optional<int> BenchmarkCPU) |
| 211 | : State(State), Function(std::move(Function)), Key(Key), |
| 212 | BenchmarkProcessCPU(BenchmarkCPU) {} |
| 213 | |
/// Non-zero exit codes used by the benchmarking child process to tell the
/// parent which setup step failed. Zero is reserved for success.
///
/// This stays an unscoped enum because the enumerators are passed straight to
/// exit() and compared against plain int exit codes.
enum ChildProcessExitCodeE {
  CounterFDReadFailed = 1,
  RSeqDisableFailed = 2,
  FunctionDataMappingFailed = 3,
  AuxiliaryMemorySetupFailed = 4,
  SetCPUAffinityFailed = 5
};
| 221 | |
| 222 | StringRef childProcessExitCodeToString(int ExitCode) const { |
| 223 | switch (ExitCode) { |
| 224 | case ChildProcessExitCodeE::CounterFDReadFailed: |
| 225 | return "Counter file descriptor read failed" ; |
| 226 | case ChildProcessExitCodeE::RSeqDisableFailed: |
| 227 | return "Disabling restartable sequences failed" ; |
| 228 | case ChildProcessExitCodeE::FunctionDataMappingFailed: |
| 229 | return "Failed to map memory for assembled snippet" ; |
| 230 | case ChildProcessExitCodeE::AuxiliaryMemorySetupFailed: |
| 231 | return "Failed to setup auxiliary memory" ; |
| 232 | case ChildProcessExitCodeE::SetCPUAffinityFailed: |
| 233 | return "Failed to set CPU affinity of the benchmarking process" ; |
| 234 | default: |
| 235 | return "Child process returned with unknown exit code" ; |
| 236 | } |
| 237 | } |
| 238 | |
| 239 | Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const { |
| 240 | struct msghdr Message = {}; |
| 241 | char Buffer[CMSG_SPACE(sizeof(FD))]; |
| 242 | memset(s: Buffer, c: 0, n: sizeof(Buffer)); |
| 243 | Message.msg_control = Buffer; |
| 244 | Message.msg_controllen = sizeof(Buffer); |
| 245 | |
| 246 | struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message); |
| 247 | ControlMessage->cmsg_level = SOL_SOCKET; |
| 248 | ControlMessage->cmsg_type = SCM_RIGHTS; |
| 249 | ControlMessage->cmsg_len = CMSG_LEN(sizeof(FD)); |
| 250 | |
| 251 | memcpy(CMSG_DATA(ControlMessage), src: &FD, n: sizeof(FD)); |
| 252 | |
| 253 | Message.msg_controllen = CMSG_SPACE(sizeof(FD)); |
| 254 | |
| 255 | ssize_t BytesWritten = sendmsg(fd: SocketFD, message: &Message, flags: 0); |
| 256 | |
| 257 | if (BytesWritten < 0) |
| 258 | return make_error<Failure>(Args: "Failed to write FD to socket: " + |
| 259 | Twine(strerror(errno))); |
| 260 | |
| 261 | return Error::success(); |
| 262 | } |
| 263 | |
| 264 | Expected<int> getFileDescriptorFromSocket(int SocketFD) const { |
| 265 | struct msghdr Message = {}; |
| 266 | |
| 267 | char ControlBuffer[256]; |
| 268 | Message.msg_control = ControlBuffer; |
| 269 | Message.msg_controllen = sizeof(ControlBuffer); |
| 270 | |
| 271 | ssize_t BytesRead = recvmsg(fd: SocketFD, message: &Message, flags: 0); |
| 272 | |
| 273 | if (BytesRead < 0) |
| 274 | return make_error<Failure>(Args: "Failed to read FD from socket: " + |
| 275 | Twine(strerror(errno))); |
| 276 | |
| 277 | struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message); |
| 278 | |
| 279 | int FD; |
| 280 | |
| 281 | if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD))) |
| 282 | return make_error<Failure>(Args: "Failed to get correct number of bytes for " |
| 283 | "file descriptor from socket." ); |
| 284 | |
| 285 | memcpy(dest: &FD, CMSG_DATA(ControlMessage), n: sizeof(FD)); |
| 286 | |
| 287 | return FD; |
| 288 | } |
| 289 | |
| 290 | Error |
| 291 | runParentProcess(pid_t ChildPID, int WriteFD, StringRef CounterName, |
| 292 | SmallVectorImpl<int64_t> &CounterValues, |
| 293 | ArrayRef<const char *> ValidationCounters, |
| 294 | SmallVectorImpl<int64_t> &ValidationCounterValues) const { |
| 295 | scope_exit WriteFDClose([WriteFD]() { close(fd: WriteFD); }); |
| 296 | const ExegesisTarget &ET = State.getExegesisTarget(); |
| 297 | auto CounterOrError = |
| 298 | ET.createCounter(CounterName, State, ValidationCounters, ProcessID: ChildPID); |
| 299 | |
| 300 | if (!CounterOrError) |
| 301 | return CounterOrError.takeError(); |
| 302 | |
| 303 | pfm::CounterGroup *Counter = CounterOrError.get().get(); |
| 304 | |
| 305 | // Make sure to attach to the process (and wait for the sigstop to be |
| 306 | // delivered and for the process to continue) before we write to the counter |
| 307 | // file descriptor. Attaching to the process before writing to the socket |
| 308 | // ensures that the subprocess at most has blocked on the read call. If we |
| 309 | // attach afterwards, the subprocess might exit before we get to the attach |
| 310 | // call due to effects like scheduler contention, introducing transient |
| 311 | // failures. |
| 312 | if (ptrace(request: PTRACE_ATTACH, ChildPID, NULL, NULL) != 0) |
| 313 | return make_error<Failure>(Args: "Failed to attach to the child process: " + |
| 314 | Twine(strerror(errno))); |
| 315 | |
| 316 | if (waitpid(pid: ChildPID, NULL, options: 0) == -1) { |
| 317 | return make_error<Failure>( |
| 318 | Args: "Failed to wait for child process to stop after attaching: " + |
| 319 | Twine(strerror(errno))); |
| 320 | } |
| 321 | |
| 322 | if (ptrace(request: PTRACE_CONT, ChildPID, NULL, NULL) != 0) |
| 323 | return make_error<Failure>( |
| 324 | Args: "Failed to continue execution of the child process: " + |
| 325 | Twine(strerror(errno))); |
| 326 | |
| 327 | int CounterFileDescriptor = Counter->getFileDescriptor(); |
| 328 | Error SendError = |
| 329 | sendFileDescriptorThroughSocket(SocketFD: WriteFD, FD: CounterFileDescriptor); |
| 330 | |
| 331 | if (SendError) |
| 332 | return SendError; |
| 333 | |
| 334 | int ChildStatus; |
| 335 | if (waitpid(pid: ChildPID, stat_loc: &ChildStatus, options: 0) == -1) { |
| 336 | return make_error<Failure>( |
| 337 | Args: "Waiting for the child process to complete failed: " + |
| 338 | Twine(strerror(errno))); |
| 339 | } |
| 340 | |
| 341 | if (WIFEXITED(ChildStatus)) { |
| 342 | int ChildExitCode = WEXITSTATUS(ChildStatus); |
| 343 | if (ChildExitCode == 0) { |
| 344 | // The child exited succesfully, read counter values and return |
| 345 | // success. |
| 346 | auto CounterValueOrErr = Counter->readOrError(); |
| 347 | if (!CounterValueOrErr) |
| 348 | return CounterValueOrErr.takeError(); |
| 349 | CounterValues = std::move(*CounterValueOrErr); |
| 350 | |
| 351 | auto ValidationValuesOrErr = Counter->readValidationCountersOrError(); |
| 352 | if (!ValidationValuesOrErr) |
| 353 | return ValidationValuesOrErr.takeError(); |
| 354 | |
| 355 | ArrayRef RealValidationValues = *ValidationValuesOrErr; |
| 356 | for (size_t I = 0; I < RealValidationValues.size(); ++I) |
| 357 | ValidationCounterValues[I] = RealValidationValues[I]; |
| 358 | |
| 359 | return Error::success(); |
| 360 | } |
| 361 | // The child exited, but not successfully. |
| 362 | return make_error<Failure>( |
| 363 | Args: "Child benchmarking process exited with non-zero exit code: " + |
| 364 | childProcessExitCodeToString(ExitCode: ChildExitCode)); |
| 365 | } |
| 366 | |
| 367 | // An error was encountered running the snippet, process it |
| 368 | siginfo_t ChildSignalInfo; |
| 369 | if (ptrace(request: PTRACE_GETSIGINFO, ChildPID, NULL, &ChildSignalInfo) == -1) { |
| 370 | return make_error<Failure>(Args: "Getting signal info from the child failed: " + |
| 371 | Twine(strerror(errno))); |
| 372 | } |
| 373 | |
| 374 | // Send SIGKILL rather than SIGTERM as the child process has no SIGTERM |
| 375 | // handlers to run, and calling SIGTERM would mean that ptrace will force |
| 376 | // it to block in the signal-delivery-stop for the SIGSEGV/other signals, |
| 377 | // and upon exit. |
| 378 | if (kill(pid: ChildPID, SIGKILL) == -1) |
| 379 | return make_error<Failure>(Args: "Failed to kill child benchmarking proces: " + |
| 380 | Twine(strerror(errno))); |
| 381 | |
| 382 | // Wait for the process to exit so that there are no zombie processes left |
| 383 | // around. |
| 384 | if (waitpid(pid: ChildPID, NULL, options: 0) == -1) |
| 385 | return make_error<Failure>(Args: "Failed to wait for process to die: " + |
| 386 | Twine(strerror(errno))); |
| 387 | |
| 388 | if (ChildSignalInfo.si_signo == SIGSEGV) |
| 389 | return make_error<SnippetSegmentationFault>( |
| 390 | Args: reinterpret_cast<uintptr_t>(ChildSignalInfo.si_addr)); |
| 391 | |
| 392 | return make_error<SnippetSignal>(Args&: ChildSignalInfo.si_signo); |
| 393 | } |
| 394 | |
| 395 | static void setCPUAffinityIfRequested(int CPUToUse) { |
| 396 | // Special case this function for x86_64 for now as certain more esoteric |
| 397 | // platforms have different definitions for some of the libc functions that |
| 398 | // cause buildtime failures. Additionally, the subprocess executor mode (the |
| 399 | // sole mode where this is supported) currently only supports x86_64. |
| 400 | |
| 401 | // Also check that we have the SYS_getcpu macro defined, meaning the syscall |
| 402 | // actually exists within the build environment. We manually use the syscall |
| 403 | // rather than the libc wrapper given the wrapper for getcpu is only available |
| 404 | // in glibc 2.29 and later. |
| 405 | #if defined(__x86_64__) && defined(SYS_getcpu) |
| 406 | // Set the CPU affinity for the child process, so that we ensure that if |
| 407 | // the user specified a CPU the process should run on, the benchmarking |
| 408 | // process is running on that CPU. |
| 409 | cpu_set_t CPUMask; |
| 410 | CPU_ZERO(&CPUMask); |
| 411 | CPU_SET(CPUToUse, &CPUMask); |
| 412 | // TODO(boomanaiden154): Rewrite this to use LLVM primitives once they |
| 413 | // are available. |
| 414 | int SetAffinityReturn = sched_setaffinity(pid: 0, cpusetsize: sizeof(CPUMask), cpuset: &CPUMask); |
| 415 | if (SetAffinityReturn == -1) { |
| 416 | exit(status: ChildProcessExitCodeE::SetCPUAffinityFailed); |
| 417 | } |
| 418 | |
| 419 | // Check (if assertions are enabled) that we are actually running on the |
| 420 | // CPU that was specified by the user. |
| 421 | [[maybe_unused]] unsigned int CurrentCPU; |
| 422 | assert(syscall(SYS_getcpu, &CurrentCPU, nullptr) == 0 && |
| 423 | "Expected getcpu call to succeed." ); |
| 424 | assert(static_cast<int>(CurrentCPU) == CPUToUse && |
| 425 | "Expected current CPU to equal the CPU requested by the user" ); |
| 426 | #else |
| 427 | exit(ChildProcessExitCodeE::SetCPUAffinityFailed); |
| 428 | #endif // defined(__x86_64__) && defined(SYS_getcpu) |
| 429 | } |
| 430 | |
| 431 | Error createSubProcessAndRunBenchmark( |
| 432 | StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues, |
| 433 | ArrayRef<const char *> ValidationCounters, |
| 434 | SmallVectorImpl<int64_t> &ValidationCounterValues) const { |
| 435 | int PipeFiles[2]; |
| 436 | int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, protocol: 0, fds: PipeFiles); |
| 437 | if (PipeSuccessOrErr != 0) { |
| 438 | return make_error<Failure>( |
| 439 | Args: "Failed to create a pipe for interprocess communication between " |
| 440 | "llvm-exegesis and the benchmarking subprocess: " + |
| 441 | Twine(strerror(errno))); |
| 442 | } |
| 443 | |
| 444 | SubprocessMemory SPMemory; |
| 445 | Error MemoryInitError = SPMemory.initializeSubprocessMemory(ProcessID: getpid()); |
| 446 | if (MemoryInitError) |
| 447 | return MemoryInitError; |
| 448 | |
| 449 | Error AddMemDefError = |
| 450 | SPMemory.addMemoryDefinition(MemoryDefinitions: Key.MemoryValues, ProcessID: getpid()); |
| 451 | if (AddMemDefError) |
| 452 | return AddMemDefError; |
| 453 | |
| 454 | long ParentTID = SubprocessMemory::getCurrentTID(); |
| 455 | pid_t ParentOrChildPID = fork(); |
| 456 | |
| 457 | if (ParentOrChildPID == -1) { |
| 458 | return make_error<Failure>(Args: "Failed to create child process: " + |
| 459 | Twine(strerror(errno))); |
| 460 | } |
| 461 | |
| 462 | if (ParentOrChildPID == 0) { |
| 463 | if (BenchmarkProcessCPU.has_value()) { |
| 464 | setCPUAffinityIfRequested(*BenchmarkProcessCPU); |
| 465 | } |
| 466 | |
| 467 | // We are in the child process, close the write end of the pipe. |
| 468 | close(fd: PipeFiles[1]); |
| 469 | // Unregister handlers, signal handling is now handled through ptrace in |
| 470 | // the host process. |
| 471 | sys::unregisterHandlers(); |
| 472 | runChildSubprocess(Pipe: PipeFiles[0], Key, ParentTID); |
| 473 | // The child process terminates in the above function, so we should never |
| 474 | // get to this point. |
| 475 | llvm_unreachable("Child process didn't exit when expected." ); |
| 476 | } |
| 477 | |
| 478 | // Close the read end of the pipe as we only need to write to the subprocess |
| 479 | // from the parent process. |
| 480 | close(fd: PipeFiles[0]); |
| 481 | return runParentProcess(ChildPID: ParentOrChildPID, WriteFD: PipeFiles[1], CounterName, |
| 482 | CounterValues, ValidationCounters, |
| 483 | ValidationCounterValues); |
| 484 | } |
| 485 | |
| 486 | void disableCoreDumps() const { |
| 487 | struct rlimit rlim; |
| 488 | |
| 489 | rlim.rlim_cur = 0; |
| 490 | setrlimit(RLIMIT_CORE, rlimits: &rlim); |
| 491 | } |
| 492 | |
| 493 | [[noreturn]] void runChildSubprocess(int Pipe, const BenchmarkKey &Key, |
| 494 | long ParentTID) const { |
| 495 | // Disable core dumps in the child process as otherwise everytime we |
| 496 | // encounter an execution failure like a segmentation fault, we will create |
| 497 | // a core dump. We report the information directly rather than require the |
| 498 | // user inspect a core dump. |
| 499 | disableCoreDumps(); |
| 500 | |
| 501 | // The following occurs within the benchmarking subprocess. |
| 502 | pid_t ParentPID = getppid(); |
| 503 | |
| 504 | Expected<int> CounterFileDescriptorOrError = |
| 505 | getFileDescriptorFromSocket(SocketFD: Pipe); |
| 506 | |
| 507 | if (!CounterFileDescriptorOrError) |
| 508 | exit(status: ChildProcessExitCodeE::CounterFDReadFailed); |
| 509 | |
| 510 | int CounterFileDescriptor = *CounterFileDescriptorOrError; |
| 511 | |
| 512 | // Glibc versions greater than 2.35 automatically call rseq during |
| 513 | // initialization. Unmapping the region that glibc sets up for this causes |
| 514 | // segfaults in the program. Unregister the rseq region so that we can safely |
| 515 | // unmap it later |
| 516 | #ifdef GLIBC_INITS_RSEQ |
| 517 | unsigned int RseqStructSize = __rseq_size; |
| 518 | |
| 519 | // Glibc v2.40 (the change is also expected to be backported to v2.35) |
| 520 | // changes the definition of __rseq_size to be the usable area of the struct |
| 521 | // rather than the actual size of the struct. v2.35 uses only 20 bytes of |
| 522 | // the 32 byte struct. For now, it should be safe to assume that if the |
| 523 | // usable size is less than 32, the actual size of the struct will be 32 |
| 524 | // bytes given alignment requirements. |
| 525 | if (__rseq_size < 32) |
| 526 | RseqStructSize = 32; |
| 527 | |
| 528 | long RseqDisableOutput = syscall( |
| 529 | SYS_rseq, |
| 530 | reinterpret_cast<uintptr_t>(__builtin_thread_pointer()) + __rseq_offset, |
| 531 | RseqStructSize, RSEQ_FLAG_UNREGISTER, RSEQ_SIG); |
| 532 | if (RseqDisableOutput != 0) |
| 533 | exit(status: ChildProcessExitCodeE::RSeqDisableFailed); |
| 534 | #endif // GLIBC_INITS_RSEQ |
| 535 | |
| 536 | // The frontend that generates the memory annotation structures should |
| 537 | // validate that the address to map the snippet in at is a multiple of |
| 538 | // the page size. Assert that this is true here. |
| 539 | assert(Key.SnippetAddress % getpagesize() == 0 && |
| 540 | "The snippet address needs to be aligned to a page boundary." ); |
| 541 | |
| 542 | size_t FunctionDataCopySize = this->Function.FunctionBytes.size(); |
| 543 | void *MapAddress = NULL; |
| 544 | int MapFlags = MAP_PRIVATE | MAP_ANONYMOUS; |
| 545 | |
| 546 | if (Key.SnippetAddress != 0) { |
| 547 | MapAddress = reinterpret_cast<void *>(Key.SnippetAddress); |
| 548 | MapFlags |= MAP_FIXED_NOREPLACE; |
| 549 | } |
| 550 | |
| 551 | char *FunctionDataCopy = |
| 552 | (char *)mmap(addr: MapAddress, len: FunctionDataCopySize, PROT_READ | PROT_WRITE, |
| 553 | flags: MapFlags, fd: 0, offset: 0); |
| 554 | if (reinterpret_cast<intptr_t>(FunctionDataCopy) == -1) |
| 555 | exit(status: ChildProcessExitCodeE::FunctionDataMappingFailed); |
| 556 | |
| 557 | memcpy(dest: FunctionDataCopy, src: this->Function.FunctionBytes.data(), |
| 558 | n: this->Function.FunctionBytes.size()); |
| 559 | mprotect(addr: FunctionDataCopy, len: FunctionDataCopySize, PROT_READ | PROT_EXEC); |
| 560 | |
| 561 | Expected<int> AuxMemFDOrError = |
| 562 | SubprocessMemory::setupAuxiliaryMemoryInSubprocess( |
| 563 | MemoryDefinitions: Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor); |
| 564 | if (!AuxMemFDOrError) |
| 565 | exit(status: ChildProcessExitCodeE::AuxiliaryMemorySetupFailed); |
| 566 | |
| 567 | ((void (*)(size_t, int))(uintptr_t)FunctionDataCopy)(FunctionDataCopySize, |
| 568 | *AuxMemFDOrError); |
| 569 | |
| 570 | exit(status: 0); |
| 571 | } |
| 572 | |
| 573 | Expected<SmallVector<int64_t, 4>> runWithCounter( |
| 574 | StringRef CounterName, ArrayRef<const char *> ValidationCounters, |
| 575 | SmallVectorImpl<int64_t> &ValidationCounterValues) const override { |
| 576 | SmallVector<int64_t, 4> Value(1, 0); |
| 577 | Error PossibleBenchmarkError = createSubProcessAndRunBenchmark( |
| 578 | CounterName, CounterValues&: Value, ValidationCounters, ValidationCounterValues); |
| 579 | |
| 580 | if (PossibleBenchmarkError) |
| 581 | return std::move(PossibleBenchmarkError); |
| 582 | |
| 583 | return Value; |
| 584 | } |
| 585 | |
| 586 | const LLVMState &State; |
| 587 | const ExecutableFunction Function; |
| 588 | const BenchmarkKey &Key; |
| 589 | const std::optional<int> BenchmarkProcessCPU; |
| 590 | }; |
| 591 | #endif // __linux__ |
| 592 | |
// One disassembled instruction of a generated snippet, as collected by
// printAssembledSnippet() for debug printing.
struct InstructionInfo {
  // Printed form of the instruction (or a placeholder on decode failure).
  std::string Text;
  // Byte offset of the instruction from the start of the function bytes.
  uint64_t Address;
  // Hex dump of the instruction's encoding; empty on decode failure.
  std::string HexBytes;
};
| 599 | |
| 600 | #ifndef NDEBUG |
| 601 | // Helper function to print generated assembly snippets |
| 602 | void printInstructions(const std::vector<InstructionInfo> &Instructions, |
| 603 | int InitialLinesCount, int LastLinesCount) { |
| 604 | int N = Instructions.size(); |
| 605 | dbgs() << "Generated assembly snippet:\n```\n" ; |
| 606 | |
| 607 | // Print initial lines |
| 608 | for (int i = 0; i < InitialLinesCount; ++i) |
| 609 | dbgs() << format_hex_no_prefix(Instructions[i].Address, 0) << ":\t" |
| 610 | << Instructions[i].HexBytes << Instructions[i].Text << '\n'; |
| 611 | |
| 612 | // Show truncation message if needed |
| 613 | int SkippedInstructions = N - InitialLinesCount - LastLinesCount; |
| 614 | if (SkippedInstructions > 0) |
| 615 | dbgs() << "...\t(" << SkippedInstructions << " more instructions)\n" ; |
| 616 | |
| 617 | // Print last min(PreviewLast, N - PreviewFirst) lines |
| 618 | int LastLinesToPrint = std::min( |
| 619 | LastLinesCount, N > InitialLinesCount ? N - InitialLinesCount : 0); |
| 620 | for (int i = N - LastLinesToPrint; i < N; ++i) |
| 621 | dbgs() << format_hex_no_prefix(Instructions[i].Address, 0) << ":\t" |
| 622 | << Instructions[i].HexBytes << Instructions[i].Text << '\n'; |
| 623 | dbgs() << "```\n" ; |
| 624 | } |
| 625 | #endif // NDEBUG |
| 626 | |
| 627 | // Function to extract and print assembly from snippet |
| 628 | Error printAssembledSnippet(const LLVMState &State, |
| 629 | const SmallString<0> &Snippet) { |
| 630 | // Extract the actual function bytes from the object file |
| 631 | std::vector<uint8_t> FunctionBytes; |
| 632 | if (auto Err = getBenchmarkFunctionBytes(InputData: Snippet, Bytes&: FunctionBytes)) |
| 633 | return make_error<Failure>(Args: "Failed to extract function bytes: " + |
| 634 | toString(E: std::move(Err))); |
| 635 | |
| 636 | // Decode all instructions first |
| 637 | DisassemblerHelper DisHelper(State); |
| 638 | uint64_t Address = 0; |
| 639 | std::vector<InstructionInfo> Instructions; |
| 640 | const size_t FunctionBytesSize = FunctionBytes.size(); |
| 641 | |
| 642 | while (Address < FunctionBytesSize) { |
| 643 | MCInst Inst; |
| 644 | uint64_t Size; |
| 645 | ArrayRef<uint8_t> Bytes(FunctionBytes.data() + Address, |
| 646 | FunctionBytesSize - Address); |
| 647 | |
| 648 | if (!DisHelper.decodeInst(MI&: Inst, MISize&: Size, Bytes)) { |
| 649 | Instructions.push_back(x: {.Text: "<decode error>" , .Address: Address, .HexBytes: "" }); |
| 650 | break; |
| 651 | } |
| 652 | |
| 653 | // Format instruction text |
| 654 | std::string InstStr; |
| 655 | raw_string_ostream OS(InstStr); |
| 656 | DisHelper.printInst(MI: &Inst, OS); |
| 657 | |
| 658 | // Create hex string for this instruction (big-endian order) |
| 659 | std::string HexStr; |
| 660 | raw_string_ostream HexOS(HexStr); |
| 661 | for (int i = Size - 1; i >= 0; --i) |
| 662 | HexOS << format_hex_no_prefix(N: Bytes[i], Width: 2); |
| 663 | |
| 664 | Instructions.push_back(x: {.Text: OS.str(), .Address: Address, .HexBytes: HexOS.str()}); |
| 665 | Address += Size; |
| 666 | } |
| 667 | |
| 668 | #undef DEBUG_TYPE |
| 669 | #define DEBUG_TYPE "preview-gen-assembly" |
| 670 | LLVM_DEBUG(printInstructions(Instructions, 10, 3)); |
| 671 | #undef DEBUG_TYPE |
| 672 | #define DEBUG_TYPE "print-gen-assembly" |
| 673 | LLVM_DEBUG(printInstructions(Instructions, Instructions.size(), 0)); |
| 674 | #undef DEBUG_TYPE |
| 675 | return Error::success(); |
| 676 | } |
| 677 | } // namespace |
| 678 | |
| 679 | Expected<SmallString<0>> BenchmarkRunner::assembleSnippet( |
| 680 | const BenchmarkCode &BC, const SnippetRepetitor &Repetitor, |
| 681 | unsigned MinInstructions, unsigned LoopBodySize, |
| 682 | bool GenerateMemoryInstructions) const { |
| 683 | const std::vector<MCInst> &Instructions = BC.Key.Instructions; |
| 684 | SmallString<0> Buffer; |
| 685 | raw_svector_ostream OS(Buffer); |
| 686 | if (Error E = assembleToStream( |
| 687 | ET: State.getExegesisTarget(), TM: State.createTargetMachine(), LiveIns: BC.LiveIns, |
| 688 | Fill: Repetitor.Repeat(Instructions, MinInstructions, LoopBodySize, |
| 689 | CleanupMemory: GenerateMemoryInstructions), |
| 690 | AsmStreamm&: OS, Key: BC.Key, GenerateMemoryInstructions)) { |
| 691 | return std::move(E); |
| 692 | } |
| 693 | return Buffer; |
| 694 | } |
| 695 | |
| 696 | Expected<BenchmarkRunner::RunnableConfiguration> |
| 697 | BenchmarkRunner::getRunnableConfiguration( |
| 698 | const BenchmarkCode &BC, unsigned MinInstructions, unsigned LoopBodySize, |
| 699 | const SnippetRepetitor &Repetitor) const { |
| 700 | RunnableConfiguration RC; |
| 701 | |
| 702 | Benchmark &BenchmarkResult = RC.BenchmarkResult; |
| 703 | BenchmarkResult.Mode = Mode; |
| 704 | BenchmarkResult.CpuName = |
| 705 | std::string(State.getTargetMachine().getTargetCPU()); |
| 706 | BenchmarkResult.LLVMTriple = |
| 707 | State.getTargetMachine().getTargetTriple().normalize(); |
| 708 | BenchmarkResult.MinInstructions = MinInstructions; |
| 709 | BenchmarkResult.Info = BC.Info; |
| 710 | |
| 711 | const std::vector<MCInst> &Instructions = BC.Key.Instructions; |
| 712 | |
| 713 | bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess; |
| 714 | |
| 715 | BenchmarkResult.Key = BC.Key; |
| 716 | |
| 717 | // Assemble at least kMinInstructionsForSnippet instructions by repeating |
| 718 | // the snippet for debug/analysis. This is so that the user clearly |
| 719 | // understands that the inside instructions are repeated. |
| 720 | if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) { |
| 721 | const int MinInstructionsForSnippet = 4 * Instructions.size(); |
| 722 | const int LoopBodySizeForSnippet = 2 * Instructions.size(); |
| 723 | auto Snippet = |
| 724 | assembleSnippet(BC, Repetitor, MinInstructions: MinInstructionsForSnippet, |
| 725 | LoopBodySize: LoopBodySizeForSnippet, GenerateMemoryInstructions); |
| 726 | if (Error E = Snippet.takeError()) |
| 727 | return std::move(E); |
| 728 | |
| 729 | if (auto Err = getBenchmarkFunctionBytes(InputData: *Snippet, |
| 730 | Bytes&: BenchmarkResult.AssembledSnippet)) |
| 731 | return std::move(Err); |
| 732 | } |
| 733 | |
| 734 | // Assemble enough repetitions of the snippet so we have at least |
| 735 | // MinInstructions instructions. |
| 736 | if (BenchmarkPhaseSelector > |
| 737 | BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) { |
| 738 | auto Snippet = |
| 739 | assembleSnippet(BC, Repetitor, MinInstructions: BenchmarkResult.MinInstructions, |
| 740 | LoopBodySize, GenerateMemoryInstructions); |
| 741 | if (Error E = Snippet.takeError()) |
| 742 | return std::move(E); |
| 743 | RC.ObjectFile = getObjectFromBuffer(Buffer: *Snippet); |
| 744 | |
| 745 | // Print the assembled snippet by disassembling the binary data |
| 746 | if (Error E = printAssembledSnippet(State, Snippet: *Snippet)) |
| 747 | return std::move(E); |
| 748 | } |
| 749 | |
| 750 | return std::move(RC); |
| 751 | } |
| 752 | |
| 753 | Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> |
| 754 | BenchmarkRunner::createFunctionExecutor( |
| 755 | object::OwningBinary<object::ObjectFile> ObjectFile, |
| 756 | const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) const { |
| 757 | switch (ExecutionMode) { |
| 758 | case ExecutionModeE::InProcess: { |
| 759 | if (BenchmarkProcessCPU.has_value()) |
| 760 | return make_error<Failure>(Args: "The inprocess execution mode does not " |
| 761 | "support benchmark core pinning." ); |
| 762 | |
| 763 | auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create( |
| 764 | State, Obj: std::move(ObjectFile), Scratch: Scratch.get(), BenchmarkProcessCPU); |
| 765 | if (!InProcessExecutorOrErr) |
| 766 | return InProcessExecutorOrErr.takeError(); |
| 767 | |
| 768 | return std::move(*InProcessExecutorOrErr); |
| 769 | } |
| 770 | case ExecutionModeE::SubProcess: { |
| 771 | #ifdef __linux__ |
| 772 | auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create( |
| 773 | State, Obj: std::move(ObjectFile), Key, BenchmarkProcessCPU); |
| 774 | if (!SubProcessExecutorOrErr) |
| 775 | return SubProcessExecutorOrErr.takeError(); |
| 776 | |
| 777 | return std::move(*SubProcessExecutorOrErr); |
| 778 | #else |
| 779 | return make_error<Failure>( |
| 780 | "The subprocess execution mode is only supported on Linux" ); |
| 781 | #endif |
| 782 | } |
| 783 | } |
| 784 | llvm_unreachable("ExecutionMode is outside expected range" ); |
| 785 | } |
| 786 | |
| 787 | std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration( |
| 788 | RunnableConfiguration &&RC, const std::optional<StringRef> &DumpFile, |
| 789 | std::optional<int> BenchmarkProcessCPU) const { |
| 790 | Benchmark &BenchmarkResult = RC.BenchmarkResult; |
| 791 | object::OwningBinary<object::ObjectFile> &ObjectFile = RC.ObjectFile; |
| 792 | |
| 793 | if (DumpFile && BenchmarkPhaseSelector > |
| 794 | BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) { |
| 795 | auto ObjectFilePath = |
| 796 | writeObjectFile(Buffer: ObjectFile.getBinary()->getData(), FileName: *DumpFile); |
| 797 | if (Error E = ObjectFilePath.takeError()) { |
| 798 | return {std::move(E), std::move(BenchmarkResult)}; |
| 799 | } |
| 800 | outs() << "Check generated assembly with: /usr/bin/objdump -d " |
| 801 | << *ObjectFilePath << "\n" ; |
| 802 | } |
| 803 | |
| 804 | if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) { |
| 805 | BenchmarkResult.Error = "actual measurements skipped." ; |
| 806 | return {Error::success(), std::move(BenchmarkResult)}; |
| 807 | } |
| 808 | |
| 809 | Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor = |
| 810 | createFunctionExecutor(ObjectFile: std::move(ObjectFile), Key: RC.BenchmarkResult.Key, |
| 811 | BenchmarkProcessCPU); |
| 812 | if (!Executor) |
| 813 | return {Executor.takeError(), std::move(BenchmarkResult)}; |
| 814 | auto NewMeasurements = runMeasurements(Executor: **Executor); |
| 815 | |
| 816 | if (Error E = NewMeasurements.takeError()) { |
| 817 | return {std::move(E), std::move(BenchmarkResult)}; |
| 818 | } |
| 819 | assert(BenchmarkResult.MinInstructions > 0 && "invalid MinInstructions" ); |
| 820 | for (BenchmarkMeasure &BM : *NewMeasurements) { |
| 821 | // Scale the measurements by the number of instructions. |
| 822 | BM.PerInstructionValue /= BenchmarkResult.MinInstructions; |
| 823 | // Scale the measurements by the number of times the entire snippet is |
| 824 | // repeated. |
| 825 | BM.PerSnippetValue /= |
| 826 | std::ceil(x: BenchmarkResult.MinInstructions / |
| 827 | static_cast<double>(BenchmarkResult.Key.Instructions.size())); |
| 828 | } |
| 829 | BenchmarkResult.Measurements = std::move(*NewMeasurements); |
| 830 | |
| 831 | return {Error::success(), std::move(BenchmarkResult)}; |
| 832 | } |
| 833 | |
| 834 | Expected<std::string> |
| 835 | BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const { |
| 836 | int ResultFD = 0; |
| 837 | SmallString<256> ResultPath = FileName; |
| 838 | if (Error E = errorCodeToError( |
| 839 | EC: FileName.empty() ? sys::fs::createTemporaryFile(Prefix: "snippet" , Suffix: "o" , |
| 840 | ResultFD, ResultPath) |
| 841 | : sys::fs::openFileForReadWrite( |
| 842 | Name: FileName, ResultFD, Disp: sys::fs::CD_CreateAlways, |
| 843 | Flags: sys::fs::OF_None))) |
| 844 | return std::move(E); |
| 845 | raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/); |
| 846 | OFS.write(Ptr: Buffer.data(), Size: Buffer.size()); |
| 847 | OFS.flush(); |
| 848 | return std::string(ResultPath); |
| 849 | } |
| 850 | |
| 851 | static bool EventLessThan(const std::pair<ValidationEvent, const char *> LHS, |
| 852 | const ValidationEvent RHS) { |
| 853 | return static_cast<int>(LHS.first) < static_cast<int>(RHS); |
| 854 | } |
| 855 | |
| 856 | Error BenchmarkRunner::getValidationCountersToRun( |
| 857 | SmallVector<const char *> &ValCountersToRun) const { |
| 858 | const PfmCountersInfo &PCI = State.getPfmCounters(); |
| 859 | ValCountersToRun.reserve(N: ValidationCounters.size()); |
| 860 | |
| 861 | ValCountersToRun.reserve(N: ValidationCounters.size()); |
| 862 | ArrayRef TargetValidationEvents(PCI.ValidationEvents, |
| 863 | PCI.NumValidationEvents); |
| 864 | for (const ValidationEvent RequestedValEvent : ValidationCounters) { |
| 865 | auto ValCounterIt = |
| 866 | lower_bound(Range&: TargetValidationEvents, Value: RequestedValEvent, C: EventLessThan); |
| 867 | if (ValCounterIt == TargetValidationEvents.end() || |
| 868 | ValCounterIt->first != RequestedValEvent) |
| 869 | return make_error<Failure>(Args: "Cannot create validation counter" ); |
| 870 | |
| 871 | assert(ValCounterIt->first == RequestedValEvent && |
| 872 | "The array of validation events from the target should be sorted" ); |
| 873 | ValCountersToRun.push_back(Elt: ValCounterIt->second); |
| 874 | } |
| 875 | |
| 876 | return Error::success(); |
| 877 | } |
| 878 | |
// Out-of-line definition of the defaulted FunctionExecutor destructor.
BenchmarkRunner::FunctionExecutor::~FunctionExecutor() = default;
| 880 | |
| 881 | } // namespace exegesis |
| 882 | } // namespace llvm |
| 883 | |