| 1 | //===-- X86Counter.cpp ------------------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "X86Counter.h" |
| 10 | |
| 11 | #if defined(__linux__) && defined(HAVE_LIBPFM) && \ |
| 12 | defined(LIBPFM_HAS_FIELD_CYCLES) |
| 13 | |
| 14 | // FIXME: Use appropriate wrappers for poll.h and mman.h |
| 15 | // to support Windows and remove this linux-only guard. |
| 16 | |
| 17 | #include "llvm/Support/Endian.h" |
| 18 | #include "llvm/Support/Errc.h" |
| 19 | |
| 20 | #include <perfmon/perf_event.h> |
| 21 | #include <perfmon/pfmlib.h> |
| 22 | #include <perfmon/pfmlib_perf_event.h> |
| 23 | |
| 24 | #include <atomic> |
| 25 | #include <chrono> |
| 26 | #include <cstddef> |
| 27 | #include <cstdint> |
| 28 | #include <limits> |
| 29 | #include <memory> |
| 30 | #include <vector> |
| 31 | |
| 32 | #include <poll.h> |
| 33 | #include <sys/mman.h> |
| 34 | #include <unistd.h> |
| 35 | |
| 36 | namespace llvm { |
| 37 | namespace exegesis { |
| 38 | |
// How many entries the LBR holds.
static constexpr int kLbrEntries = 16;
// Size of the perf data area, expressed in pages.
static constexpr size_t kBufferPages = 8;
// Size of the perf data area, expressed in bytes.
static const size_t kDataBufferSize = getpagesize() * kBufferPages;

// The mapping begins with one page reserved for perf_event_mmap_page and the
// data area follows it, so the mapping is one page larger than the data area.
static const size_t kMappedBufferSize = kDataBufferSize + getpagesize();
| 47 | |
// Blocks until the given perf event file descriptor becomes readable or the
// timeout elapses. Returns the raw result of poll(): a positive value when the
// descriptor has an event, 0 on timeout and -1 on failure.
static int pollLbrPerfEvent(const int FileDescriptor) {
  constexpr int kTimeoutMs = 10000;

  struct pollfd Request = {};
  Request.fd = FileDescriptor;
  Request.events = POLLIN;
  return poll(&Request, /*nfds=*/1, kTimeoutMs);
}
| 56 | |
| 57 | // Copies the data-buffer into Buf, given the pointer to MMapped. |
| 58 | static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail, |
| 59 | size_t DataSize) { |
| 60 | // First page is reserved for perf_event_mmap_page. Data buffer starts on |
| 61 | // the next page. |
| 62 | char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize(); |
| 63 | // The LBR buffer is a cyclic buffer, we copy data to another buffer. |
| 64 | uint64_t Offset = Tail % kDataBufferSize; |
| 65 | size_t CopySize = kDataBufferSize - Offset; |
| 66 | memcpy(Buf, Start + Offset, CopySize); |
| 67 | if (CopySize >= DataSize) |
| 68 | return; |
| 69 | |
| 70 | memcpy(Buf + CopySize, Start, Offset); |
| 71 | return; |
| 72 | } |
| 73 | |
| 74 | // Parses the given data-buffer for stats and fill the CycleArray. |
| 75 | // If data has been extracted successfully, also modifies the code to jump |
| 76 | // out the benchmark loop. |
| 77 | static Error parseDataBuffer(const char *DataBuf, size_t DataSize, |
| 78 | const void *From, const void *To, |
| 79 | SmallVector<int64_t, 4> *CycleArray) { |
| 80 | const char *DataPtr = DataBuf; |
| 81 | while (DataPtr < DataBuf + DataSize) { |
| 82 | struct perf_event_header Header; |
| 83 | memcpy(&Header, DataPtr, sizeof(struct perf_event_header)); |
| 84 | if (Header.type != PERF_RECORD_SAMPLE) { |
| 85 | // Ignores non-sample records. |
| 86 | DataPtr += Header.size; |
| 87 | continue; |
| 88 | } |
| 89 | DataPtr += sizeof(Header); |
| 90 | uint64_t Count = support::endian::read64(DataPtr, endianness::native); |
| 91 | DataPtr += sizeof(Count); |
| 92 | |
| 93 | struct perf_branch_entry Entry; |
| 94 | memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); |
| 95 | |
| 96 | // Read the perf_branch_entry array. |
| 97 | for (uint64_t i = 0; i < Count; ++i) { |
| 98 | const uint64_t BlockStart = From == nullptr |
| 99 | ? std::numeric_limits<uint64_t>::min() |
| 100 | : reinterpret_cast<uint64_t>(From); |
| 101 | const uint64_t BlockEnd = To == nullptr |
| 102 | ? std::numeric_limits<uint64_t>::max() |
| 103 | : reinterpret_cast<uint64_t>(To); |
| 104 | |
| 105 | if (BlockStart <= Entry.from && BlockEnd >= Entry.to) |
| 106 | CycleArray->push_back(Entry.cycles); |
| 107 | |
| 108 | if (i == Count - 1) |
| 109 | // We've reached the last entry. |
| 110 | return Error::success(); |
| 111 | |
| 112 | // Advance to next entry |
| 113 | DataPtr += sizeof(Entry); |
| 114 | memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); |
| 115 | } |
| 116 | } |
| 117 | return make_error<StringError>("Unable to parse databuffer." , errc::io_error); |
| 118 | } |
| 119 | |
| 120 | X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) { |
| 121 | assert(SamplingPeriod > 0 && "SamplingPeriod must be positive" ); |
| 122 | EventString = "BR_INST_RETIRED.NEAR_TAKEN" ; |
| 123 | Attr = new perf_event_attr(); |
| 124 | Attr->size = sizeof(*Attr); |
| 125 | Attr->type = PERF_TYPE_RAW; |
| 126 | // FIXME This is SKL's encoding. Not sure if it'll change. |
| 127 | Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN |
| 128 | Attr->sample_type = PERF_SAMPLE_BRANCH_STACK; |
| 129 | // Don't need to specify "USER" because we've already excluded HV and Kernel. |
| 130 | Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY; |
| 131 | Attr->sample_period = SamplingPeriod; |
| 132 | Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH. |
| 133 | Attr->disabled = 1; |
| 134 | Attr->exclude_kernel = 1; |
| 135 | Attr->exclude_hv = 1; |
| 136 | Attr->read_format = PERF_FORMAT_GROUP; |
| 137 | |
| 138 | FullQualifiedEventString = EventString; |
| 139 | } |
| 140 | |
| 141 | X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent) |
| 142 | : CounterGroup(std::move(NewEvent), {}) { |
| 143 | MMappedBuffer = mmap(nullptr, kMappedBufferSize, PROT_READ | PROT_WRITE, |
| 144 | MAP_SHARED, getFileDescriptor(), 0); |
| 145 | if (MMappedBuffer == MAP_FAILED) |
| 146 | errs() << "Failed to mmap buffer." ; |
| 147 | } |
| 148 | |
| 149 | X86LbrCounter::~X86LbrCounter() { |
| 150 | if (0 != munmap(MMappedBuffer, kMappedBufferSize)) |
| 151 | errs() << "Failed to munmap buffer." ; |
| 152 | } |
| 153 | |
| 154 | void X86LbrCounter::start() { |
| 155 | ioctl(getFileDescriptor(), PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */); |
| 156 | } |
| 157 | |
| 158 | Error X86LbrCounter::checkLbrSupport() { |
| 159 | // Do a sample read and check if the results contain non-zero values. |
| 160 | |
| 161 | X86LbrCounter counter(X86LbrPerfEvent(123)); |
| 162 | counter.start(); |
| 163 | |
| 164 | // Prevent the compiler from unrolling the loop and get rid of all the |
| 165 | // branches. We need at least 16 iterations. |
| 166 | int Sum = 0; |
| 167 | int V = 1; |
| 168 | |
| 169 | volatile int *P = &V; |
| 170 | auto TimeLimit = |
| 171 | std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5); |
| 172 | |
| 173 | for (int I = 0; |
| 174 | I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit; |
| 175 | ++I) { |
| 176 | Sum += *P; |
| 177 | } |
| 178 | |
| 179 | counter.stop(); |
| 180 | (void)Sum; |
| 181 | |
| 182 | auto ResultOrError = counter.doReadCounter(nullptr, nullptr); |
| 183 | if (ResultOrError) |
| 184 | if (!ResultOrError.get().empty()) |
| 185 | // If there is at least one non-zero entry, then LBR is supported. |
| 186 | for (const int64_t &Value : ResultOrError.get()) |
| 187 | if (Value != 0) |
| 188 | return Error::success(); |
| 189 | |
| 190 | return make_error<StringError>( |
| 191 | "LBR format with cycles is not suppported on the host." , |
| 192 | errc::not_supported); |
| 193 | } |
| 194 | |
| 195 | Expected<SmallVector<int64_t, 4>> |
| 196 | X86LbrCounter::readOrError(StringRef FunctionBytes) const { |
| 197 | // Disable the event before reading |
| 198 | ioctl(getFileDescriptor(), PERF_EVENT_IOC_DISABLE, 0); |
| 199 | |
| 200 | // Find the boundary of the function so that we could filter the LBRs |
| 201 | // to keep only the relevant records. |
| 202 | if (FunctionBytes.empty()) |
| 203 | return make_error<StringError>("Empty function bytes" , |
| 204 | errc::invalid_argument); |
| 205 | const void *From = reinterpret_cast<const void *>(FunctionBytes.data()); |
| 206 | const void *To = reinterpret_cast<const void *>(FunctionBytes.data() + |
| 207 | FunctionBytes.size()); |
| 208 | return doReadCounter(From, To); |
| 209 | } |
| 210 | |
| 211 | Expected<SmallVector<int64_t, 4>> |
| 212 | X86LbrCounter::doReadCounter(const void *From, const void *To) const { |
| 213 | // The max number of time-outs/retries before we give up. |
| 214 | static constexpr int kMaxTimeouts = 160; |
| 215 | |
| 216 | // Parses the LBR buffer and fills CycleArray with the sequence of cycle |
| 217 | // counts from the buffer. |
| 218 | SmallVector<int64_t, 4> CycleArray; |
| 219 | auto DataBuf = std::make_unique<char[]>(kDataBufferSize); |
| 220 | int NumTimeouts = 0; |
| 221 | int PollResult = 0; |
| 222 | |
| 223 | while (PollResult <= 0) { |
| 224 | PollResult = pollLbrPerfEvent(getFileDescriptor()); |
| 225 | if (PollResult > 0) |
| 226 | break; |
| 227 | if (PollResult == -1) |
| 228 | return make_error<StringError>("Cannot poll LBR perf event." , |
| 229 | errc::io_error); |
| 230 | if (NumTimeouts++ >= kMaxTimeouts) |
| 231 | return make_error<StringError>( |
| 232 | "LBR polling still timed out after max number of attempts." , |
| 233 | errc::device_or_resource_busy); |
| 234 | } |
| 235 | |
| 236 | struct perf_event_mmap_page Page; |
| 237 | memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page)); |
| 238 | |
| 239 | const uint64_t DataTail = Page.data_tail; |
| 240 | const uint64_t DataHead = Page.data_head; |
| 241 | // We're supposed to use a barrier after reading data_head. |
| 242 | std::atomic_thread_fence(std::memory_order_acq_rel); |
| 243 | const size_t DataSize = DataHead - DataTail; |
| 244 | if (DataSize > kDataBufferSize) |
| 245 | return make_error<StringError>("DataSize larger than buffer size." , |
| 246 | errc::invalid_argument); |
| 247 | |
| 248 | copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize); |
| 249 | Error error = parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray); |
| 250 | if (!error) |
| 251 | return CycleArray; |
| 252 | return std::move(error); |
| 253 | } |
| 254 | |
| 255 | } // namespace exegesis |
| 256 | } // namespace llvm |
| 257 | |
| 258 | #endif // defined(__linux__) && defined(HAVE_LIBPFM) && |
| 259 | // defined(LIBPFM_HAS_FIELD_CYCLES) |
| 260 | |