1//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "X86Counter.h"
10
11#if defined(__linux__) && defined(HAVE_LIBPFM) && \
12 defined(LIBPFM_HAS_FIELD_CYCLES)
13
14// FIXME: Use appropriate wrappers for poll.h and mman.h
15// to support Windows and remove this linux-only guard.
16
17#include "llvm/Support/Endian.h"
18#include "llvm/Support/Errc.h"
19
20#include <perfmon/perf_event.h>
21#include <perfmon/pfmlib.h>
22#include <perfmon/pfmlib_perf_event.h>
23
24#include <atomic>
25#include <chrono>
26#include <cstddef>
27#include <cstdint>
28#include <limits>
29#include <memory>
30
31#include <poll.h>
32#include <sys/mman.h>
33#include <unistd.h>
34
35namespace llvm {
36namespace exegesis {
37
// Number of entries in the LBR.
static constexpr int kLbrEntries = 16;
// Number of pages backing the ring buffer's data area.
static constexpr size_t kBufferPages = 8;
// Size in bytes of the data area. Computed at static-initialization time
// because the page size is only known at runtime.
static const size_t kDataBufferSize = kBufferPages * getpagesize();

// First page is reserved for perf_event_mmap_page. Data buffer starts on
// the next page, so we allocate one more page.
static const size_t kMappedBufferSize = (kBufferPages + 1) * getpagesize();
46
// Waits (up to 10 seconds) for the LBR perf event file descriptor to become
// readable. Returns poll()'s result: >0 if ready, 0 on timeout, -1 on error.
static int pollLbrPerfEvent(const int FileDescriptor) {
  struct pollfd Fds[] = {{FileDescriptor, POLLIN, /*revents=*/0}};
  return poll(Fds, 1 /* num of fds */, 10000 /* timeout in ms */);
}
55
56// Copies the data-buffer into Buf, given the pointer to MMapped.
57static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
58 size_t DataSize) {
59 // First page is reserved for perf_event_mmap_page. Data buffer starts on
60 // the next page.
61 char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
62 // The LBR buffer is a cyclic buffer, we copy data to another buffer.
63 uint64_t Offset = Tail % kDataBufferSize;
64 size_t CopySize = kDataBufferSize - Offset;
65 memcpy(Buf, Start + Offset, CopySize);
66 if (CopySize >= DataSize)
67 return;
68
69 memcpy(Buf + CopySize, Start, Offset);
70 return;
71}
72
73// Parses the given data-buffer for stats and fill the CycleArray.
74// If data has been extracted successfully, also modifies the code to jump
75// out the benchmark loop.
76static Error parseDataBuffer(const char *DataBuf, size_t DataSize,
77 const void *From, const void *To,
78 SmallVector<int64_t, 4> *CycleArray) {
79 const char *DataPtr = DataBuf;
80 while (DataPtr < DataBuf + DataSize) {
81 struct perf_event_header Header;
82 memcpy(&Header, DataPtr, sizeof(struct perf_event_header));
83 if (Header.type != PERF_RECORD_SAMPLE) {
84 // Ignores non-sample records.
85 DataPtr += Header.size;
86 continue;
87 }
88 DataPtr += sizeof(Header);
89 uint64_t Count = support::endian::read64(DataPtr, endianness::native);
90 DataPtr += sizeof(Count);
91
92 struct perf_branch_entry Entry;
93 memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
94
95 // Read the perf_branch_entry array.
96 for (uint64_t i = 0; i < Count; ++i) {
97 const uint64_t BlockStart = From == nullptr
98 ? std::numeric_limits<uint64_t>::min()
99 : reinterpret_cast<uint64_t>(From);
100 const uint64_t BlockEnd = To == nullptr
101 ? std::numeric_limits<uint64_t>::max()
102 : reinterpret_cast<uint64_t>(To);
103
104 if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
105 CycleArray->push_back(Entry.cycles);
106
107 if (i == Count - 1)
108 // We've reached the last entry.
109 return Error::success();
110
111 // Advance to next entry
112 DataPtr += sizeof(Entry);
113 memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
114 }
115 }
116 return make_error<StringError>("Unable to parse databuffer.", errc::io_error);
117}
118
// Configures a raw perf event that samples every SamplingPeriod taken
// branches and attaches the branch stack (LBR) to each sample.
X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
  EventString = "BR_INST_RETIRED.NEAR_TAKEN";
  // NOTE(review): Attr is allocated with `new` here; presumably the
  // PerfEvent base class owns and releases it -- confirm.
  Attr = new perf_event_attr();
  Attr->size = sizeof(*Attr);
  Attr->type = PERF_TYPE_RAW;
  // FIXME This is SKL's encoding. Not sure if it'll change.
  Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
  // Ask the kernel to record the branch stack with each sample.
  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
  // Don't need to specify "USER" because we've already excluded HV and Kernel.
  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
  Attr->sample_period = SamplingPeriod;
  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
  // Created disabled; X86LbrCounter::start() arms it via PERF_EVENT_IOC_REFRESH.
  Attr->disabled = 1;
  Attr->exclude_kernel = 1;
  Attr->exclude_hv = 1;
  Attr->read_format = PERF_FORMAT_GROUP;

  FullQualifiedEventString = EventString;
}
139
// Takes ownership of the perf event and maps its kernel ring buffer
// (one perf_event_mmap_page header page plus kBufferPages data pages).
X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
    : CounterGroup(std::move(NewEvent), {}) {
  MMappedBuffer = mmap(nullptr, kMappedBufferSize, PROT_READ | PROT_WRITE,
                       MAP_SHARED, getFileDescriptor(), 0);
  if (MMappedBuffer == MAP_FAILED)
    // NOTE(review): on failure MMappedBuffer is left equal to MAP_FAILED and
    // construction continues; later reads of the buffer would misbehave --
    // confirm callers treat this message as fatal.
    errs() << "Failed to mmap buffer.";
}
147
148X86LbrCounter::~X86LbrCounter() {
149 if (0 != munmap(MMappedBuffer, kMappedBufferSize))
150 errs() << "Failed to munmap buffer.";
151}
152
153void X86LbrCounter::start() {
154 ioctl(getFileDescriptor(), PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
155}
156
157Error X86LbrCounter::checkLbrSupport() {
158 // Do a sample read and check if the results contain non-zero values.
159
160 X86LbrCounter counter(X86LbrPerfEvent(123));
161 counter.start();
162
163 // Prevent the compiler from unrolling the loop and get rid of all the
164 // branches. We need at least 16 iterations.
165 int Sum = 0;
166 int V = 1;
167
168 volatile int *P = &V;
169 auto TimeLimit =
170 std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5);
171
172 for (int I = 0;
173 I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit;
174 ++I) {
175 Sum += *P;
176 }
177
178 counter.stop();
179 (void)Sum;
180
181 auto ResultOrError = counter.doReadCounter(nullptr, nullptr);
182 if (ResultOrError)
183 if (!ResultOrError.get().empty())
184 // If there is at least one non-zero entry, then LBR is supported.
185 for (const int64_t &Value : ResultOrError.get())
186 if (Value != 0)
187 return Error::success();
188
189 return make_error<StringError>(
190 "LBR format with cycles is not suppported on the host.",
191 errc::not_supported);
192}
193
194Expected<SmallVector<int64_t, 4>>
195X86LbrCounter::readOrError(StringRef FunctionBytes) const {
196 // Disable the event before reading
197 ioctl(getFileDescriptor(), PERF_EVENT_IOC_DISABLE, 0);
198
199 // Find the boundary of the function so that we could filter the LBRs
200 // to keep only the relevant records.
201 if (FunctionBytes.empty())
202 return make_error<StringError>("Empty function bytes",
203 errc::invalid_argument);
204 const void *From = reinterpret_cast<const void *>(FunctionBytes.data());
205 const void *To = reinterpret_cast<const void *>(FunctionBytes.data() +
206 FunctionBytes.size());
207 return doReadCounter(From, To);
208}
209
// Drains the perf ring buffer and returns the cycle counts of the LBR
// entries whose addresses fall within [From, To] (null bounds mean
// unbounded). Blocks polling the event fd until data is available, a hard
// error occurs, or the retry budget is exhausted.
Expected<SmallVector<int64_t, 4>>
X86LbrCounter::doReadCounter(const void *From, const void *To) const {
  // The max number of time-outs/retries before we give up.
  static constexpr int kMaxTimeouts = 160;

  // Parses the LBR buffer and fills CycleArray with the sequence of cycle
  // counts from the buffer.
  SmallVector<int64_t, 4> CycleArray;
  // Scratch copy of the ring's data area, so parsing sees a linear buffer.
  auto DataBuf = std::make_unique<char[]>(kDataBufferSize);
  int NumTimeouts = 0;
  int PollResult = 0;

  // Each poll may itself wait up to 10 seconds (see pollLbrPerfEvent).
  while (PollResult <= 0) {
    PollResult = pollLbrPerfEvent(getFileDescriptor());
    if (PollResult > 0)
      break;
    if (PollResult == -1)
      return make_error<StringError>("Cannot poll LBR perf event.",
                                     errc::io_error);
    if (NumTimeouts++ >= kMaxTimeouts)
      return make_error<StringError>(
          "LBR polling still timed out after max number of attempts.",
          errc::device_or_resource_busy);
  }

  // Snapshot the kernel's ring-buffer header page (holds the head/tail
  // positions of the data area).
  struct perf_event_mmap_page Page;
  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));

  const uint64_t DataTail = Page.data_tail;
  const uint64_t DataHead = Page.data_head;
  // We're supposed to use a barrier after reading data_head.
  std::atomic_thread_fence(std::memory_order_acq_rel);
  // Head and tail are free-running byte counters; their difference is the
  // amount of unread data and must never exceed the ring's capacity.
  const size_t DataSize = DataHead - DataTail;
  if (DataSize > kDataBufferSize)
    return make_error<StringError>("DataSize larger than buffer size.",
                                   errc::invalid_argument);

  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
  Error error = parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
  if (!error)
    return CycleArray;
  return std::move(error);
}
253
254} // namespace exegesis
255} // namespace llvm
256
257#endif // defined(__linux__) && defined(HAVE_LIBPFM) &&
258 // defined(LIBPFM_HAS_FIELD_CYCLES)
259