//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "X86Counter.h"

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
    defined(LIBPFM_HAS_FIELD_CYCLES)

// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.

#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"

#include <perfmon/perf_event.h>
#include <perfmon/pfmlib.h>
#include <perfmon/pfmlib_perf_event.h>

#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>

#include <poll.h>
#include <sys/mman.h>
#include <unistd.h>

namespace llvm {
namespace exegesis {

// Number of entries in the LBR.
static constexpr int kLbrEntries = 16;
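// Size of the sample-data area of the perf ring buffer, in pages and in bytes.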
static constexpr size_t kBufferPages = 8;
static const size_t kDataBufferSize = kBufferPages * getpagesize();

// First page is reserved for perf_event_mmap_page. Data buffer starts on
// the next page, so we allocate one more page.
static const size_t kMappedBufferSize = (kBufferPages + 1) * getpagesize();

// Waits for the LBR perf events.
static int pollLbrPerfEvent(const int FileDescriptor) {
  struct pollfd PollFd;
  PollFd.fd = FileDescriptor;
  PollFd.events = POLLIN;
  PollFd.revents = 0;
  return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */);
}

// Copies the ring-buffer data into Buf, given a pointer to the mmapped region.
static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
                           size_t DataSize) {
  // First page is reserved for perf_event_mmap_page. Data buffer starts on
  // the next page.
  char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
  // The LBR buffer is a cyclic (ring) buffer, so we copy the data out into a
  // linear buffer.
  uint64_t Offset = Tail % kDataBufferSize;
  size_t CopySize = kDataBufferSize - Offset;
  memcpy(Buf, Start + Offset, CopySize);
  if (CopySize >= DataSize)
    return;

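  // The valid data wraps past the end of the ring buffer; copy the remaining
  // bytes from its start.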
  memcpy(Buf + CopySize, Start, Offset);
}

// Parses the given data buffer for LBR samples and fills CycleArray with the
// cycle counts of branches that fall within the given [From, To] bounds
// (when provided).
static Error parseDataBuffer(const char *DataBuf, size_t DataSize,
                             const void *From, const void *To,
                             SmallVector<int64_t, 4> *CycleArray) {
  const char *DataPtr = DataBuf;
  while (DataPtr < DataBuf + DataSize) {
    struct perf_event_header Header;
    memcpy(&Header, DataPtr, sizeof(struct perf_event_header));
    if (Header.type != PERF_RECORD_SAMPLE) {
      // Ignore non-sample records.
      DataPtr += Header.size;
      continue;
    }
    DataPtr += sizeof(Header);
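    // With sample_type == PERF_SAMPLE_BRANCH_STACK, the sample body is a u64
    // holding the number of branch entries, followed by that many
    // perf_branch_entry records.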
    uint64_t Count = support::endian::read64(DataPtr, endianness::native);
    DataPtr += sizeof(Count);

    struct perf_branch_entry Entry;
    memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));

    // Read the perf_branch_entry array.
    for (uint64_t i = 0; i < Count; ++i) {
      const uint64_t BlockStart = From == nullptr
                                      ? std::numeric_limits<uint64_t>::min()
                                      : reinterpret_cast<uint64_t>(From);
      const uint64_t BlockEnd = To == nullptr
                                    ? std::numeric_limits<uint64_t>::max()
                                    : reinterpret_cast<uint64_t>(To);

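      // Keep only branches whose source and destination both fall within the
      // benchmarked code (when bounds were provided).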
      if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
        CycleArray->push_back(Entry.cycles);

      if (i == Count - 1)
        // We've reached the last entry.
        return Error::success();

      // Advance to the next entry.
      DataPtr += sizeof(Entry);
      memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
    }
  }
  return make_error<StringError>("Unable to parse the data buffer.",
                                 errc::io_error);
}

X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
  EventString = "BR_INST_RETIRED.NEAR_TAKEN";
  Attr = new perf_event_attr();
  Attr->size = sizeof(*Attr);
  Attr->type = PERF_TYPE_RAW;
  // FIXME: This is SKL's encoding. Not sure if it'll change.
  Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
  // Don't need to specify "USER" because we've already excluded HV and Kernel.
  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
  Attr->sample_period = SamplingPeriod;
  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
  Attr->disabled = 1;
  Attr->exclude_kernel = 1;
  Attr->exclude_hv = 1;
  Attr->read_format = PERF_FORMAT_GROUP;

  FullQualifiedEventString = EventString;
}

X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
    : CounterGroup(std::move(NewEvent), {}) {
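  // Map the perf ring buffer: one metadata page (perf_event_mmap_page)
  // followed by kBufferPages of sample data.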
  MMappedBuffer = mmap(nullptr, kMappedBufferSize, PROT_READ | PROT_WRITE,
                       MAP_SHARED, getFileDescriptor(), 0);
  if (MMappedBuffer == MAP_FAILED)
    errs() << "Failed to mmap buffer.";
}

X86LbrCounter::~X86LbrCounter() {
  if (0 != munmap(MMappedBuffer, kMappedBufferSize))
    errs() << "Failed to munmap buffer.";
}

void X86LbrCounter::start() {
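  // Enable the event; with PERF_EVENT_IOC_REFRESH the kernel disables it again
  // after the given number of overflow notifications have been delivered.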
  ioctl(getFileDescriptor(), PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
}

Error X86LbrCounter::checkLbrSupport() {
  // Do a sample read and check if the results contain non-zero values.

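  // Create a short-lived counter with a small sampling period and run a
  // branchy loop under it.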
  X86LbrCounter counter(X86LbrPerfEvent(123));
  counter.start();

  // Prevent the compiler from unrolling the loop and eliminating all the
  // branches; we need at least 16 iterations (one per LBR entry).
  int Sum = 0;
  int V = 1;

  volatile int *P = &V;
  auto TimeLimit =
      std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5);

  for (int I = 0;
       I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit;
       ++I) {
    Sum += *P;
  }

  counter.stop();
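  // Silence the unused-variable warning; the volatile read above is what keeps
  // the loop's branches from being optimized out.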
  (void)Sum;

  auto ResultOrError = counter.doReadCounter(nullptr, nullptr);
  if (ResultOrError)
    if (!ResultOrError.get().empty())
      // If there is at least one non-zero entry, then LBR is supported.
      for (const int64_t &Value : ResultOrError.get())
        if (Value != 0)
          return Error::success();

  return make_error<StringError>(
      "LBR format with cycles is not supported on the host.",
      errc::not_supported);
}

Expected<SmallVector<int64_t, 4>>
X86LbrCounter::readOrError(StringRef FunctionBytes) const {
  // Disable the event before reading.
  ioctl(getFileDescriptor(), PERF_EVENT_IOC_DISABLE, 0);

  // Find the boundaries of the function so that we can filter the LBR records
  // and keep only the relevant ones.
  if (FunctionBytes.empty())
    return make_error<StringError>("Empty function bytes",
                                   errc::invalid_argument);
  const void *From = reinterpret_cast<const void *>(FunctionBytes.data());
  const void *To = reinterpret_cast<const void *>(FunctionBytes.data() +
                                                  FunctionBytes.size());
  return doReadCounter(From, To);
}

Expected<SmallVector<int64_t, 4>>
X86LbrCounter::doReadCounter(const void *From, const void *To) const {
  // The maximum number of timeouts/retries before we give up.
  static constexpr int kMaxTimeouts = 160;

  // Parses the LBR buffer and fills CycleArray with the sequence of cycle
  // counts from the buffer.
  SmallVector<int64_t, 4> CycleArray;
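  // Staging buffer that receives a linear copy of the (possibly wrapped)
  // ring-buffer contents.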
  auto DataBuf = std::make_unique<char[]>(kDataBufferSize);
  int NumTimeouts = 0;
  int PollResult = 0;

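  // Poll until the kernel reports that sample data is available, giving up
  // after kMaxTimeouts timeouts.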
  while (PollResult <= 0) {
    PollResult = pollLbrPerfEvent(getFileDescriptor());
    if (PollResult > 0)
      break;
    if (PollResult == -1)
      return make_error<StringError>("Cannot poll LBR perf event.",
                                     errc::io_error);
    if (NumTimeouts++ >= kMaxTimeouts)
      return make_error<StringError>(
          "LBR polling still timed out after the maximum number of attempts.",
          errc::device_or_resource_busy);
  }

  struct perf_event_mmap_page Page;
  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));

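  // data_head is where the kernel will write the next record; data_tail is
  // where user space last finished reading.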
  const uint64_t DataTail = Page.data_tail;
  const uint64_t DataHead = Page.data_head;
  // The kernel requires a memory barrier after reading data_head and before
  // reading the sample data.
  std::atomic_thread_fence(std::memory_order_acq_rel);
  const size_t DataSize = DataHead - DataTail;
  if (DataSize > kDataBufferSize)
    return make_error<StringError>("DataSize larger than buffer size.",
                                   errc::invalid_argument);

  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
  Error Err = parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
  if (!Err)
    return CycleArray;
  return std::move(Err);
}

} // namespace exegesis
} // namespace llvm

#endif // defined(__linux__) && defined(HAVE_LIBPFM) &&
       // defined(LIBPFM_HAS_FIELD_CYCLES)