1 | //===-- Clustering.cpp ------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "Clustering.h" |
10 | #include "Error.h" |
11 | #include "SchedClassResolution.h" |
12 | #include "llvm/ADT/MapVector.h" |
13 | #include "llvm/ADT/SetVector.h" |
14 | #include "llvm/ADT/SmallSet.h" |
15 | #include "llvm/ADT/SmallVector.h" |
16 | #include <algorithm> |
17 | #include <deque> |
18 | #include <string> |
19 | #include <vector> |
20 | |
21 | namespace llvm { |
22 | namespace exegesis { |
23 | |
24 | // The clustering problem has the following characteristics: |
25 | // (A) - Low dimension (dimensions are typically proc resource units, |
26 | // typically < 10). |
27 | // (B) - Number of points : ~thousands (points are measurements of an MCInst) |
28 | // (C) - Number of clusters: ~tens. |
29 | // (D) - The number of clusters is not known /a priori/. |
30 | // (E) - The amount of noise is relatively small. |
31 | // The problem is rather small. In terms of algorithms, (D) disqualifies |
32 | // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. |
33 | // |
34 | // We've used DBSCAN here because it's simple to implement. This is a pretty |
35 | // straightforward and inefficient implementation of the pseudocode in [2]. |
36 | // |
37 | // [1] https://en.wikipedia.org/wiki/DBSCAN |
38 | // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm |
39 | |
40 | // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not |
41 | // including Q). |
42 | void BenchmarkClustering::rangeQuery( |
43 | const size_t Q, std::vector<size_t> &Neighbors) const { |
44 | Neighbors.clear(); |
45 | Neighbors.reserve(n: Points_.size() - 1); // The Q itself isn't a neighbor. |
46 | const auto &QMeasurements = Points_[Q].Measurements; |
47 | for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { |
48 | if (P == Q) |
49 | continue; |
50 | const auto &PMeasurements = Points_[P].Measurements; |
51 | if (PMeasurements.empty()) // Error point. |
52 | continue; |
53 | if (isNeighbour(P: PMeasurements, Q: QMeasurements, |
54 | EpsilonSquared_: AnalysisClusteringEpsilonSquared_)) { |
55 | Neighbors.push_back(x: P); |
56 | } |
57 | } |
58 | } |
59 | |
60 | // Given a set of points, checks that all the points are neighbours |
61 | // up to AnalysisClusteringEpsilon. This is O(2*N). |
62 | bool BenchmarkClustering::areAllNeighbours( |
63 | ArrayRef<size_t> Pts) const { |
64 | // First, get the centroid of this group of points. This is O(N). |
65 | SchedClassClusterCentroid G; |
66 | for (size_t P : Pts) { |
67 | assert(P < Points_.size()); |
68 | ArrayRef<BenchmarkMeasure> Measurements = Points_[P].Measurements; |
69 | if (Measurements.empty()) // Error point. |
70 | continue; |
71 | G.addPoint(Point: Measurements); |
72 | } |
73 | const std::vector<BenchmarkMeasure> Centroid = G.getAsPoint(); |
74 | |
75 | // Since we will be comparing with the centroid, we need to halve the epsilon. |
76 | double AnalysisClusteringEpsilonHalvedSquared = |
77 | AnalysisClusteringEpsilonSquared_ / 4.0; |
78 | |
79 | // And now check that every point is a neighbour of the centroid. Also O(N). |
80 | return all_of( |
81 | Range&: Pts, P: [this, &Centroid, AnalysisClusteringEpsilonHalvedSquared](size_t P) { |
82 | assert(P < Points_.size()); |
83 | const auto &PMeasurements = Points_[P].Measurements; |
84 | if (PMeasurements.empty()) // Error point. |
85 | return true; // Pretend that error point is a neighbour. |
86 | return isNeighbour(P: PMeasurements, Q: Centroid, |
87 | EpsilonSquared_: AnalysisClusteringEpsilonHalvedSquared); |
88 | }); |
89 | } |
90 | |
// Constructs a clustering over \p Points with the given *squared* distance
// threshold. Points_ stores a reference to the caller-owned vector, so the
// vector must outlive this object. Clustering itself happens later, via
// clusterizeDbScan() or clusterizeNaive().
BenchmarkClustering::BenchmarkClustering(
    const std::vector<Benchmark> &Points,
    const double AnalysisClusteringEpsilonSquared)
    : Points_(Points),
      AnalysisClusteringEpsilonSquared_(AnalysisClusteringEpsilonSquared),
      NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {}
97 | |
98 | Error BenchmarkClustering::validateAndSetup() { |
99 | ClusterIdForPoint_.resize(new_size: Points_.size()); |
100 | // Mark erroneous measurements out. |
101 | // All points must have the same number of dimensions, in the same order. |
102 | const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr; |
103 | for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { |
104 | const auto &Point = Points_[P]; |
105 | if (!Point.Error.empty()) { |
106 | ClusterIdForPoint_[P] = ClusterId::error(); |
107 | ErrorCluster_.PointIndices.push_back(x: P); |
108 | continue; |
109 | } |
110 | const auto *CurMeasurement = &Point.Measurements; |
111 | if (LastMeasurement) { |
112 | if (LastMeasurement->size() != CurMeasurement->size()) { |
113 | return make_error<ClusteringError>( |
114 | Args: "inconsistent measurement dimensions" ); |
115 | } |
116 | for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { |
117 | if (LastMeasurement->at(n: I).Key != CurMeasurement->at(n: I).Key) { |
118 | return make_error<ClusteringError>( |
119 | Args: "inconsistent measurement dimensions keys" ); |
120 | } |
121 | } |
122 | } |
123 | LastMeasurement = CurMeasurement; |
124 | } |
125 | if (LastMeasurement) { |
126 | NumDimensions_ = LastMeasurement->size(); |
127 | } |
128 | return Error::success(); |
129 | } |
130 | |
// DBSCAN (see [1] at the top of this file): grow a cluster from each
// unprocessed "core" point — a point whose epsilon-neighbourhood, counting
// the point itself, holds at least MinPts points. Non-core points either get
// absorbed as border points of some cluster or end up in NoiseCluster_.
void BenchmarkClustering::clusterizeDbScan(const size_t MinPts) {
  std::vector<size_t> Neighbors; // Persistent buffer to avoid allocs.
  for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
    if (!ClusterIdForPoint_[P].isUndef())
      continue; // Previously processed in inner loop.
    rangeQuery(Q: P, Neighbors);
    if (Neighbors.size() + 1 < MinPts) { // Density check.
      // The region around P is not dense enough to create a new cluster, mark
      // as noise for now. A later iteration may still relabel P as a border
      // point of a cluster grown from another core point.
      ClusterIdForPoint_[P] = ClusterId::noise();
      continue;
    }

    // Create a new cluster, add P.
    Clusters_.emplace_back(args: ClusterId::makeValid(Id: Clusters_.size()));
    Cluster &CurrentCluster = Clusters_.back();
    ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */
    CurrentCluster.PointIndices.push_back(x: P);

    // Process P's neighbors. A deque-backed SetVector gives FIFO processing
    // with automatic de-duplication of the worklist.
    SetVector<size_t, std::deque<size_t>> ToProcess;
    ToProcess.insert(Start: Neighbors.begin(), End: Neighbors.end());
    while (!ToProcess.empty()) {
      // Retrieve a point from the set.
      const size_t Q = *ToProcess.begin();
      ToProcess.erase(I: ToProcess.begin());

      if (ClusterIdForPoint_[Q].isNoise()) {
        // Change noise point to border point.
        ClusterIdForPoint_[Q] = CurrentCluster.Id;
        CurrentCluster.PointIndices.push_back(x: Q);
        continue;
      }
      if (!ClusterIdForPoint_[Q].isUndef()) {
        continue; // Previously processed.
      }
      // Add Q to the current cluster.
      ClusterIdForPoint_[Q] = CurrentCluster.Id;
      CurrentCluster.PointIndices.push_back(x: Q);
      // And extend to the neighbors of Q if the region is dense enough,
      // i.e. Q is itself a core point.
      rangeQuery(Q, Neighbors);
      if (Neighbors.size() + 1 >= MinPts) {
        ToProcess.insert(Start: Neighbors.begin(), End: Neighbors.end());
      }
    }
  }
  // assert(Neighbors.capacity() == (Points_.size() - 1));
  // ^ True, but it is not guaranteed to be true in all the cases.

  // Add noisy points to noise cluster.
  for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
    if (ClusterIdForPoint_[P].isNoise()) {
      NoiseCluster_.PointIndices.push_back(x: P);
    }
  }
}
187 | |
// "Naive" clustering: no distance computation — every benchmark of the same
// (opcode, resolved sched class) pair goes into one cluster. A cluster is
// marked unstable when its points are not all within epsilon of their
// centroid (see areAllNeighbours above).
void BenchmarkClustering::clusterizeNaive(
    const MCSubtargetInfo &SubtargetInfo, const MCInstrInfo &InstrInfo) {
  // Given an instruction Opcode, which sched class id's are represented,
  // and which are the benchmarks for each sched class?
  std::vector<SmallMapVector<unsigned, SmallVector<size_t, 1>, 1>>
      OpcodeToSchedClassesToPoints;
  const unsigned NumOpcodes = InstrInfo.getNumOpcodes();
  OpcodeToSchedClassesToPoints.resize(new_size: NumOpcodes);
  size_t NumClusters = 0;
  for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
    const Benchmark &Point = Points_[P];
    const MCInst &MCI = Point.keyInstruction();
    unsigned SchedClassId;
    // Only the resolved id matters here; whether the sched class was a
    // variant is discarded.
    std::tie(args&: SchedClassId, args: std::ignore) =
        ResolvedSchedClass::resolveSchedClassId(SubtargetInfo, InstrInfo, MCI);
    const unsigned Opcode = MCI.getOpcode();
    assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)" );
    auto &Points = OpcodeToSchedClassesToPoints[Opcode][SchedClassId];
    if (Points.empty()) // If we previously have not seen any points of
      ++NumClusters;    // this opcode's sched class, then new cluster begins.
    Points.emplace_back(Args&: P);
  }
  assert(NumClusters <= NumOpcodes &&
         "can't see more opcodes than there are total opcodes" );
  assert(NumClusters <= Points_.size() &&
         "can't see more opcodes than there are total points" );

  // Second pass: materialize one cluster per non-empty sched-class bucket.
  Clusters_.reserve(n: NumClusters); // We already know how many clusters there is.
  for (const auto &SchedClassesOfOpcode : OpcodeToSchedClassesToPoints) {
    if (SchedClassesOfOpcode.empty())
      continue;
    for (ArrayRef<size_t> PointsOfSchedClass :
         make_second_range(c: SchedClassesOfOpcode)) {
      if (PointsOfSchedClass.empty())
        continue;
      // Create a new cluster; it is unstable if the points aren't all
      // mutual neighbours.
      Clusters_.emplace_back(args: ClusterId::makeValid(
          Id: Clusters_.size(),
          /*IsUnstable=*/!areAllNeighbours(Pts: PointsOfSchedClass)));
      Cluster &CurrentCluster = Clusters_.back();
      // Mark points as belonging to the new cluster.
      for (size_t P : PointsOfSchedClass)
        ClusterIdForPoint_[P] = CurrentCluster.Id;
      // And add all the points of this opcode's sched class to the new cluster.
      CurrentCluster.PointIndices.reserve(n: PointsOfSchedClass.size());
      CurrentCluster.PointIndices.assign(first: PointsOfSchedClass.begin(),
                                         last: PointsOfSchedClass.end());
      assert(CurrentCluster.PointIndices.size() == PointsOfSchedClass.size());
    }
  }
  assert(Clusters_.size() == NumClusters);
}
240 | |
241 | // Given an instruction Opcode, we can make benchmarks (measurements) of the |
242 | // instruction characteristics/performance. Then, to facilitate further analysis |
243 | // we group the benchmarks with *similar* characteristics into clusters. |
244 | // Now, this is all not entirely deterministic. Some instructions have variable |
245 | // characteristics, depending on their arguments. And thus, if we do several |
246 | // benchmarks of the same instruction Opcode, we may end up with *different* |
247 | // performance characteristics measurements. And when we then do clustering, |
248 | // these several benchmarks of the same instruction Opcode may end up being |
249 | // clustered into *different* clusters. This is not great for further analysis. |
250 | // We shall find every opcode with benchmarks not in just one cluster, and move |
251 | // *all* the benchmarks of said Opcode into one new unstable cluster per Opcode. |
// NOTE(review): the NumOpcodes parameter appears unused in this body —
// instability is detected purely from the recorded (opcode, config) ->
// cluster-ids mapping. Confirm whether it is kept for interface stability.
void BenchmarkClustering::stabilize(unsigned NumOpcodes) {
  // Given an instruction Opcode and Config, in which clusters do benchmarks of
  // this instruction lie? Normally, they all should be in the same cluster.
  struct OpcodeAndConfig {
    explicit OpcodeAndConfig(const Benchmark &IB)
        : Opcode(IB.keyInstruction().getOpcode()), Config(&IB.Key.Config) {}
    unsigned Opcode;
    const std::string *Config;

    // Compare by (Opcode, *Config) lexicographically.
    auto Tie() const -> auto { return std::tie(args: Opcode, args: *Config); }

    bool operator<(const OpcodeAndConfig &O) const { return Tie() < O.Tie(); }
    bool operator!=(const OpcodeAndConfig &O) const { return Tie() != O.Tie(); }
  };
  std::map<OpcodeAndConfig, SmallSet<ClusterId, 1>> OpcodeConfigToClusterIDs;
  // Populate OpcodeConfigToClusterIDs and UnstableOpcodes data structures.
  assert(ClusterIdForPoint_.size() == Points_.size() && "size mismatch" );
  for (auto Point : zip(t: Points_, u&: ClusterIdForPoint_)) {
    const ClusterId &ClusterIdOfPoint = std::get<1>(t&: Point);
    if (!ClusterIdOfPoint.isValid())
      continue; // Only process fully valid clusters.
    const OpcodeAndConfig Key(std::get<0>(t&: Point));
    SmallSet<ClusterId, 1> &ClusterIDsOfOpcode = OpcodeConfigToClusterIDs[Key];
    ClusterIDsOfOpcode.insert(V: ClusterIdOfPoint);
  }

  for (const auto &OpcodeConfigToClusterID : OpcodeConfigToClusterIDs) {
    const SmallSet<ClusterId, 1> &ClusterIDs = OpcodeConfigToClusterID.second;
    const OpcodeAndConfig &Key = OpcodeConfigToClusterID.first;
    // We only care about unstable instructions: benchmarks of the same
    // (opcode, config) that landed in two or more clusters.
    if (ClusterIDs.size() < 2)
      continue;

    // Create a new unstable cluster, one per Opcode.
    Clusters_.emplace_back(args: ClusterId::makeValidUnstable(Id: Clusters_.size()));
    Cluster &UnstableCluster = Clusters_.back();
    // We will find *at least* one point in each of these clusters.
    UnstableCluster.PointIndices.reserve(n: ClusterIDs.size());

    // Go through every cluster which we recorded as containing benchmarks
    // of this UnstableOpcode. NOTE: we only recorded valid clusters.
    for (const ClusterId &CID : ClusterIDs) {
      assert(CID.isValid() &&
             "We only recorded valid clusters, not noise/error clusters." );
      Cluster &OldCluster = Clusters_[CID.getId()]; // Valid clusters storage.
      // Within each cluster, go through each point, and either move it to the
      // new unstable cluster, or 'keep' it.
      // In this case, we'll reshuffle OldCluster.PointIndices vector
      // so that all the points that are *not* for UnstableOpcode are first,
      // and the rest of the points is for the UnstableOpcode.
      // stable_partition preserves the relative order within each half.
      const auto it = std::stable_partition(
          first: OldCluster.PointIndices.begin(), last: OldCluster.PointIndices.end(),
          pred: [this, &Key](size_t P) {
            return OpcodeAndConfig(Points_[P]) != Key;
          });
      assert(std::distance(it, OldCluster.PointIndices.end()) > 0 &&
             "Should have found at least one bad point" );
      // Mark to-be-moved points as belonging to the new cluster.
      for (size_t P : make_range(x: it, y: OldCluster.PointIndices.end()))
        ClusterIdForPoint_[P] = UnstableCluster.Id;
      // Actually append to-be-moved points to the new cluster.
      UnstableCluster.PointIndices.insert(position: UnstableCluster.PointIndices.end(),
                                          first: it, last: OldCluster.PointIndices.end());
      // And finally, remove "to-be-moved" points from the old cluster.
      OldCluster.PointIndices.erase(first: it, last: OldCluster.PointIndices.end());
      // Now, the old cluster may end up being empty, but let's just keep it
      // in whatever state it ended up. Purging empty clusters isn't worth it.
    };
    assert(UnstableCluster.PointIndices.size() > 1 &&
           "New unstable cluster should end up with more than one point." );
    assert(UnstableCluster.PointIndices.size() >= ClusterIDs.size() &&
           "New unstable cluster should end up with no less points than there "
           "was clusters" );
  }
}
327 | |
328 | Expected<BenchmarkClustering> BenchmarkClustering::create( |
329 | const std::vector<Benchmark> &Points, const ModeE Mode, |
330 | const size_t DbscanMinPts, const double AnalysisClusteringEpsilon, |
331 | const MCSubtargetInfo *SubtargetInfo, const MCInstrInfo *InstrInfo) { |
332 | BenchmarkClustering Clustering( |
333 | Points, AnalysisClusteringEpsilon * AnalysisClusteringEpsilon); |
334 | if (auto Error = Clustering.validateAndSetup()) { |
335 | return std::move(Error); |
336 | } |
337 | if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { |
338 | return Clustering; // Nothing to cluster. |
339 | } |
340 | |
341 | if (Mode == ModeE::Dbscan) { |
342 | Clustering.clusterizeDbScan(MinPts: DbscanMinPts); |
343 | |
344 | if (InstrInfo) |
345 | Clustering.stabilize(NumOpcodes: InstrInfo->getNumOpcodes()); |
346 | } else /*if(Mode == ModeE::Naive)*/ { |
347 | if (!SubtargetInfo || !InstrInfo) |
348 | return make_error<Failure>(Args: "'naive' clustering mode requires " |
349 | "SubtargetInfo and InstrInfo to be present" ); |
350 | Clustering.clusterizeNaive(SubtargetInfo: *SubtargetInfo, InstrInfo: *InstrInfo); |
351 | } |
352 | |
353 | return Clustering; |
354 | } |
355 | |
356 | void SchedClassClusterCentroid::addPoint(ArrayRef<BenchmarkMeasure> Point) { |
357 | if (Representative.empty()) |
358 | Representative.resize(new_size: Point.size()); |
359 | assert(Representative.size() == Point.size() && |
360 | "All points should have identical dimensions." ); |
361 | |
362 | for (auto I : zip(t&: Representative, u&: Point)) |
363 | std::get<0>(t&: I).push(BM: std::get<1>(t&: I)); |
364 | } |
365 | |
366 | std::vector<BenchmarkMeasure> SchedClassClusterCentroid::getAsPoint() const { |
367 | std::vector<BenchmarkMeasure> ClusterCenterPoint(Representative.size()); |
368 | for (auto I : zip(t&: ClusterCenterPoint, u: Representative)) |
369 | std::get<0>(t&: I).PerInstructionValue = std::get<1>(t&: I).avg(); |
370 | return ClusterCenterPoint; |
371 | } |
372 | |
373 | bool SchedClassClusterCentroid::validate( |
374 | Benchmark::ModeE Mode) const { |
375 | size_t NumMeasurements = Representative.size(); |
376 | switch (Mode) { |
377 | case Benchmark::Latency: |
378 | if (NumMeasurements != 1) { |
379 | errs() |
380 | << "invalid number of measurements in latency mode: expected 1, got " |
381 | << NumMeasurements << "\n" ; |
382 | return false; |
383 | } |
384 | break; |
385 | case Benchmark::Uops: |
386 | // Can have many measurements. |
387 | break; |
388 | case Benchmark::InverseThroughput: |
389 | if (NumMeasurements != 1) { |
390 | errs() << "invalid number of measurements in inverse throughput " |
391 | "mode: expected 1, got " |
392 | << NumMeasurements << "\n" ; |
393 | return false; |
394 | } |
395 | break; |
396 | default: |
397 | llvm_unreachable("unimplemented measurement matching mode" ); |
398 | return false; |
399 | } |
400 | |
401 | return true; // All good. |
402 | } |
403 | |
404 | } // namespace exegesis |
405 | } // namespace llvm |
406 | |