| 1 | //===-- Clustering.cpp ------------------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "Clustering.h" |
| 10 | #include "Error.h" |
| 11 | #include "SchedClassResolution.h" |
| 12 | #include "llvm/ADT/MapVector.h" |
| 13 | #include "llvm/ADT/SetVector.h" |
| 14 | #include "llvm/ADT/SmallSet.h" |
| 15 | #include "llvm/ADT/SmallVector.h" |
| 16 | #include <algorithm> |
| 17 | #include <deque> |
| 18 | #include <string> |
| 19 | #include <vector> |
| 20 | |
| 21 | namespace llvm { |
| 22 | namespace exegesis { |
| 23 | |
| 24 | // The clustering problem has the following characteristics: |
| 25 | // (A) - Low dimension (dimensions are typically proc resource units, |
| 26 | // typically < 10). |
| 27 | // (B) - Number of points : ~thousands (points are measurements of an MCInst) |
| 28 | // (C) - Number of clusters: ~tens. |
| 29 | // (D) - The number of clusters is not known /a priory/. |
| 30 | // (E) - The amount of noise is relatively small. |
| 31 | // The problem is rather small. In terms of algorithms, (D) disqualifies |
| 32 | // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. |
| 33 | // |
| 34 | // We've used DBSCAN here because it's simple to implement. This is a pretty |
| 35 | // straightforward and inefficient implementation of the pseudocode in [2]. |
| 36 | // |
| 37 | // [1] https://en.wikipedia.org/wiki/DBSCAN |
| 38 | // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm |
| 39 | |
| 40 | // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not |
| 41 | // including Q). |
| 42 | void BenchmarkClustering::rangeQuery( |
| 43 | const size_t Q, std::vector<size_t> &Neighbors) const { |
| 44 | Neighbors.clear(); |
| 45 | Neighbors.reserve(n: Points_.size() - 1); // The Q itself isn't a neighbor. |
| 46 | const auto &QMeasurements = Points_[Q].Measurements; |
| 47 | for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { |
| 48 | if (P == Q) |
| 49 | continue; |
| 50 | const auto &PMeasurements = Points_[P].Measurements; |
| 51 | if (PMeasurements.empty()) // Error point. |
| 52 | continue; |
| 53 | if (isNeighbour(P: PMeasurements, Q: QMeasurements, |
| 54 | EpsilonSquared_: AnalysisClusteringEpsilonSquared_)) { |
| 55 | Neighbors.push_back(x: P); |
| 56 | } |
| 57 | } |
| 58 | } |
| 59 | |
| 60 | // Given a set of points, checks that all the points are neighbours |
| 61 | // up to AnalysisClusteringEpsilon. This is O(2*N). |
| 62 | bool BenchmarkClustering::areAllNeighbours( |
| 63 | ArrayRef<size_t> Pts) const { |
| 64 | // First, get the centroid of this group of points. This is O(N). |
| 65 | SchedClassClusterCentroid G; |
| 66 | for (size_t P : Pts) { |
| 67 | assert(P < Points_.size()); |
| 68 | ArrayRef<BenchmarkMeasure> Measurements = Points_[P].Measurements; |
| 69 | if (Measurements.empty()) // Error point. |
| 70 | continue; |
| 71 | G.addPoint(Point: Measurements); |
| 72 | } |
| 73 | const std::vector<BenchmarkMeasure> Centroid = G.getAsPoint(); |
| 74 | |
| 75 | // Since we will be comparing with the centroid, we need to halve the epsilon. |
| 76 | double AnalysisClusteringEpsilonHalvedSquared = |
| 77 | AnalysisClusteringEpsilonSquared_ / 4.0; |
| 78 | |
| 79 | // And now check that every point is a neighbour of the centroid. Also O(N). |
| 80 | return all_of( |
| 81 | Range&: Pts, P: [this, &Centroid, AnalysisClusteringEpsilonHalvedSquared](size_t P) { |
| 82 | assert(P < Points_.size()); |
| 83 | const auto &PMeasurements = Points_[P].Measurements; |
| 84 | if (PMeasurements.empty()) // Error point. |
| 85 | return true; // Pretend that error point is a neighbour. |
| 86 | return isNeighbour(P: PMeasurements, Q: Centroid, |
| 87 | EpsilonSquared_: AnalysisClusteringEpsilonHalvedSquared); |
| 88 | }); |
| 89 | } |
| 90 | |
| 91 | BenchmarkClustering::BenchmarkClustering( |
| 92 | const std::vector<Benchmark> &Points, |
| 93 | const double AnalysisClusteringEpsilonSquared) |
| 94 | : Points_(Points), |
| 95 | AnalysisClusteringEpsilonSquared_(AnalysisClusteringEpsilonSquared), |
| 96 | NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} |
| 97 | |
| 98 | Error BenchmarkClustering::validateAndSetup() { |
| 99 | ClusterIdForPoint_.resize(new_size: Points_.size()); |
| 100 | // Mark erroneous measurements out. |
| 101 | // All points must have the same number of dimensions, in the same order. |
| 102 | const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr; |
| 103 | for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { |
| 104 | const auto &Point = Points_[P]; |
| 105 | if (!Point.Error.empty()) { |
| 106 | ClusterIdForPoint_[P] = ClusterId::error(); |
| 107 | ErrorCluster_.PointIndices.push_back(x: P); |
| 108 | continue; |
| 109 | } |
| 110 | const auto *CurMeasurement = &Point.Measurements; |
| 111 | if (LastMeasurement) { |
| 112 | if (LastMeasurement->size() != CurMeasurement->size()) { |
| 113 | return make_error<ClusteringError>( |
| 114 | Args: "inconsistent measurement dimensions" ); |
| 115 | } |
| 116 | for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { |
| 117 | if (LastMeasurement->at(n: I).Key != CurMeasurement->at(n: I).Key) { |
| 118 | return make_error<ClusteringError>( |
| 119 | Args: "inconsistent measurement dimensions keys" ); |
| 120 | } |
| 121 | } |
| 122 | } |
| 123 | LastMeasurement = CurMeasurement; |
| 124 | } |
| 125 | if (LastMeasurement) { |
| 126 | NumDimensions_ = LastMeasurement->size(); |
| 127 | } |
| 128 | return Error::success(); |
| 129 | } |
| 130 | |
| 131 | void BenchmarkClustering::clusterizeDbScan(const size_t MinPts) { |
| 132 | std::vector<size_t> Neighbors; // Persistent buffer to avoid allocs. |
| 133 | for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { |
| 134 | if (!ClusterIdForPoint_[P].isUndef()) |
| 135 | continue; // Previously processed in inner loop. |
| 136 | rangeQuery(Q: P, Neighbors); |
| 137 | if (Neighbors.size() + 1 < MinPts) { // Density check. |
| 138 | // The region around P is not dense enough to create a new cluster, mark |
| 139 | // as noise for now. |
| 140 | ClusterIdForPoint_[P] = ClusterId::noise(); |
| 141 | continue; |
| 142 | } |
| 143 | |
| 144 | // Create a new cluster, add P. |
| 145 | Clusters_.emplace_back(args: ClusterId::makeValid(Id: Clusters_.size())); |
| 146 | Cluster &CurrentCluster = Clusters_.back(); |
| 147 | ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ |
| 148 | CurrentCluster.PointIndices.push_back(x: P); |
| 149 | |
| 150 | // Process P's neighbors. |
| 151 | SetVector<size_t, std::deque<size_t>> ToProcess(llvm::from_range, |
| 152 | Neighbors); |
| 153 | while (!ToProcess.empty()) { |
| 154 | // Retrieve a point from the set. |
| 155 | const size_t Q = *ToProcess.begin(); |
| 156 | ToProcess.erase(I: ToProcess.begin()); |
| 157 | |
| 158 | if (ClusterIdForPoint_[Q].isNoise()) { |
| 159 | // Change noise point to border point. |
| 160 | ClusterIdForPoint_[Q] = CurrentCluster.Id; |
| 161 | CurrentCluster.PointIndices.push_back(x: Q); |
| 162 | continue; |
| 163 | } |
| 164 | if (!ClusterIdForPoint_[Q].isUndef()) { |
| 165 | continue; // Previously processed. |
| 166 | } |
| 167 | // Add Q to the current custer. |
| 168 | ClusterIdForPoint_[Q] = CurrentCluster.Id; |
| 169 | CurrentCluster.PointIndices.push_back(x: Q); |
| 170 | // And extend to the neighbors of Q if the region is dense enough. |
| 171 | rangeQuery(Q, Neighbors); |
| 172 | if (Neighbors.size() + 1 >= MinPts) { |
| 173 | ToProcess.insert_range(R&: Neighbors); |
| 174 | } |
| 175 | } |
| 176 | } |
| 177 | // assert(Neighbors.capacity() == (Points_.size() - 1)); |
| 178 | // ^ True, but it is not quaranteed to be true in all the cases. |
| 179 | |
| 180 | // Add noisy points to noise cluster. |
| 181 | for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { |
| 182 | if (ClusterIdForPoint_[P].isNoise()) { |
| 183 | NoiseCluster_.PointIndices.push_back(x: P); |
| 184 | } |
| 185 | } |
| 186 | } |
| 187 | |
| 188 | void BenchmarkClustering::clusterizeNaive( |
| 189 | const MCSubtargetInfo &SubtargetInfo, const MCInstrInfo &InstrInfo) { |
| 190 | // Given an instruction Opcode, which sched class id's are represented, |
| 191 | // and which are the benchmarks for each sched class? |
| 192 | std::vector<SmallMapVector<unsigned, SmallVector<size_t, 1>, 1>> |
| 193 | OpcodeToSchedClassesToPoints; |
| 194 | const unsigned NumOpcodes = InstrInfo.getNumOpcodes(); |
| 195 | OpcodeToSchedClassesToPoints.resize(new_size: NumOpcodes); |
| 196 | size_t NumClusters = 0; |
| 197 | for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { |
| 198 | const Benchmark &Point = Points_[P]; |
| 199 | const MCInst &MCI = Point.keyInstruction(); |
| 200 | unsigned SchedClassId; |
| 201 | std::tie(args&: SchedClassId, args: std::ignore) = |
| 202 | ResolvedSchedClass::resolveSchedClassId(SubtargetInfo, InstrInfo, MCI); |
| 203 | const unsigned Opcode = MCI.getOpcode(); |
| 204 | assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)" ); |
| 205 | auto &Points = OpcodeToSchedClassesToPoints[Opcode][SchedClassId]; |
| 206 | if (Points.empty()) // If we previously have not seen any points of |
| 207 | ++NumClusters; // this opcode's sched class, then new cluster begins. |
| 208 | Points.emplace_back(Args&: P); |
| 209 | } |
| 210 | assert(NumClusters <= NumOpcodes && |
| 211 | "can't see more opcodes than there are total opcodes" ); |
| 212 | assert(NumClusters <= Points_.size() && |
| 213 | "can't see more opcodes than there are total points" ); |
| 214 | |
| 215 | Clusters_.reserve(n: NumClusters); // We already know how many clusters there is. |
| 216 | for (const auto &SchedClassesOfOpcode : OpcodeToSchedClassesToPoints) { |
| 217 | if (SchedClassesOfOpcode.empty()) |
| 218 | continue; |
| 219 | for (ArrayRef<size_t> PointsOfSchedClass : |
| 220 | make_second_range(c: SchedClassesOfOpcode)) { |
| 221 | if (PointsOfSchedClass.empty()) |
| 222 | continue; |
| 223 | // Create a new cluster. |
| 224 | Clusters_.emplace_back(args: ClusterId::makeValid( |
| 225 | Id: Clusters_.size(), |
| 226 | /*IsUnstable=*/!areAllNeighbours(Pts: PointsOfSchedClass))); |
| 227 | Cluster &CurrentCluster = Clusters_.back(); |
| 228 | // Mark points as belonging to the new cluster. |
| 229 | for (size_t P : PointsOfSchedClass) |
| 230 | ClusterIdForPoint_[P] = CurrentCluster.Id; |
| 231 | // And add all the points of this opcode's sched class to the new cluster. |
| 232 | CurrentCluster.PointIndices.reserve(n: PointsOfSchedClass.size()); |
| 233 | CurrentCluster.PointIndices.assign(first: PointsOfSchedClass.begin(), |
| 234 | last: PointsOfSchedClass.end()); |
| 235 | assert(CurrentCluster.PointIndices.size() == PointsOfSchedClass.size()); |
| 236 | } |
| 237 | } |
| 238 | assert(Clusters_.size() == NumClusters); |
| 239 | } |
| 240 | |
| 241 | // Given an instruction Opcode, we can make benchmarks (measurements) of the |
| 242 | // instruction characteristics/performance. Then, to facilitate further analysis |
| 243 | // we group the benchmarks with *similar* characteristics into clusters. |
| 244 | // Now, this is all not entirely deterministic. Some instructions have variable |
| 245 | // characteristics, depending on their arguments. And thus, if we do several |
| 246 | // benchmarks of the same instruction Opcode, we may end up with *different* |
| 247 | // performance characteristics measurements. And when we then do clustering, |
| 248 | // these several benchmarks of the same instruction Opcode may end up being |
| 249 | // clustered into *different* clusters. This is not great for further analysis. |
| 250 | // We shall find every opcode with benchmarks not in just one cluster, and move |
| 251 | // *all* the benchmarks of said Opcode into one new unstable cluster per Opcode. |
| 252 | void BenchmarkClustering::stabilize(unsigned NumOpcodes) { |
| 253 | // Given an instruction Opcode and Config, in which clusters do benchmarks of |
| 254 | // this instruction lie? Normally, they all should be in the same cluster. |
| 255 | struct OpcodeAndConfig { |
| 256 | explicit OpcodeAndConfig(const Benchmark &IB) |
| 257 | : Opcode(IB.keyInstruction().getOpcode()), Config(&IB.Key.Config) {} |
| 258 | unsigned Opcode; |
| 259 | const std::string *Config; |
| 260 | |
| 261 | auto Tie() const -> auto { return std::tie(args: Opcode, args: *Config); } |
| 262 | |
| 263 | bool operator<(const OpcodeAndConfig &O) const { return Tie() < O.Tie(); } |
| 264 | bool operator!=(const OpcodeAndConfig &O) const { return Tie() != O.Tie(); } |
| 265 | }; |
| 266 | std::map<OpcodeAndConfig, SmallSet<ClusterId, 1>> OpcodeConfigToClusterIDs; |
| 267 | // Populate OpcodeConfigToClusterIDs and UnstableOpcodes data structures. |
| 268 | assert(ClusterIdForPoint_.size() == Points_.size() && "size mismatch" ); |
| 269 | for (auto Point : zip(t: Points_, u&: ClusterIdForPoint_)) { |
| 270 | const ClusterId &ClusterIdOfPoint = std::get<1>(t&: Point); |
| 271 | if (!ClusterIdOfPoint.isValid()) |
| 272 | continue; // Only process fully valid clusters. |
| 273 | const OpcodeAndConfig Key(std::get<0>(t&: Point)); |
| 274 | SmallSet<ClusterId, 1> &ClusterIDsOfOpcode = OpcodeConfigToClusterIDs[Key]; |
| 275 | ClusterIDsOfOpcode.insert(V: ClusterIdOfPoint); |
| 276 | } |
| 277 | |
| 278 | for (const auto &OpcodeConfigToClusterID : OpcodeConfigToClusterIDs) { |
| 279 | const SmallSet<ClusterId, 1> &ClusterIDs = OpcodeConfigToClusterID.second; |
| 280 | const OpcodeAndConfig &Key = OpcodeConfigToClusterID.first; |
| 281 | // We only care about unstable instructions. |
| 282 | if (ClusterIDs.size() < 2) |
| 283 | continue; |
| 284 | |
| 285 | // Create a new unstable cluster, one per Opcode. |
| 286 | Clusters_.emplace_back(args: ClusterId::makeValidUnstable(Id: Clusters_.size())); |
| 287 | Cluster &UnstableCluster = Clusters_.back(); |
| 288 | // We will find *at least* one point in each of these clusters. |
| 289 | UnstableCluster.PointIndices.reserve(n: ClusterIDs.size()); |
| 290 | |
| 291 | // Go through every cluster which we recorded as containing benchmarks |
| 292 | // of this UnstableOpcode. NOTE: we only recorded valid clusters. |
| 293 | for (const ClusterId &CID : ClusterIDs) { |
| 294 | assert(CID.isValid() && |
| 295 | "We only recorded valid clusters, not noise/error clusters." ); |
| 296 | Cluster &OldCluster = Clusters_[CID.getId()]; // Valid clusters storage. |
| 297 | // Within each cluster, go through each point, and either move it to the |
| 298 | // new unstable cluster, or 'keep' it. |
| 299 | // In this case, we'll reshuffle OldCluster.PointIndices vector |
| 300 | // so that all the points that are *not* for UnstableOpcode are first, |
| 301 | // and the rest of the points is for the UnstableOpcode. |
| 302 | const auto it = std::stable_partition( |
| 303 | first: OldCluster.PointIndices.begin(), last: OldCluster.PointIndices.end(), |
| 304 | pred: [this, &Key](size_t P) { |
| 305 | return OpcodeAndConfig(Points_[P]) != Key; |
| 306 | }); |
| 307 | assert(std::distance(it, OldCluster.PointIndices.end()) > 0 && |
| 308 | "Should have found at least one bad point" ); |
| 309 | // Mark to-be-moved points as belonging to the new cluster. |
| 310 | for (size_t P : make_range(x: it, y: OldCluster.PointIndices.end())) |
| 311 | ClusterIdForPoint_[P] = UnstableCluster.Id; |
| 312 | // Actually append to-be-moved points to the new cluster. |
| 313 | UnstableCluster.PointIndices.insert(position: UnstableCluster.PointIndices.end(), |
| 314 | first: it, last: OldCluster.PointIndices.end()); |
| 315 | // And finally, remove "to-be-moved" points from the old cluster. |
| 316 | OldCluster.PointIndices.erase(first: it, last: OldCluster.PointIndices.end()); |
| 317 | // Now, the old cluster may end up being empty, but let's just keep it |
| 318 | // in whatever state it ended up. Purging empty clusters isn't worth it. |
| 319 | }; |
| 320 | assert(UnstableCluster.PointIndices.size() > 1 && |
| 321 | "New unstable cluster should end up with more than one point." ); |
| 322 | assert(UnstableCluster.PointIndices.size() >= ClusterIDs.size() && |
| 323 | "New unstable cluster should end up with no less points than there " |
| 324 | "was clusters" ); |
| 325 | } |
| 326 | } |
| 327 | |
| 328 | Expected<BenchmarkClustering> BenchmarkClustering::create( |
| 329 | const std::vector<Benchmark> &Points, const ModeE Mode, |
| 330 | const size_t DbscanMinPts, const double AnalysisClusteringEpsilon, |
| 331 | const MCSubtargetInfo *SubtargetInfo, const MCInstrInfo *InstrInfo) { |
| 332 | BenchmarkClustering Clustering( |
| 333 | Points, AnalysisClusteringEpsilon * AnalysisClusteringEpsilon); |
| 334 | if (auto Error = Clustering.validateAndSetup()) { |
| 335 | return std::move(Error); |
| 336 | } |
| 337 | if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { |
| 338 | return Clustering; // Nothing to cluster. |
| 339 | } |
| 340 | |
| 341 | if (Mode == ModeE::Dbscan) { |
| 342 | Clustering.clusterizeDbScan(MinPts: DbscanMinPts); |
| 343 | |
| 344 | if (InstrInfo) |
| 345 | Clustering.stabilize(NumOpcodes: InstrInfo->getNumOpcodes()); |
| 346 | } else /*if(Mode == ModeE::Naive)*/ { |
| 347 | if (!SubtargetInfo || !InstrInfo) |
| 348 | return make_error<Failure>(Args: "'naive' clustering mode requires " |
| 349 | "SubtargetInfo and InstrInfo to be present" ); |
| 350 | Clustering.clusterizeNaive(SubtargetInfo: *SubtargetInfo, InstrInfo: *InstrInfo); |
| 351 | } |
| 352 | |
| 353 | return Clustering; |
| 354 | } |
| 355 | |
| 356 | void SchedClassClusterCentroid::addPoint(ArrayRef<BenchmarkMeasure> Point) { |
| 357 | if (Representative.empty()) |
| 358 | Representative.resize(new_size: Point.size()); |
| 359 | assert(Representative.size() == Point.size() && |
| 360 | "All points should have identical dimensions." ); |
| 361 | |
| 362 | for (auto I : zip(t&: Representative, u&: Point)) |
| 363 | std::get<0>(t&: I).push(BM: std::get<1>(t&: I)); |
| 364 | } |
| 365 | |
| 366 | std::vector<BenchmarkMeasure> SchedClassClusterCentroid::getAsPoint() const { |
| 367 | std::vector<BenchmarkMeasure> ClusterCenterPoint(Representative.size()); |
| 368 | for (auto I : zip(t&: ClusterCenterPoint, u: Representative)) |
| 369 | std::get<0>(t&: I).PerInstructionValue = std::get<1>(t&: I).avg(); |
| 370 | return ClusterCenterPoint; |
| 371 | } |
| 372 | |
| 373 | bool SchedClassClusterCentroid::validate( |
| 374 | Benchmark::ModeE Mode) const { |
| 375 | size_t NumMeasurements = Representative.size(); |
| 376 | switch (Mode) { |
| 377 | case Benchmark::Latency: |
| 378 | if (NumMeasurements != 1) { |
| 379 | errs() |
| 380 | << "invalid number of measurements in latency mode: expected 1, got " |
| 381 | << NumMeasurements << "\n" ; |
| 382 | return false; |
| 383 | } |
| 384 | break; |
| 385 | case Benchmark::Uops: |
| 386 | // Can have many measurements. |
| 387 | break; |
| 388 | case Benchmark::InverseThroughput: |
| 389 | if (NumMeasurements != 1) { |
| 390 | errs() << "invalid number of measurements in inverse throughput " |
| 391 | "mode: expected 1, got " |
| 392 | << NumMeasurements << "\n" ; |
| 393 | return false; |
| 394 | } |
| 395 | break; |
| 396 | default: |
| 397 | llvm_unreachable("unimplemented measurement matching mode" ); |
| 398 | return false; |
| 399 | } |
| 400 | |
| 401 | return true; // All good. |
| 402 | } |
| 403 | |
| 404 | } // namespace exegesis |
| 405 | } // namespace llvm |
| 406 | |