| 1 | //===- BalancedPartitioning.cpp -------------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements BalancedPartitioning, a recursive balanced graph |
| 10 | // partitioning algorithm. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "llvm/Support/BalancedPartitioning.h" |
| 15 | #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_THREADS |
| 16 | #include "llvm/Support/Debug.h" |
| 17 | #include "llvm/Support/Format.h" |
| 18 | #include "llvm/Support/FormatVariadic.h" |
| 19 | #include "llvm/Support/ThreadPool.h" |
| 20 | |
| 21 | using namespace llvm; |
| 22 | #define DEBUG_TYPE "balanced-partitioning" |
| 23 | |
| 24 | void BPFunctionNode::dump(raw_ostream &OS) const { |
| 25 | OS << formatv(Fmt: "{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}" , Vals: Id, |
| 26 | Vals: make_range(x: UtilityNodes.begin(), y: UtilityNodes.end()), Vals: Bucket); |
| 27 | } |
| 28 | |
| 29 | template <typename Func> |
| 30 | void BalancedPartitioning::BPThreadPool::async(Func &&F) { |
| 31 | #if LLVM_ENABLE_THREADS |
| 32 | // This new thread could spawn more threads, so mark it as active |
| 33 | ++NumActiveThreads; |
| 34 | TheThreadPool.async([this, F]() { |
| 35 | // Run the task |
| 36 | F(); |
| 37 | |
| 38 | // This thread will no longer spawn new threads, so mark it as inactive |
| 39 | if (--NumActiveThreads == 0) { |
| 40 | // There are no more active threads, so mark as finished and notify |
| 41 | { |
| 42 | std::unique_lock<std::mutex> lock(mtx); |
| 43 | assert(!IsFinishedSpawning); |
| 44 | IsFinishedSpawning = true; |
| 45 | } |
| 46 | cv.notify_one(); |
| 47 | } |
| 48 | }); |
| 49 | #else |
| 50 | llvm_unreachable("threads are disabled" ); |
| 51 | #endif |
| 52 | } |
| 53 | |
| 54 | void BalancedPartitioning::BPThreadPool::wait() { |
| 55 | #if LLVM_ENABLE_THREADS |
| 56 | // TODO: We could remove the mutex and condition variable and use |
| 57 | // std::atomic::wait() instead, but that isn't available until C++20 |
| 58 | { |
| 59 | std::unique_lock<std::mutex> lock(mtx); |
| 60 | cv.wait(lock&: lock, p: [&]() { return IsFinishedSpawning; }); |
| 61 | assert(IsFinishedSpawning && NumActiveThreads == 0); |
| 62 | } |
| 63 | // Now we can call ThreadPool::wait() since all tasks have been submitted |
| 64 | TheThreadPool.wait(); |
| 65 | #else |
| 66 | llvm_unreachable("threads are disabled" ); |
| 67 | #endif |
| 68 | } |
| 69 | |
| 70 | BalancedPartitioning::BalancedPartitioning( |
| 71 | const BalancedPartitioningConfig &Config) |
| 72 | : Config(Config) { |
| 73 | // Pre-computing log2 values |
| 74 | Log2Cache[0] = 0.0; |
| 75 | for (unsigned I = 1; I < LOG_CACHE_SIZE; I++) |
| 76 | Log2Cache[I] = std::log2(x: I); |
| 77 | } |
| 78 | |
| 79 | void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const { |
| 80 | LLVM_DEBUG( |
| 81 | dbgs() << format( |
| 82 | "Partitioning %d nodes using depth %d and %d iterations per split\n" , |
| 83 | Nodes.size(), Config.SplitDepth, Config.IterationsPerSplit)); |
| 84 | std::optional<BPThreadPool> TP; |
| 85 | #if LLVM_ENABLE_THREADS |
| 86 | DefaultThreadPool TheThreadPool; |
| 87 | if (Config.TaskSplitDepth > 1) |
| 88 | TP.emplace(args&: TheThreadPool); |
| 89 | #endif |
| 90 | |
| 91 | // Record the input order |
| 92 | for (unsigned I = 0; I < Nodes.size(); I++) |
| 93 | Nodes[I].InputOrderIndex = I; |
| 94 | |
| 95 | auto NodesRange = llvm::make_range(x: Nodes.begin(), y: Nodes.end()); |
| 96 | auto BisectTask = [this, NodesRange, &TP]() { |
| 97 | bisect(Nodes: NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP); |
| 98 | }; |
| 99 | if (TP) { |
| 100 | TP->async(F: std::move(BisectTask)); |
| 101 | TP->wait(); |
| 102 | } else { |
| 103 | BisectTask(); |
| 104 | } |
| 105 | |
| 106 | llvm::stable_sort(Range&: NodesRange, C: [](const auto &L, const auto &R) { |
| 107 | return L.Bucket < R.Bucket; |
| 108 | }); |
| 109 | |
| 110 | LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n" ); |
| 111 | } |
| 112 | |
| 113 | void BalancedPartitioning::bisect(const FunctionNodeRange Nodes, |
| 114 | unsigned RecDepth, unsigned RootBucket, |
| 115 | unsigned Offset, |
| 116 | std::optional<BPThreadPool> &TP) const { |
| 117 | unsigned NumNodes = std::distance(first: Nodes.begin(), last: Nodes.end()); |
| 118 | if (NumNodes <= 1 || RecDepth >= Config.SplitDepth) { |
| 119 | // We've reach the lowest level of the recursion tree. Fall back to the |
| 120 | // original order and assign to buckets. |
| 121 | llvm::sort(C: Nodes, Comp: [](const auto &L, const auto &R) { |
| 122 | return L.InputOrderIndex < R.InputOrderIndex; |
| 123 | }); |
| 124 | for (auto &N : Nodes) |
| 125 | N.Bucket = Offset++; |
| 126 | return; |
| 127 | } |
| 128 | |
| 129 | LLVM_DEBUG(dbgs() << format("Bisect with %d nodes and root bucket %d\n" , |
| 130 | NumNodes, RootBucket)); |
| 131 | |
| 132 | std::mt19937 RNG(RootBucket); |
| 133 | |
| 134 | unsigned LeftBucket = 2 * RootBucket; |
| 135 | unsigned RightBucket = 2 * RootBucket + 1; |
| 136 | |
| 137 | // Split into two and assign to the left and right buckets |
| 138 | split(Nodes, StartBucket: LeftBucket); |
| 139 | |
| 140 | runIterations(Nodes, LeftBucket, RightBucket, RNG); |
| 141 | |
| 142 | // Split nodes wrt the resulting buckets |
| 143 | auto NodesMid = |
| 144 | llvm::partition(Range: Nodes, P: [&](auto &N) { return N.Bucket == LeftBucket; }); |
| 145 | unsigned MidOffset = Offset + std::distance(first: Nodes.begin(), last: NodesMid); |
| 146 | |
| 147 | auto LeftNodes = llvm::make_range(x: Nodes.begin(), y: NodesMid); |
| 148 | auto RightNodes = llvm::make_range(x: NodesMid, y: Nodes.end()); |
| 149 | |
| 150 | auto LeftRecTask = [this, LeftNodes, RecDepth, LeftBucket, Offset, &TP]() { |
| 151 | bisect(Nodes: LeftNodes, RecDepth: RecDepth + 1, RootBucket: LeftBucket, Offset, TP); |
| 152 | }; |
| 153 | auto RightRecTask = [this, RightNodes, RecDepth, RightBucket, MidOffset, |
| 154 | &TP]() { |
| 155 | bisect(Nodes: RightNodes, RecDepth: RecDepth + 1, RootBucket: RightBucket, Offset: MidOffset, TP); |
| 156 | }; |
| 157 | |
| 158 | if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) { |
| 159 | TP->async(F: std::move(LeftRecTask)); |
| 160 | TP->async(F: std::move(RightRecTask)); |
| 161 | } else { |
| 162 | LeftRecTask(); |
| 163 | RightRecTask(); |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes, |
| 168 | unsigned LeftBucket, |
| 169 | unsigned RightBucket, |
| 170 | std::mt19937 &RNG) const { |
| 171 | unsigned NumNodes = std::distance(first: Nodes.begin(), last: Nodes.end()); |
| 172 | DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeIndex; |
| 173 | for (auto &N : Nodes) |
| 174 | for (auto &UN : N.UtilityNodes) |
| 175 | ++UtilityNodeIndex[UN]; |
| 176 | // Remove utility nodes if they have just one edge or are connected to all |
| 177 | // functions |
| 178 | for (auto &N : Nodes) |
| 179 | llvm::erase_if(C&: N.UtilityNodes, P: [&](auto &UN) { |
| 180 | unsigned UNI = UtilityNodeIndex[UN]; |
| 181 | return UNI == 1 || UNI == NumNodes; |
| 182 | }); |
| 183 | |
| 184 | // Renumber utility nodes so they can be used to index into Signatures |
| 185 | UtilityNodeIndex.clear(); |
| 186 | for (auto &N : Nodes) |
| 187 | for (auto &UN : N.UtilityNodes) |
| 188 | UN = UtilityNodeIndex.insert(KV: {UN, UtilityNodeIndex.size()}).first->second; |
| 189 | |
| 190 | // Initialize signatures |
| 191 | SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size()); |
| 192 | for (auto &N : Nodes) { |
| 193 | for (auto &UN : N.UtilityNodes) { |
| 194 | assert(UN < Signatures.size()); |
| 195 | if (N.Bucket == LeftBucket) { |
| 196 | Signatures[UN].LeftCount++; |
| 197 | } else { |
| 198 | Signatures[UN].RightCount++; |
| 199 | } |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | for (unsigned I = 0; I < Config.IterationsPerSplit; I++) { |
| 204 | unsigned NumMovedNodes = |
| 205 | runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG); |
| 206 | if (NumMovedNodes == 0) |
| 207 | break; |
| 208 | } |
| 209 | } |
| 210 | |
| 211 | unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, |
| 212 | unsigned LeftBucket, |
| 213 | unsigned RightBucket, |
| 214 | SignaturesT &Signatures, |
| 215 | std::mt19937 &RNG) const { |
| 216 | // Init signature cost caches |
| 217 | for (auto &Signature : Signatures) { |
| 218 | if (Signature.CachedGainIsValid) |
| 219 | continue; |
| 220 | unsigned L = Signature.LeftCount; |
| 221 | unsigned R = Signature.RightCount; |
| 222 | assert((L > 0 || R > 0) && "incorrect signature" ); |
| 223 | float Cost = logCost(X: L, Y: R); |
| 224 | Signature.CachedGainLR = 0.f; |
| 225 | Signature.CachedGainRL = 0.f; |
| 226 | if (L > 0) |
| 227 | Signature.CachedGainLR = Cost - logCost(X: L - 1, Y: R + 1); |
| 228 | if (R > 0) |
| 229 | Signature.CachedGainRL = Cost - logCost(X: L + 1, Y: R - 1); |
| 230 | Signature.CachedGainIsValid = true; |
| 231 | } |
| 232 | |
| 233 | // Compute move gains |
| 234 | typedef std::pair<float, BPFunctionNode *> GainPair; |
| 235 | std::vector<GainPair> Gains; |
| 236 | for (auto &N : Nodes) { |
| 237 | bool FromLeftToRight = (N.Bucket == LeftBucket); |
| 238 | float Gain = moveGain(N, FromLeftToRight, Signatures); |
| 239 | Gains.push_back(x: std::make_pair(x&: Gain, y: &N)); |
| 240 | } |
| 241 | |
| 242 | // Collect left and right gains |
| 243 | auto LeftEnd = llvm::partition( |
| 244 | Range&: Gains, P: [&](const auto &GP) { return GP.second->Bucket == LeftBucket; }); |
| 245 | auto LeftRange = llvm::make_range(x: Gains.begin(), y: LeftEnd); |
| 246 | auto RightRange = llvm::make_range(x: LeftEnd, y: Gains.end()); |
| 247 | |
| 248 | // Sort gains in descending order |
| 249 | auto LargerGain = [](const auto &L, const auto &R) { |
| 250 | return L.first > R.first; |
| 251 | }; |
| 252 | llvm::stable_sort(Range&: LeftRange, C: LargerGain); |
| 253 | llvm::stable_sort(Range&: RightRange, C: LargerGain); |
| 254 | |
| 255 | unsigned NumMovedDataVertices = 0; |
| 256 | for (auto [LeftPair, RightPair] : llvm::zip(t&: LeftRange, u&: RightRange)) { |
| 257 | auto &[LeftGain, LeftNode] = LeftPair; |
| 258 | auto &[RightGain, RightNode] = RightPair; |
| 259 | // Stop when the gain is no longer beneficial |
| 260 | if (LeftGain + RightGain <= 0.f) |
| 261 | break; |
| 262 | // Try to exchange the nodes between buckets |
| 263 | if (moveFunctionNode(N&: *LeftNode, LeftBucket, RightBucket, Signatures, RNG)) |
| 264 | ++NumMovedDataVertices; |
| 265 | if (moveFunctionNode(N&: *RightNode, LeftBucket, RightBucket, Signatures, RNG)) |
| 266 | ++NumMovedDataVertices; |
| 267 | } |
| 268 | return NumMovedDataVertices; |
| 269 | } |
| 270 | |
| 271 | bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N, |
| 272 | unsigned LeftBucket, |
| 273 | unsigned RightBucket, |
| 274 | SignaturesT &Signatures, |
| 275 | std::mt19937 &RNG) const { |
| 276 | // Sometimes we skip the move. This helps to escape local optima |
| 277 | if (std::uniform_real_distribution<float>(0.f, 1.f)(RNG) <= |
| 278 | Config.SkipProbability) |
| 279 | return false; |
| 280 | |
| 281 | bool FromLeftToRight = (N.Bucket == LeftBucket); |
| 282 | // Update the current bucket |
| 283 | N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket); |
| 284 | |
| 285 | // Update signatures and invalidate gain cache |
| 286 | if (FromLeftToRight) { |
| 287 | for (auto &UN : N.UtilityNodes) { |
| 288 | auto &Signature = Signatures[UN]; |
| 289 | Signature.LeftCount--; |
| 290 | Signature.RightCount++; |
| 291 | Signature.CachedGainIsValid = false; |
| 292 | } |
| 293 | } else { |
| 294 | for (auto &UN : N.UtilityNodes) { |
| 295 | auto &Signature = Signatures[UN]; |
| 296 | Signature.LeftCount++; |
| 297 | Signature.RightCount--; |
| 298 | Signature.CachedGainIsValid = false; |
| 299 | } |
| 300 | } |
| 301 | return true; |
| 302 | } |
| 303 | |
| 304 | void BalancedPartitioning::split(const FunctionNodeRange Nodes, |
| 305 | unsigned StartBucket) const { |
| 306 | unsigned NumNodes = std::distance(first: Nodes.begin(), last: Nodes.end()); |
| 307 | auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2; |
| 308 | |
| 309 | llvm::sort(Start: Nodes.begin(), End: Nodes.end(), Comp: [](auto &L, auto &R) { |
| 310 | return L.InputOrderIndex < R.InputOrderIndex; |
| 311 | }); |
| 312 | |
| 313 | for (auto &N : llvm::make_range(x: Nodes.begin(), y: NodesMid)) |
| 314 | N.Bucket = StartBucket; |
| 315 | for (auto &N : llvm::make_range(x: NodesMid, y: Nodes.end())) |
| 316 | N.Bucket = StartBucket + 1; |
| 317 | } |
| 318 | |
| 319 | float BalancedPartitioning::moveGain(const BPFunctionNode &N, |
| 320 | bool FromLeftToRight, |
| 321 | const SignaturesT &Signatures) { |
| 322 | float Gain = 0.f; |
| 323 | for (auto &UN : N.UtilityNodes) |
| 324 | Gain += (FromLeftToRight ? Signatures[UN].CachedGainLR |
| 325 | : Signatures[UN].CachedGainRL); |
| 326 | return Gain; |
| 327 | } |
| 328 | |
| 329 | float BalancedPartitioning::logCost(unsigned X, unsigned Y) const { |
| 330 | return -(X * log2Cached(i: X + 1) + Y * log2Cached(i: Y + 1)); |
| 331 | } |
| 332 | |
| 333 | float BalancedPartitioning::log2Cached(unsigned i) const { |
| 334 | return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(x: i); |
| 335 | } |
| 336 | |