| 1 | //===- BalancedPartitioning.h ---------------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements BalancedPartitioning, a recursive balanced graph |
| 10 | // partitioning algorithm. |
| 11 | // |
| 12 | // The algorithm is used to find an ordering of FunctionNodes while optimizing |
| 13 | // a specified objective. The algorithm uses recursive bisection; it starts |
| 14 | // with a collection of unordered FunctionNodes and tries to split them into |
| 15 | // two sets (buckets) of equal cardinality. Each bisection step is comprised of |
| 16 | // iterations that greedily swap the FunctionNodes between the two buckets while |
| 17 | // there is an improvement of the objective. Once the process converges, the |
| 18 | // problem is divided into two sub-problems of half the size, which are |
| 19 | // recursively applied for the two buckets. The final ordering of the |
| 20 | // FunctionNodes is obtained by concatenating the two (recursively computed) |
| 21 | // orderings. |
| 22 | // |
| 23 | // In order to speed up the computation, we limit the depth of the recursive |
| 24 | // tree by a specified constant (SplitDepth) and apply at most a constant |
| 25 | // number of greedy iterations per split (IterationsPerSplit). The worst-case |
| 26 | // time complexity of the implementation is bounded by O(M*log^2 N), where |
| 27 | // N is the number of FunctionNodes and M is the number of |
| 28 | // FunctionNode-UtilityNode edges; (assuming that any collection of D |
| 29 | // FunctionNodes contains O(D) UtilityNodes). Notice that the two different |
| 30 | // recursive sub-problems are independent and thus can be efficiently processed |
| 31 | // in parallel. |
| 32 | // |
| 33 | // Reference: |
| 34 | // * Optimizing Function Layout for Mobile Applications, |
| 35 | // https://arxiv.org/abs/2211.09285 |
| 36 | // |
| 37 | //===----------------------------------------------------------------------===// |
| 38 | |
| 39 | #ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H |
| 40 | #define LLVM_SUPPORT_BALANCED_PARTITIONING_H |
| 41 | |
| 42 | #include "raw_ostream.h" |
| 43 | #include "llvm/ADT/ArrayRef.h" |
| 44 | #include "llvm/Support/Compiler.h" |
| 45 | |
| 46 | #include <atomic> |
| 47 | #include <condition_variable> |
| 48 | #include <mutex> |
| 49 | #include <random> |
| 50 | #include <vector> |
| 51 | |
| 52 | namespace llvm { |
| 53 | |
| 54 | class ThreadPoolInterface; |
| 55 | /// A function with a set of utility nodes where it is beneficial to order two |
| 56 | /// functions close together if they have similar utility nodes |
| 57 | class BPFunctionNode { |
| 58 | friend class BalancedPartitioning; |
| 59 | |
| 60 | public: |
| 61 | using IDT = uint64_t; |
| 62 | using UtilityNodeT = uint32_t; |
| 63 | |
| 64 | /// \param UtilityNodes the set of utility nodes (must be unique'd) |
| 65 | BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes) |
| 66 | : Id(Id), UtilityNodes(UtilityNodes) {} |
| 67 | |
| 68 | /// The ID of this node |
| 69 | IDT Id; |
| 70 | |
| 71 | LLVM_ABI void dump(raw_ostream &OS) const; |
| 72 | |
| 73 | protected: |
| 74 | /// The list of utility nodes associated with this node |
| 75 | SmallVector<UtilityNodeT, 4> UtilityNodes; |
| 76 | /// The bucket assigned by balanced partitioning |
| 77 | std::optional<unsigned> Bucket; |
| 78 | /// The index of the input order of the FunctionNodes |
| 79 | uint64_t InputOrderIndex = 0; |
| 80 | |
| 81 | friend class BPFunctionNodeTest_Basic_Test; |
| 82 | friend class BalancedPartitioningTest_Basic_Test; |
| 83 | friend class BalancedPartitioningTest_Large_Test; |
| 84 | }; |
| 85 | |
| 86 | /// Algorithm parameters; default values are tuned on real-world binaries |
| 87 | struct BalancedPartitioningConfig { |
| 88 | /// The depth of the recursive bisection |
| 89 | unsigned SplitDepth = 18; |
| 90 | /// The maximum number of bp iterations per split |
| 91 | unsigned IterationsPerSplit = 40; |
| 92 | /// The probability for a vertex to skip a move from its current bucket to |
| 93 | /// another bucket; it often helps to escape from a local optima |
| 94 | float SkipProbability = 0.1f; |
| 95 | /// Recursive subtasks up to the given depth are added to the queue and |
| 96 | /// distributed among threads by ThreadPool; all subsequent calls are executed |
| 97 | /// on the same thread |
| 98 | unsigned TaskSplitDepth = 9; |
| 99 | }; |
| 100 | |
| 101 | class BalancedPartitioning { |
| 102 | public: |
| 103 | LLVM_ABI BalancedPartitioning(const BalancedPartitioningConfig &Config); |
| 104 | |
| 105 | /// Run recursive graph partitioning that optimizes a given objective. |
| 106 | LLVM_ABI void run(std::vector<BPFunctionNode> &Nodes) const; |
| 107 | |
| 108 | private: |
| 109 | struct UtilitySignature; |
| 110 | using SignaturesT = SmallVector<UtilitySignature, 4>; |
| 111 | using FunctionNodeRange = |
| 112 | iterator_range<std::vector<BPFunctionNode>::iterator>; |
| 113 | |
| 114 | /// A special ThreadPool that allows for spawning new tasks after blocking on |
| 115 | /// wait(). BalancedPartitioning recursively spawns new threads inside other |
| 116 | /// threads, so we need to track how many active threads that could spawn more |
| 117 | /// threads. |
| 118 | struct BPThreadPool { |
| 119 | ThreadPoolInterface &TheThreadPool; |
| 120 | std::mutex mtx; |
| 121 | std::condition_variable cv; |
| 122 | /// The number of threads that could spawn more threads |
| 123 | std::atomic<int> NumActiveThreads = 0; |
| 124 | /// Only true when all threads are down spawning new threads |
| 125 | bool IsFinishedSpawning = false; |
| 126 | /// Asynchronous submission of the task to the pool |
| 127 | template <typename Func> void async(Func &&F); |
| 128 | /// Blocking wait for all threads to complete. Unlike ThreadPool, it is |
| 129 | /// acceptable for other threads to add more tasks while blocking on this |
| 130 | /// call. |
| 131 | LLVM_ABI void wait(); |
| 132 | BPThreadPool(ThreadPoolInterface &TheThreadPool) |
| 133 | : TheThreadPool(TheThreadPool) {} |
| 134 | }; |
| 135 | |
| 136 | /// Run a recursive bisection of a given list of FunctionNodes |
| 137 | /// \param RecDepth the current depth of recursion |
| 138 | /// \param RootBucket the initial bucket of the dataVertices |
| 139 | /// \param Offset the assigned buckets are the range [Offset, Offset + |
| 140 | /// Nodes.size()] |
| 141 | void bisect(const FunctionNodeRange Nodes, unsigned RecDepth, |
| 142 | unsigned RootBucket, unsigned Offset, |
| 143 | std::optional<BPThreadPool> &TP) const; |
| 144 | |
| 145 | /// Run bisection iterations |
| 146 | void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket, |
| 147 | unsigned RightBucket, std::mt19937 &RNG) const; |
| 148 | |
| 149 | /// Run a bisection iteration to improve the optimization goal |
| 150 | /// \returns the total number of moved FunctionNodes |
| 151 | unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket, |
| 152 | unsigned RightBucket, SignaturesT &Signatures, |
| 153 | std::mt19937 &RNG) const; |
| 154 | |
| 155 | /// Try to move \p N from one bucket to another |
| 156 | /// \returns true iff \p N is moved |
| 157 | bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket, |
| 158 | unsigned RightBucket, SignaturesT &Signatures, |
| 159 | std::mt19937 &RNG) const; |
| 160 | |
| 161 | /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket + |
| 162 | /// 1 The method is used for an initial assignment before a bisection step |
| 163 | void split(const FunctionNodeRange Nodes, unsigned StartBucket) const; |
| 164 | |
| 165 | /// The cost of the uniform log-gap cost, assuming a utility node has \p X |
| 166 | /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one. |
| 167 | float logCost(unsigned X, unsigned Y) const; |
| 168 | |
| 169 | float log2Cached(unsigned i) const; |
| 170 | |
| 171 | const BalancedPartitioningConfig &Config; |
| 172 | |
| 173 | /// Precomputed values of log2(x). Table size is small enough to fit in cache. |
| 174 | static constexpr unsigned LOG_CACHE_SIZE = 16384; |
| 175 | float Log2Cache[LOG_CACHE_SIZE]; |
| 176 | |
| 177 | /// The signature of a particular utility node used for the bisection step, |
| 178 | /// i.e., the number of \p FunctionNodes in each of the two buckets |
| 179 | struct UtilitySignature { |
| 180 | /// The number of \p FunctionNodes in the left bucket |
| 181 | unsigned LeftCount = 0; |
| 182 | /// The number of \p FunctionNodes in the right bucket |
| 183 | unsigned RightCount = 0; |
| 184 | /// The cached gain of moving a \p FunctionNode from the left bucket to the |
| 185 | /// right bucket |
| 186 | float CachedGainLR; |
| 187 | /// The cached gain of moving a \p FunctionNode from the right bucket to the |
| 188 | /// left bucket |
| 189 | float CachedGainRL; |
| 190 | /// Whether \p CachedGainLR and \p CachedGainRL are valid |
| 191 | bool CachedGainIsValid = false; |
| 192 | }; |
| 193 | |
| 194 | protected: |
| 195 | /// Compute the move gain for uniform log-gap cost |
| 196 | LLVM_ABI static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, |
| 197 | const SignaturesT &Signatures); |
| 198 | friend class BalancedPartitioningTest_MoveGain_Test; |
| 199 | }; |
| 200 | |
| 201 | } // end namespace llvm |
| 202 | |
| 203 | #endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H |
| 204 | |