1//===- BalancedPartitioning.h ---------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements BalancedPartitioning, a recursive balanced graph
10// partitioning algorithm.
11//
12// The algorithm is used to find an ordering of FunctionNodes while optimizing
13// a specified objective. The algorithm uses recursive bisection; it starts
14// with a collection of unordered FunctionNodes and tries to split them into
15// two sets (buckets) of equal cardinality. Each bisection step is comprised of
16// iterations that greedily swap the FunctionNodes between the two buckets while
17// there is an improvement of the objective. Once the process converges, the
18// problem is divided into two sub-problems of half the size, which are
19// recursively applied for the two buckets. The final ordering of the
20// FunctionNodes is obtained by concatenating the two (recursively computed)
21// orderings.
22//
23// In order to speed up the computation, we limit the depth of the recursive
24// tree by a specified constant (SplitDepth) and apply at most a constant
25// number of greedy iterations per split (IterationsPerSplit). The worst-case
26// time complexity of the implementation is bounded by O(M*log^2 N), where
27// N is the number of FunctionNodes and M is the number of
28// FunctionNode-UtilityNode edges; (assuming that any collection of D
29// FunctionNodes contains O(D) UtilityNodes). Notice that the two different
30// recursive sub-problems are independent and thus can be efficiently processed
31// in parallel.
32//
33// Reference:
34// * Optimizing Function Layout for Mobile Applications,
35// https://arxiv.org/abs/2211.09285
36//
37//===----------------------------------------------------------------------===//
38
39#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
40#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
41
42#include "raw_ostream.h"
43#include "llvm/ADT/ArrayRef.h"
44#include "llvm/Support/Compiler.h"
45
46#include <atomic>
47#include <condition_variable>
48#include <mutex>
49#include <random>
50#include <vector>
51
52namespace llvm {
53
54class ThreadPoolInterface;
55/// A function with a set of utility nodes where it is beneficial to order two
56/// functions close together if they have similar utility nodes
57class BPFunctionNode {
58 friend class BalancedPartitioning;
59
60public:
61 using IDT = uint64_t;
62 using UtilityNodeT = uint32_t;
63
64 /// \param UtilityNodes the set of utility nodes (must be unique'd)
65 BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
66 : Id(Id), UtilityNodes(UtilityNodes) {}
67
68 /// The ID of this node
69 IDT Id;
70
71 LLVM_ABI void dump(raw_ostream &OS) const;
72
73protected:
74 /// The list of utility nodes associated with this node
75 SmallVector<UtilityNodeT, 4> UtilityNodes;
76 /// The bucket assigned by balanced partitioning
77 std::optional<unsigned> Bucket;
78 /// The index of the input order of the FunctionNodes
79 uint64_t InputOrderIndex = 0;
80
81 friend class BPFunctionNodeTest_Basic_Test;
82 friend class BalancedPartitioningTest_Basic_Test;
83 friend class BalancedPartitioningTest_Large_Test;
84};
85
86/// Algorithm parameters; default values are tuned on real-world binaries
87struct BalancedPartitioningConfig {
88 /// The depth of the recursive bisection
89 unsigned SplitDepth = 18;
90 /// The maximum number of bp iterations per split
91 unsigned IterationsPerSplit = 40;
92 /// The probability for a vertex to skip a move from its current bucket to
93 /// another bucket; it often helps to escape from a local optima
94 float SkipProbability = 0.1f;
95 /// Recursive subtasks up to the given depth are added to the queue and
96 /// distributed among threads by ThreadPool; all subsequent calls are executed
97 /// on the same thread
98 unsigned TaskSplitDepth = 9;
99};
100
101class BalancedPartitioning {
102public:
103 LLVM_ABI BalancedPartitioning(const BalancedPartitioningConfig &Config);
104
105 /// Run recursive graph partitioning that optimizes a given objective.
106 LLVM_ABI void run(std::vector<BPFunctionNode> &Nodes) const;
107
108private:
109 struct UtilitySignature;
110 using SignaturesT = SmallVector<UtilitySignature, 4>;
111 using FunctionNodeRange =
112 iterator_range<std::vector<BPFunctionNode>::iterator>;
113
114 /// A special ThreadPool that allows for spawning new tasks after blocking on
115 /// wait(). BalancedPartitioning recursively spawns new threads inside other
116 /// threads, so we need to track how many active threads that could spawn more
117 /// threads.
118 struct BPThreadPool {
119 ThreadPoolInterface &TheThreadPool;
120 std::mutex mtx;
121 std::condition_variable cv;
122 /// The number of threads that could spawn more threads
123 std::atomic<int> NumActiveThreads = 0;
124 /// Only true when all threads are down spawning new threads
125 bool IsFinishedSpawning = false;
126 /// Asynchronous submission of the task to the pool
127 template <typename Func> void async(Func &&F);
128 /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
129 /// acceptable for other threads to add more tasks while blocking on this
130 /// call.
131 LLVM_ABI void wait();
132 BPThreadPool(ThreadPoolInterface &TheThreadPool)
133 : TheThreadPool(TheThreadPool) {}
134 };
135
136 /// Run a recursive bisection of a given list of FunctionNodes
137 /// \param RecDepth the current depth of recursion
138 /// \param RootBucket the initial bucket of the dataVertices
139 /// \param Offset the assigned buckets are the range [Offset, Offset +
140 /// Nodes.size()]
141 void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
142 unsigned RootBucket, unsigned Offset,
143 std::optional<BPThreadPool> &TP) const;
144
145 /// Run bisection iterations
146 void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket,
147 unsigned RightBucket, std::mt19937 &RNG) const;
148
149 /// Run a bisection iteration to improve the optimization goal
150 /// \returns the total number of moved FunctionNodes
151 unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
152 unsigned RightBucket, SignaturesT &Signatures,
153 std::mt19937 &RNG) const;
154
155 /// Try to move \p N from one bucket to another
156 /// \returns true iff \p N is moved
157 bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
158 unsigned RightBucket, SignaturesT &Signatures,
159 std::mt19937 &RNG) const;
160
161 /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
162 /// 1 The method is used for an initial assignment before a bisection step
163 void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
164
165 /// The cost of the uniform log-gap cost, assuming a utility node has \p X
166 /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
167 float logCost(unsigned X, unsigned Y) const;
168
169 float log2Cached(unsigned i) const;
170
171 const BalancedPartitioningConfig &Config;
172
173 /// Precomputed values of log2(x). Table size is small enough to fit in cache.
174 static constexpr unsigned LOG_CACHE_SIZE = 16384;
175 float Log2Cache[LOG_CACHE_SIZE];
176
177 /// The signature of a particular utility node used for the bisection step,
178 /// i.e., the number of \p FunctionNodes in each of the two buckets
179 struct UtilitySignature {
180 /// The number of \p FunctionNodes in the left bucket
181 unsigned LeftCount = 0;
182 /// The number of \p FunctionNodes in the right bucket
183 unsigned RightCount = 0;
184 /// The cached gain of moving a \p FunctionNode from the left bucket to the
185 /// right bucket
186 float CachedGainLR;
187 /// The cached gain of moving a \p FunctionNode from the right bucket to the
188 /// left bucket
189 float CachedGainRL;
190 /// Whether \p CachedGainLR and \p CachedGainRL are valid
191 bool CachedGainIsValid = false;
192 };
193
194protected:
195 /// Compute the move gain for uniform log-gap cost
196 LLVM_ABI static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
197 const SignaturesT &Signatures);
198 friend class BalancedPartitioningTest_MoveGain_Test;
199};
200
201} // end namespace llvm
202
203#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
204