BalancedPartitioning.h source code [llvm_projects/llvm/include/llvm/Support/BalancedPartitioning.h]

1	//===- BalancedPartitioning.h ---------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements BalancedPartitioning, a recursive balanced graph
10	// partitioning algorithm.
11	//
12	// The algorithm is used to find an ordering of FunctionNodes while optimizing
13	// a specified objective. The algorithm uses recursive bisection; it starts
14	// with a collection of unordered FunctionNodes and tries to split them into
15	// two sets (buckets) of equal cardinality. Each bisection step is comprised of
16	// iterations that greedily swap the FunctionNodes between the two buckets while
17	// there is an improvement of the objective. Once the process converges, the
18	// problem is divided into two sub-problems of half the size, which are
19	// recursively applied for the two buckets. The final ordering of the
20	// FunctionNodes is obtained by concatenating the two (recursively computed)
21	// orderings.
22	//
23	// In order to speed up the computation, we limit the depth of the recursive
24	// tree by a specified constant (SplitDepth) and apply at most a constant
25	// number of greedy iterations per split (IterationsPerSplit). The worst-case
26	// time complexity of the implementation is bounded by O(Mlog^2 N), where*
27	// N is the number of FunctionNodes and M is the number of
28	// FunctionNode-UtilityNode edges; (assuming that any collection of D
29	// FunctionNodes contains O(D) UtilityNodes). Notice that the two different
30	// recursive sub-problems are independent and thus can be efficiently processed
31	// in parallel.
32	//
33	// Reference:
34	// Optimizing Function Layout for Mobile Applications,*
35	// https://arxiv.org/abs/2211.09285
36	//
37	//===----------------------------------------------------------------------===//
38
39	#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
40	#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
41
42	#include "raw_ostream.h"
43	#include "llvm/ADT/ArrayRef.h"
44	#include "llvm/Support/Compiler.h"
45
46	#include <atomic>
47	#include <condition_variable>
48	#include <mutex>
49	#include <random>
50	#include <vector>
51
52	namespace llvm {
53
54	class ThreadPoolInterface;
55	/// A function with a set of utility nodes where it is beneficial to order two
56	/// functions close together if they have similar utility nodes
57	class BPFunctionNode {
58	friend class BalancedPartitioning;
59
60	public:
61	using IDT = uint64_t;
62	using UtilityNodeT = uint32_t;
63
64	/// \param UtilityNodes the set of utility nodes (must be unique'd)
65	BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
66	: Id(Id), UtilityNodes (UtilityNodes) {}
67
68	/// The ID of this node
69	IDT Id;
70
71	LLVM_ABI void dump(raw_ostream &OS) const;
72
73	protected:
74	/// The list of utility nodes associated with this node
75	SmallVector<UtilityNodeT, `4`> UtilityNodes;
76	/// The bucket assigned by balanced partitioning
77	std::optional<unsigned> Bucket;
78	/// The index of the input order of the FunctionNodes
79	uint64_t InputOrderIndex = `0`;
80
81	friend class BPFunctionNodeTest_Basic_Test;
82	friend class BalancedPartitioningTest_Basic_Test;
83	friend class BalancedPartitioningTest_Large_Test;
84	};
85
86	/// Algorithm parameters; default values are tuned on real-world binaries
87	struct BalancedPartitioningConfig {
88	/// The depth of the recursive bisection
89	unsigned SplitDepth = `18`;
90	/// The maximum number of bp iterations per split
91	unsigned IterationsPerSplit = `40`;
92	/// The probability for a vertex to skip a move from its current bucket to
93	/// another bucket; it often helps to escape from a local optima
94	float SkipProbability = `0.1f`;
95	/// Recursive subtasks up to the given depth are added to the queue and
96	/// distributed among threads by ThreadPool; all subsequent calls are executed
97	/// on the same thread
98	unsigned TaskSplitDepth = `9`;
99	};
100
101	class BalancedPartitioning {
102	public:
103	LLVM_ABI BalancedPartitioning(const BalancedPartitioningConfig &Config);
104
105	/// Run recursive graph partitioning that optimizes a given objective.
106	LLVM_ABI void run(std::vector<BPFunctionNode> &Nodes) const;
107
108	private:
109	struct UtilitySignature;
110	using SignaturesT = SmallVector<UtilitySignature, `4`>;
111	using FunctionNodeRange =
112	iterator_range<std::vector<BPFunctionNode>::iterator>;
113
114	/// A special ThreadPool that allows for spawning new tasks after blocking on
115	/// wait(). BalancedPartitioning recursively spawns new threads inside other
116	/// threads, so we need to track how many active threads that could spawn more
117	/// threads.
118	struct BPThreadPool {
119	ThreadPoolInterface &TheThreadPool;
120	std::mutex mtx;
121	std::condition_variable cv;
122	/// The number of threads that could spawn more threads
123	std::atomic<int> NumActiveThreads = `0`;
124	/// Only true when all threads are down spawning new threads
125	bool IsFinishedSpawning = false;
126	/// Asynchronous submission of the task to the pool
127	template <typename Func> void async(Func &&F);
128	/// Blocking wait for all threads to complete. Unlike ThreadPool, it is
129	/// acceptable for other threads to add more tasks while blocking on this
130	/// call.
131	LLVM_ABI void wait();
132	BPThreadPool(ThreadPoolInterface &TheThreadPool)
133	: TheThreadPool(TheThreadPool) {}
134	};
135
136	/// Run a recursive bisection of a given list of FunctionNodes
137	/// \param RecDepth the current depth of recursion
138	/// \param RootBucket the initial bucket of the dataVertices
139	/// \param Offset the assigned buckets are the range [Offset, Offset +
140	/// Nodes.size()]
141	void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
142	unsigned RootBucket, unsigned Offset,
143	std::optional<BPThreadPool> &TP) const;
144
145	/// Run bisection iterations
146	void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket,
147	unsigned RightBucket, std::mt19937 &RNG) const;
148
149	/// Run a bisection iteration to improve the optimization goal
150	/// \returns the total number of moved FunctionNodes
151	unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
152	unsigned RightBucket, SignaturesT &Signatures,
153	std::mt19937 &RNG) const;
154
155	/// Try to move \p N from one bucket to another
156	/// \returns true iff \p N is moved
157	bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
158	unsigned RightBucket, SignaturesT &Signatures,
159	std::mt19937 &RNG) const;
160
161	/// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
162	/// 1 The method is used for an initial assignment before a bisection step
163	void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
164
165	/// The cost of the uniform log-gap cost, assuming a utility node has \p X
166	/// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
167	float logCost(unsigned X, unsigned Y) const;
168
169	float log2Cached(unsigned i) const;
170
171	const BalancedPartitioningConfig &Config;
172
173	/// Precomputed values of log2(x). Table size is small enough to fit in cache.
174	static constexpr unsigned LOG_CACHE_SIZE = `16384`;
175	float Log2Cache[LOG_CACHE_SIZE];
176
177	/// The signature of a particular utility node used for the bisection step,
178	/// i.e., the number of \p FunctionNodes in each of the two buckets
179	struct UtilitySignature {
180	/// The number of \p FunctionNodes in the left bucket
181	unsigned LeftCount = `0`;
182	/// The number of \p FunctionNodes in the right bucket
183	unsigned RightCount = `0`;
184	/// The cached gain of moving a \p FunctionNode from the left bucket to the
185	/// right bucket
186	float CachedGainLR;
187	/// The cached gain of moving a \p FunctionNode from the right bucket to the
188	/// left bucket
189	float CachedGainRL;
190	/// Whether \p CachedGainLR and \p CachedGainRL are valid
191	bool CachedGainIsValid = false;
192	};
193
194	protected:
195	/// Compute the move gain for uniform log-gap cost
196	LLVM_ABI static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
197	const SignaturesT &Signatures);
198	friend class BalancedPartitioningTest_MoveGain_Test;
199	};
200
201	} // end namespace llvm
202
203	#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
204

Browse the source code of llvm_projects/llvm/include/llvm/Support/BalancedPartitioning.h