//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
///  - SplitGraph & associated classes
///    - Graph representation of the module and of the dependencies that
///      matter for splitting.
///  - RecursiveSearchSplitting
///    - Core splitting algorithm.
///  - SplitProposal
///    - Represents a suggested solution for splitting the input module. These
///      solutions can be scored to determine the best one when multiple
///      solutions are available.
///  - Driver/pass "run" function glues everything together.

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cassert>
#include <cmath>
#include <memory>
#include <utility>
#include <vector>

#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif

#define DEBUG_TYPE "amdgpu-split-module"

namespace llvm {
namespace {

static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."),
    cl::init(8));

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local "
             "linkage; may cause globals to be duplicated, which increases "
             "binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
    cl::desc(
        "disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

#ifndef NDEBUG
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
#endif
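
// Note: all of the above are standard cl::opt flags; e.g. they can be passed
// to LLVM tools directly ("-amdgpu-module-splitting-max-depth=4") or through a
// frontend's -mllvm forwarding (illustrative usage; the exact spelling depends
// on the driver).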

struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};

//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//

using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;

/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object that prints the ratio (Num/Dem) as a
/// percentage, using "%0.2f".
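/// e.g. formatRatioOf(50, 200) prints as "25.00".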
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}

/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// Kernel functions and external functions fall into this category. If we were
/// to clone them, we would end up with multiple symbol definitions and a very
/// unhappy linker.
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() ||
         AMDGPU::isEntryFunctionCC(F.getCallingConv());
}

/// If \p GV has local linkage, make it external + hidden.
static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}
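
// Illustrative IR-level effect: a "define internal void @foo()" becomes
// "define hidden void @foo()" (i.e. external linkage with hidden visibility),
// so other partitions can declare and reference it instead of duplicating its
// body.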

/// Cost analysis function. Calculates the cost of each function in \p M
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param [out] CostMap Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal = Cost.isValid()
                               ? Cost.getValue()
                               : (CostType)TargetTransformInfo::TCC_Expensive;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  if (CostMap.empty())
    return 0;

  assert(ModuleCost);
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}

/// \return true if \p F can be indirectly called
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}
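
// Note the hasAddressTaken flags above: e.g. an internal function whose
// address only appears in llvm.used or in assume-like calls is not considered
// indirectly callable here, while one stored to a function pointer is.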

//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//

/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the
/// module, but it is not guaranteed that the graph is strictly equal to the
/// CG. It currently always is but it's designed in a way that would
/// eventually allow us to create abstract nodes, or nodes for different
/// entities such as global variables or any other meaningful constraint we
/// must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    /// The nodes are related through a direct call. This is a "strong" edge as
    /// it means the Src will directly reference the Dst.
    DirectCall,
    /// The nodes are related through an indirect call.
    /// This is a "weaker" edge and is only considered when traversing the
    /// graph starting from a kernel. We need this edge for resource usage
    /// analysis.
    ///
    /// The reason why we have this edge in the first place is due to how
    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
    /// the resource usage of the kernel containing the indirect call is the
    /// max resource usage of all functions that can be indirectly called.
    IndirectCall,
  };

  /// An edge between two nodes. Edges are directional, and tagged with a
  /// "kind".
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src; ///< Source
    Node *Dst; ///< Destination
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

#ifndef NDEBUG
  bool verifyGraph() const;
#endif

  bool empty() const { return Nodes.empty(); }
  const iterator_range<nodes_iterator> nodes() const {
    return {Nodes.begin(), Nodes.end()};
  }
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
  /// IDs).
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it
  /// and updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  // Create a new edge between two nodes and add it to both nodes.
  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Final list of nodes with stable ordering.
  SmallVector<Node *> Nodes;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible objects, so as a small optimization we
  // use a BumpPtrAllocator which avoids destructor calls but also makes
  // allocation faster.
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator");
  BumpPtrAllocator EdgesPool;
};

/// Nodes in the SplitGraph contain both incoming and outgoing edges.
/// Incoming edges have this node as their Dst, and outgoing ones have this
/// node as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (auto *Fn = dyn_cast<Function>(&GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
  }

  /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
  /// nodes in the graph. This ID can be used as an index in a BitVector.
  unsigned getID() const { return ID; }

  const Function &getFunction() const { return cast<Function>(GV); }

  /// \returns the cost to import this component into a given module, not
  /// accounting for any dependencies that may need to be imported as well.
  CostType getIndividualCost() const { return IndividualCost; }

  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this is an entry point in the graph. Entry points are
  /// defined as follows: if you take all entry points in the graph, and
  /// iterate their dependencies, you are guaranteed to visit all nodes in the
  /// graph at least once.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visit all children of this node in a recursive fashion. Also visits Self.
  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
  /// DirectCall edges.
  ///
  /// \param Visitor Visitor function.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;
  /// Adds the dependencies of this node to \p BV by setting the bit
  /// corresponding to each node.
  ///
  /// Implemented using \ref visitAllDependencies, hence it follows the same
  /// rules regarding dependencies traversal.
  ///
  /// \param[out] BV The bitvector where the bits should be set.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
  // together)
  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};

void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();
  // FIXME: If this can access SplitGraph in the future, use a BitVector
  // instead.
  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}

/// Checks if \p I has MD_callees and, if it does, parses it and puts the
/// functions in \p Callees.
///
/// \returns true if there was metadata and it was parsed correctly. false if
/// there was no MD or if it contained unknown entries and parsing failed.
/// If this returns false, \p Callees will contain incomplete information
/// and must not be used.
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}
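
// For reference, !callees metadata in IR looks like this (illustrative):
//   call void %fptr(), !callees !0
//   ...
//   !0 = !{ptr @foo, ptr @bar}
// mdconst::extract_or_null recovers the Function behind each operand.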

void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // FIXME(?): Is the callgraph really worth using if we have to iterate the
  // function again whenever it fails to give us enough information?

  // We build the graph by just iterating all functions in the module and
  // working on their direct callees. At the end, all nodes should be linked
  // together as expected.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    // Look at direct callees and create the necessary edges in the graph.
    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // Keep track of this function if it contains an indirect call and/or if it
    // can be indirectly called.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(Fn)) {
        // look at all calls without a direct callee.
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        // Inline assembly can be ignored here; it is not treated as an
        // indirect call.
        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << " found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;
        // If we failed to parse any !callees MD, or some was missing,
        // the entire KnownCallees list is now unreliable.
        KnownCallees.clear();

        // Everything else is handled conservatively. If we fall into the
        // conservative case don't bother analyzing further.
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << " indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    Node &N = getNode(Cache, Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Post-process functions with indirect calls.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }

  // Now, find all entry points.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    // Functions with an Entry CC are always graph entry points too.
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    // This can be another entry point if it's not reachable by a kernel.
    // TODO: We could sort all of the possible new entries in a stable order
    // (e.g. by cost), then consume them one by one until
    // NodesReachableByKernels is all 1s. It'd allow us to avoid
    // considering some nodes as non-entries in some specific cases.
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }

#ifndef NDEBUG
  assert(verifyGraph());
#endif
}
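
// Illustrative example of the resulting graph: for a kernel K that directly
// calls a helper H and also makes an unknown indirect call, with A and B being
// the indirectly callable functions of the module, we get K->H (DirectCall)
// plus K->A and K->B (IndirectCall), and K is marked as a graph entry point.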

#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check we respect the promise of Node::isGraphEntryPoint.
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
#endif

CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  assert(&getNode(N->getID()) == N);
  return *N;
}

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}

//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//

/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies, collect the resulting
/// proposals, and choose the best one out of them.
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }

  // Find the cheapest partition (lowest cost). In case of ties, always returns
  // the highest partition number.
  unsigned findCheapestPartition() const;

  /// Calculate the CodeSize and Bottleneck scores.
  void calculateScores();

#ifndef NDEBUG
  void verifyCompleteness() const;
#endif

  /// Only available after \ref calculateScores is called.
  ///
  /// The aggregated cost of all partitions over the module's cost, which
  /// indicates how much code duplication this proposal creates. e.g. 1.2 means
  /// this proposal adds roughly 20% code size by duplicating some functions
  /// across partitions.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// A perfect score would be 1.0 (no duplication); the higher above 1.0, the
  /// more code is duplicated.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// Only available after \ref calculateScores is called.
  ///
  /// A number between [0, 1] which indicates how big of a bottleneck is
  /// expected from the largest partition.
  ///
  /// A score of 1.0 means the biggest partition is as big as the source module,
  /// so build time will be equal to or greater than the build time of the
  /// initial input.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// This is one of the metrics used to estimate this proposal's build time.
  double getBottleneckScore() const { return BottleneckScore; }

private:
  void updateScore(unsigned PID) {
    assert(SG);
    for (auto &[PCost, Nodes] : Partitions) {
      TotalCost -= PCost;
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  /// \see getCodeSizeScore
  double CodeSizeScore = 0.0;
  /// \see getBottleneckScore
  double BottleneckScore = 0.0;
  /// Aggregated cost of all partitions
  CostType TotalCost = 0;

  const SplitGraph *SG = nullptr;
  std::string Name;

  std::vector<std::pair<CostType, BitVector>> Partitions;
};

void SplitProposal::print(raw_ostream &OS) const {
  assert(SG);

  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}

unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}

void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  assert(SG);
  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}
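
// Worked example (illustrative numbers): with ModuleCost = 100 and two
// partitions costing {60, 55}, TotalCost = 115, so CodeSizeScore = 1.15
// (roughly 15% duplicated code) and BottleneckScore = 0.60 (the largest
// partition holds 60% of the module).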

#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif

//===-- RecursiveSearchStrategy -------------------------------------------===//

/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple
/// possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we haven't
/// reached maximum search depth, we recurse and explore both options and their
/// consequences. Both branches will yield a proposal, and the driver will grade
/// both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice. Most
/// of the time we will just use the least-pressured (cheapest) partition, but
/// if a cluster is particularly big and there is a good amount of overlap with
/// an existing partition, we will choose that partition instead.
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  /// Collects all graph entry points' clusters and sorts them so the most
  /// expensive clusters are viewed first. This will merge clusters together if
  /// they share a non-copyable dependency.
  void setupWorkList();

  /// Recursive function that assigns the worklist item at \p Idx into a
  /// partition of \p SP.
  ///
  /// \p Depth is the current search depth. When this value is equal to
  /// \ref MaxDepth, we can no longer recurse.
  ///
  /// This function only recurses if there is more than one possible assignment,
  /// otherwise it is iterative to avoid creating a call stack that is as big as
  /// \ref WorkList.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \return A pair: first element is the PID of the partition that has the
  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
  /// found with at least one element in common. The second element is the
  /// aggregated cost of all dependencies in common between \p Entry and that
  /// partition.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  // A Cluster is considered large when its cost, excluding entry points,
  // exceeds this value.
  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // arbitrary max value as a safeguard. Anything above 10 will already be
  // slow, this is just a max value to prevent extreme resource exhaustion or
  // unbounded run time.
  if (MaxDepth > 16)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");
  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}

void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP);
  }
}

void RecursiveSearchSplitting::setupWorkList() {
  // e.g. if A and B are two worklist items, and they both call a non-copyable
  // dependency C, this does:
  //    A=C
  //    B=C
  //    => NodeEC will create a single group (A, B, C) and we create a new
  //    WorkList entry for that group.

  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate costs and other useful information.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << " - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}

void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    //
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // Default option is to do load-balancing, AKA assign to least pressured
    // partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // Explore assigning to the kernel that contains the most dependencies in
    // common.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // We can choose to explore only one path if we only have one valid path,
    // or if we reached maximum search depth and can no longer branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID) // no similar PID found
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      // We have to choose one path. Use a heuristic to guess which one will be
      // more appropriate.
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Check if the amount of code in common makes it worth it.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge) {
          // For debug, just print "L", so we'll see "L3=P3" for instance, which
          // will mean we reached max depth and chose P3 based on this
          // heuristic.
          LLVM_DEBUG(dbgs() << 'L');
          SinglePIDToTry = MostSimilarPID;
        }
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 2: Explore candidates.

    // When we only explore one possible path, and thus branch depth doesn't
    // increase, do not recurse, iterate instead.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      // Only one path to explore, don't clone SP, don't increase depth.
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // We explore multiple paths: recurse at increased depth, then stop this
    // function.

    LLVM_DEBUG(dbgs() << '\n');

    // lb = load balancing = put in cheapest partition
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    // ms = most similar = put in partition with the most in common
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  // Step 3: If we assigned all WorkList items, submit the proposal.

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  LLVM_DEBUG(dbgs() << '\n');
  SubmitProposal(SP);
}
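
// Illustrative trace: with two worklist items whose cheapest and most-similar
// partitions differ, and MaxDepth >= 2, the search explores four leaves
// (lb/lb, lb/ms, ms/lb, ms/ms) and thus submits four proposals; each extra
// branching level at most doubles the number of proposals.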

std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  // We take the partition that is the most similar using Cost as a metric.
  // So we take the set of nodes in common, compute their aggregated cost, and
  // pick the partition with the highest cost in common.
  unsigned ChosenPID = InvalidPID;
  CostType ChosenCost = 0;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    BitVector BV = SP[PID];
    BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?!

    if (BV.none())
      continue;

    const CostType Cost = SG.calculateCost(BV);

    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
        (ChosenCost == Cost && PID > ChosenPID)) {
      ChosenPID = PID;
      ChosenCost = Cost;
    }
  }

  return {ChosenPID, ChosenCost};
}

//===----------------------------------------------------------------------===//
// DOTGraph Printing Support
//===----------------------------------------------------------------------===//

const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;

} // namespace

template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  using ChildIteratorType = SplitGraphEdgeDstIterator;

  using EdgeRef = const SplitGraph::Edge *;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getGraphName(const SplitGraph &SG) {
    return SG.getModule().getName().str();
  }

  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
    return N->getName().str();
  }

  static std::string getNodeDescription(const SplitGraph::Node *N,
                                        const SplitGraph &SG) {
    std::string Result;
    if (N->isEntryFunctionCC())
      Result += "entry-fn-cc ";
    if (N->isNonCopyable())
      Result += "non-copyable ";
    Result += "cost:" + std::to_string(N->getIndividualCost());
    return Result;
  }

  static std::string getNodeAttributes(const SplitGraph::Node *N,
                                       const SplitGraph &SG) {
    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
  }

  static std::string getEdgeAttributes(const SplitGraph::Node *N,
                                       SplitGraphEdgeDstIterator EI,
                                       const SplitGraph &SG) {
    switch ((*EI.getCurrent())->Kind) {
    case SplitGraph::EdgeKind::DirectCall:
      return "";
    case SplitGraph::EdgeKind::IndirectCall:
      return "style=\"dashed\"";
    }
    llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
  }
};

//===----------------------------------------------------------------------===//
// Driver
//===----------------------------------------------------------------------===//

namespace {

// If we didn't externalize GVs, then local GVs need to be conservatively
// imported into every module (including their initializers), and then cleaned
// up afterwards.
static bool needsConservativeImport(const GlobalValue *GV) {
  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
    return Var->hasLocalLinkage();
  return isa<GlobalAlias>(GV);
}
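
// e.g. when -amdgpu-module-splitting-no-externalize-globals is set, a local
// GlobalVariable (or any GlobalAlias) is cloned into every partition, and the
// copies that end up unused are erased again by the cleanup loop further down.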

/// Prints a summary of the partition \p N, represented by module \p M, to \p
/// OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) {
  OS << "*** Partition P" << N << " ***\n";

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration())
      OS << " - [function] " << Fn.getName() << "\n";
  }

  for (const auto &GV : M.globals()) {
    if (GV.hasInitializer())
      OS << " - [global] " << GV.getName() << "\n";
  }

  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
     << "% of the source\n";
}

static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");

  LLVM_DEBUG({
    New.verifyCompleteness();
    if (DebugProposalSearch)
      New.print(dbgs());
  });

  const double CurBScore = Best.getBottleneckScore();
  const double CurCSScore = Best.getCodeSizeScore();
  const double NewBScore = New.getBottleneckScore();
  const double NewCSScore = New.getCodeSizeScore();

  // TODO: Improve this
  //    We can probably lower the precision of the comparison at first
  //    e.g. if we have
  //      - (Current): BScore: 0.489 CSCore 1.105
  //      - (New):     BScore: 0.475 CSCore 1.305
  //    Currently we'd choose the new one because the bottleneck score is
  //    lower, but the new one duplicates more code. It may be worth it to
  //    discard the new proposal as the impact on build time is negligible.

  // Compare them
  bool IsBest = false;
  if (NewBScore < CurBScore)
    IsBest = true;
  else if (NewBScore == CurBScore)
    IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.

  if (IsBest)
    Best = std::move(New);

  LLVM_DEBUG(if (DebugProposalSearch) {
    if (IsBest)
      dbgs() << "[search] new best proposal!\n";
    else
      dbgs() << "[search] discarding - not profitable\n";
  });
}

/// Trivial helper to create an identical copy of \p M.
static std::unique_ptr<Module> cloneAll(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}

/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgOutput if requested.
static void writeDOTGraph(const SplitGraph &SG) {
  if (ModuleDotCfgOutput.empty())
    return;

  std::error_code EC;
  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
  if (EC) {
    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
           << "' - DOTGraph will not be printed\n";
    return;
  }
  WriteGraph(OS, SG, /*ShortName=*/false,
             /*Title=*/SG.getModule().getName());
}

static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  CallGraph CG(M);

  // Externalize functions whose addresses are taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits an "undefined hidden symbol"
  // linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
  //
  // TODO: Could we be smarter about this? This makes all functions whose
  // addresses are taken non-copyable. We should probably model this type of
  // constraint in the graph and use it to guide splitting, instead of
  // externalizing like this. Maybe non-copyable should really mean "keep one
  // visible copy, then internalize all other copies" for some functions?
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      // TODO: Should aliases count? Probably not but they're so rare I'm not
      // sure it's worth fixing.
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
                   dbgs() << " because its address is taken\n");
        externalize(Fn);
      }
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);

  // Build the SplitGraph, which represents the module's functions and models
  // their dependencies accurately.
  SplitGraph SG(M, FnCosts, ModuleCost);
  SG.buildGraph(CG);

  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });

  writeDOTGraph(SG);

  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");

  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(*Proposal, std::move(SP));
  };

  // TODO: It would be very easy to create new strategies by just adding a base
  // class to RecursiveSearchSplitting and abstracting it away.
  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG(Proposal->print(dbgs()););

  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(PartitionSummariesOutput, EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // The first non-empty module created will import all GlobalValues that are
  // not functions; the other modules only import those that are subject to
  // conservative import.
  bool ImportAllGVs = true;

  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");
    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");

    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(&SG.getNode(NodeID).getFunction());

    // Don't create empty modules.
    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "[split] P" << PID
                        << " is empty, not creating module\n");
      continue;
    }

    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            if (FnsInPart.contains(Fn)) {
              PartCost += SG.getCost(*Fn);
              return true;
            }
            return false;
          }

          // Everything else goes in the first non-empty module we create.
          return ImportAllGVs || needsConservativeImport(GV);
        }));

    ImportAllGVs = false;

    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
    // bugs are possible.

    // Clean-up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->global_values())) {
      if (needsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);

    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  SplitModuleTimer SMT(
      "total", "total pass runtime (incl. potentially waiting for lockfile)");

  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  bool Done = false;
#ifndef NDEBUG
  if (UseLockFile) {
    SmallString<128> LockFilePath;
    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
                      << "'\n");

    while (true) {
      llvm::LockFileManager Lock(LockFilePath.str());
      bool Owned;
      if (Error Err = Lock.tryLock().moveInto(Owned)) {
        consumeError(std::move(Err));
        LLVM_DEBUG(
            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                      "output may be mangled by other processes\n");
      } else if (!Owned) {
        switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) {
        case WaitForUnlockResult::Success:
          break;
        case WaitForUnlockResult::OwnerDied:
          continue; // try again to get the lock.
        case WaitForUnlockResult::Timeout:
          LLVM_DEBUG(
              dbgs()
              << "[amdgpu-split-module] unable to acquire lockfile, debug "
                 "output may be mangled by other processes\n");
          Lock.unsafeMaybeUnlock();
          break; // give up
        }
      }

      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
      Done = true;
      break;
    }
  }
#endif

  if (!Done)
    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);

  // We can change linkage/visibilities in the input, consider that nothing is
  // preserved just to be safe. This pass runs last anyway.
  return PreservedAnalyses::none();
}
} // namespace llvm