1//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file Implements a module splitting algorithm designed to support the
10/// FullLTO --lto-partitions option for parallel codegen. This is completely
11/// different from the common SplitModule pass, as this system is designed with
12/// AMDGPU in mind.
13///
14/// The basic idea of this module splitting implementation is the same as
15/// SplitModule: load-balance the module's functions across a set of N
16/// partitions to allow parallel codegen. However, it does it very
17/// differently than the target-agnostic variant:
18/// - The module has "split roots", which are kernels in the vast
/// majority of cases.
20/// - Each root has a set of dependencies, and when a root and its
21/// dependencies is considered "big", we try to put it in a partition where
22/// most dependencies are already imported, to avoid duplicating large
23/// amounts of code.
24/// - There's special care for indirect calls in order to ensure
25/// AMDGPUResourceUsageAnalysis can work correctly.
26///
27/// This file also includes a more elaborate logging system to enable
28/// users to easily generate logs that (if desired) do not include any value
29/// names, in order to not leak information about the source file.
30/// Such logs are very helpful to understand and fix potential issues with
31/// module splitting.
32
33#include "AMDGPUSplitModule.h"
34#include "AMDGPUTargetMachine.h"
35#include "Utils/AMDGPUBaseInfo.h"
36#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/SmallVector.h"
38#include "llvm/ADT/StringExtras.h"
39#include "llvm/ADT/StringRef.h"
40#include "llvm/Analysis/CallGraph.h"
41#include "llvm/Analysis/TargetTransformInfo.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/Instruction.h"
44#include "llvm/IR/Module.h"
45#include "llvm/IR/User.h"
46#include "llvm/IR/Value.h"
47#include "llvm/Support/Casting.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/FileSystem.h"
50#include "llvm/Support/Path.h"
51#include "llvm/Support/Process.h"
52#include "llvm/Support/SHA256.h"
53#include "llvm/Support/Threading.h"
54#include "llvm/Support/raw_ostream.h"
55#include "llvm/Transforms/Utils/Cloning.h"
56#include <algorithm>
57#include <cassert>
58#include <iterator>
59#include <memory>
60#include <utility>
61#include <vector>
62
63using namespace llvm;
64
65#define DEBUG_TYPE "amdgpu-split-module"
66
67namespace {
68
69static cl::opt<float> LargeFnFactor(
70 "amdgpu-module-splitting-large-function-threshold", cl::init(Val: 2.0f),
71 cl::Hidden,
72 cl::desc(
73 "consider a function as large and needing special treatment when the "
74 "cost of importing it into a partition"
75 "exceeds the average cost of a partition by this factor; e;g. 2.0 "
76 "means if the function and its dependencies is 2 times bigger than "
77 "an average partition; 0 disables large functions handling entirely"));
78
/// Minimum fraction (0.0-1.0) of shared dependencies required before a large
/// function is merged into an already-populated partition.
static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(Val: 0.8f),
    cl::Hidden,
    cl::desc(
        "defines how much overlap between two large function's dependencies "
        "is needed to put them in the same partition"));

/// Opt-out for externalizing local globals (see splitAMDGPUModule); when set,
/// local GVs are conservatively imported into every partition instead.
static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variable with local linkage; "
             "may cause globals to be duplicated which increases binary size"));

/// Log directory; the AMD_SPLIT_MODULE_LOG_DIR environment variable is the
/// fallback when this option is not given (the option takes priority).
static cl::opt<std::string>
    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
              cl::desc("output directory for AMDGPU module splitting logs"));

/// Hide value names in logs; the AMD_SPLIT_MODULE_LOG_PRIVATE environment
/// variable is the fallback when this option is not given (see getName).
static cl::opt<bool>
    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
               cl::desc("hash value names before printing them in the AMDGPU "
                        "module splitting logs"));
99
// Integer cost unit used throughout this pass (unwrapped InstructionCost).
using CostType = InstructionCost::CostType;
// Index of a partition; partitions are numbered [0, NumParts).
using PartitionID = unsigned;
// Abstract TTI getter so the implementation is pass-manager agnostic.
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;

/// \returns true if \p F uses an AMDGPU entry-point (kernel) calling
/// convention. Entry points are the splitting roots.
static bool isEntryPoint(const Function *F) {
  return AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
}
107
108static std::string getName(const Value &V) {
109 static bool HideNames;
110
111 static llvm::once_flag HideNameInitFlag;
112 llvm::call_once(flag&: HideNameInitFlag, F: [&]() {
113 if (LogPrivate.getNumOccurrences())
114 HideNames = LogPrivate;
115 else {
116 const auto EV = sys::Process::GetEnv(name: "AMD_SPLIT_MODULE_LOG_PRIVATE");
117 HideNames = (EV.value_or(u: "0") != "0");
118 }
119 });
120
121 if (!HideNames)
122 return V.getName().str();
123 return toHex(Input: SHA256::hash(Data: arrayRefFromStringRef(Input: V.getName())),
124 /*LowerCase=*/true);
125}
126
127/// Main logging helper.
128///
129/// Logging can be configured by the following environment variable.
130/// AMD_SPLIT_MODULE_LOG_DIR=<filepath>
131/// If set, uses <filepath> as the directory to write logfiles to
132/// each time module splitting is used.
133/// AMD_SPLIT_MODULE_LOG_PRIVATE
134/// If set to anything other than zero, all names are hidden.
135///
/// Both environment variables have corresponding CL options which
/// take priority over them.
138///
139/// Any output printed to the log files is also printed to dbgs() when -debug is
140/// used and LLVM_DEBUG is defined.
141///
142/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
143/// cannot be removed from the code (by building without debug). This probably
144/// has a small performance cost because if some computation/formatting is
/// needed for logging purposes, it may be done every time only to be ignored
146/// by the logger.
147///
148/// As this pass only runs once and is not doing anything computationally
149/// expensive, this is likely a reasonable trade-off.
150///
151/// If some computation should really be avoided when unused, users of the class
152/// can check whether any logging will occur by using the bool operator.
153///
154/// \code
155/// if (SML) {
156/// // Executes only if logging to a file or if -debug is available and
157/// used.
158/// }
159/// \endcode
class SplitModuleLogger {
public:
  /// Opens a uniquely-named logfile if a log directory was configured, either
  /// through -amdgpu-module-splitting-log-dir or the AMD_SPLIT_MODULE_LOG_DIR
  /// environment variable (the option takes priority). Without a directory,
  /// the logger only mirrors output to dbgs() under -debug.
  SplitModuleLogger(const Module &M) {
    std::string LogDir = LogDirOpt;
    if (LogDir.empty())
      LogDir = sys::Process::GetEnv(name: "AMD_SPLIT_MODULE_LOG_DIR").value_or(u: "");

    // No log dir specified means we don't need to log to a file.
    // We may still log to dbgs(), though.
    if (LogDir.empty())
      return;

    // If a log directory is specified, create a new file with a unique name in
    // that directory.
    int Fd;
    SmallString<0> PathTemplate;
    SmallString<0> RealPath;
    sys::path::append(path&: PathTemplate, a: LogDir, b: "Module-%%-%%-%%-%%-%%-%%-%%.txt");
    if (auto Err =
            sys::fs::createUniqueFile(Model: PathTemplate.str(), ResultFD&: Fd, ResultPath&: RealPath)) {
      report_fatal_error(reason: "Failed to create log file at '" + Twine(LogDir) +
                             "': " + Err.message(),
                         /*CrashDiag=*/gen_crash_diag: false);
    }

    // The stream takes ownership of the descriptor and closes it on
    // destruction of this logger.
    FileOS = std::make_unique<raw_fd_ostream>(args&: Fd, /*shouldClose=*/args: true);
  }

  /// \returns true when a logfile was successfully opened.
  bool hasLogFile() const { return FileOS != nullptr; }

  /// \returns the logfile stream. Only valid when hasLogFile() is true.
  raw_ostream &logfile() {
    assert(FileOS && "no logfile!");
    return *FileOS;
  }

  /// \returns true if this SML will log anything either to a file or dbgs().
  /// Can be used to avoid expensive computations that are ignored when logging
  /// is disabled.
  operator bool() const {
    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
  }

private:
  // Owning stream over the opened logfile; null when file logging is off.
  std::unique_ptr<raw_fd_ostream> FileOS;
};
205
206template <typename Ty>
207static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
208 static_assert(
209 !std::is_same_v<Ty, Value>,
210 "do not print values to logs directly, use handleName instead!");
211 LLVM_DEBUG(dbgs() << Val);
212 if (SML.hasLogFile())
213 SML.logfile() << Val;
214 return SML;
215}
216
217/// Calculate the cost of each function in \p M
218/// \param SML Log Helper
219/// \param GetTTI Abstract getter for TargetTransformInfo.
220/// \param M Module to analyze.
221/// \param CostMap[out] Resulting Function -> Cost map.
222/// \return The module's total cost.
223static CostType
224calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
225 DenseMap<const Function *, CostType> &CostMap) {
226 CostType ModuleCost = 0;
227 CostType KernelCost = 0;
228
229 for (auto &Fn : M) {
230 if (Fn.isDeclaration())
231 continue;
232
233 CostType FnCost = 0;
234 const auto &TTI = GetTTI(Fn);
235 for (const auto &BB : Fn) {
236 for (const auto &I : BB) {
237 auto Cost =
238 TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_CodeSize);
239 assert(Cost != InstructionCost::getMax());
240 // Assume expensive if we can't tell the cost of an instruction.
241 CostType CostVal =
242 Cost.getValue().value_or(u: TargetTransformInfo::TCC_Expensive);
243 assert((FnCost + CostVal) >= FnCost && "Overflow!");
244 FnCost += CostVal;
245 }
246 }
247
248 assert(FnCost != 0);
249
250 CostMap[&Fn] = FnCost;
251 assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
252 ModuleCost += FnCost;
253
254 if (isEntryPoint(F: &Fn))
255 KernelCost += FnCost;
256 }
257
258 CostType FnCost = (ModuleCost - KernelCost);
259 CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
260 SML << "=> Total Module Cost: " << ModuleCost << '\n'
261 << " => KernelCost: " << KernelCost << " ("
262 << format(Fmt: "%0.2f", Vals: (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
263 << " => FnsCost: " << FnCost << " ("
264 << format(Fmt: "%0.2f", Vals: (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
265
266 return ModuleCost;
267}
268
269static bool canBeIndirectlyCalled(const Function &F) {
270 if (F.isDeclaration() || isEntryPoint(F: &F))
271 return false;
272 return !F.hasLocalLinkage() ||
273 F.hasAddressTaken(/*PutOffender=*/nullptr,
274 /*IgnoreCallbackUses=*/false,
275 /*IgnoreAssumeLikeCalls=*/true,
276 /*IgnoreLLVMUsed=*/IngoreLLVMUsed: true,
277 /*IgnoreARCAttachedCall=*/false,
278 /*IgnoreCastedDirectCall=*/true);
279}
280
281/// When a function or any of its callees performs an indirect call, this
282/// takes over \ref addAllDependencies and adds all potentially callable
283/// functions to \p Fns so they can be counted as dependencies of the function.
284///
285/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
286/// presence of an indirect call, the function's resource usage is the same as
287/// the most expensive function in the module.
288/// \param M The module.
289/// \param Fns[out] Resulting list of functions.
290static void addAllIndirectCallDependencies(const Module &M,
291 DenseSet<const Function *> &Fns) {
292 for (const auto &Fn : M) {
293 if (canBeIndirectlyCalled(F: Fn))
294 Fns.insert(V: &Fn);
295 }
296}
297
298/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
299/// callee until all reachable functions have been gathered.
300///
301/// \param SML Log Helper
302/// \param CG Call graph for \p Fn's module.
303/// \param Fn Current function to look at.
304/// \param Fns[out] Resulting list of functions.
305/// \param OnlyDirect Whether to only consider direct callees.
306/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
307/// point, either in \p Fn or in one of the function it calls. When that
308/// happens, we fall back to adding all callable functions inside \p Fn's module
309/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
                               const Function &Fn,
                               DenseSet<const Function *> &Fns, bool OnlyDirect,
                               bool &HadIndirectCall) {
  assert(!Fn.isDeclaration());

  const Module &M = *Fn.getParent();
  // Iterative DFS over the call graph; \p Fns doubles as the visited set so
  // each function is expanded at most once.
  SmallVector<const Function *> WorkList({&Fn});
  while (!WorkList.empty()) {
    const auto &CurFn = *WorkList.pop_back_val();
    assert(!CurFn.isDeclaration());

    // Scan for an indirect call. If such a call is found, we have to
    // conservatively assume this can call all non-entrypoint functions in the
    // module.

    for (auto &CGEntry : *CG[&CurFn]) {
      auto *CGNode = CGEntry.second;
      auto *Callee = CGNode->getFunction();
      if (!Callee) {
        // No Function attached to this node: either the "calls external"
        // sentinel node or some other synthetic node.
        if (OnlyDirect)
          continue;

        // Functions have an edge towards CallsExternalNode if they're external
        // declarations, or if they do an indirect call. As we only process
        // definitions here, we know this means the function has an indirect
        // call. We then have to conservatively assume this can call all
        // non-entrypoint functions in the module.
        if (CGNode != CG.getCallsExternalNode())
          continue; // this is another function-less node we don't care about.

        SML << "Indirect call detected in " << getName(V: CurFn)
            << " - treating all non-entrypoint functions as "
               "potential dependencies\n";

        // TODO: Print an ORE as well ?
        addAllIndirectCallDependencies(M, Fns);
        HadIndirectCall = true;
        continue;
      }

      // Declarations live in another module after splitting; only definitions
      // are dependencies we may need to import.
      if (Callee->isDeclaration())
        continue;

      // Only recurse into callees we haven't seen yet.
      auto [It, Inserted] = Fns.insert(V: Callee);
      if (Inserted)
        WorkList.push_back(Elt: Callee);
    }
  }
}
360
361/// Contains information about a function and its dependencies.
362/// This is a splitting root. The splitting algorithm works by
363/// assigning these to partitions.
struct FunctionWithDependencies {
  /// Gathers \p Fn's reachable callees into \ref Dependencies and computes
  /// \ref TotalCost from \p FnCosts (the root's own cost plus every
  /// dependency's cost).
  FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
                           const DenseMap<const Function *, CostType> &FnCosts,
                           const Function *Fn)
      : Fn(Fn) {
    // When Fn is not a kernel, we don't need to collect indirect callees.
    // Resource usage analysis is only performed on kernels, and we collect
    // indirect callees for resource usage analysis.
    addAllDependencies(SML, CG, Fn: *Fn, Fns&: Dependencies,
                       /*OnlyDirect*/ !isEntryPoint(F: Fn), HadIndirectCall&: HasIndirectCall);
    TotalCost = FnCosts.at(Val: Fn);
    for (const auto *Dep : Dependencies) {
      TotalCost += FnCosts.at(Val: Dep);

      // We cannot duplicate functions with external linkage, or functions that
      // may be overridden at runtime.
      HasNonDuplicatableDependecy |=
          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
    }
  }

  /// The splitting root itself.
  const Function *Fn = nullptr;
  /// Every function definition reachable from \ref Fn (excluding Fn itself).
  DenseSet<const Function *> Dependencies;
  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
  bool HasIndirectCall = false;
  /// Whether any of \p Fn's dependencies cannot be duplicated.
  bool HasNonDuplicatableDependecy = false;

  /// Cost of \ref Fn plus the cost of all of its \ref Dependencies.
  CostType TotalCost = 0;

  /// \returns true if this function and its dependencies can be considered
  /// large according to \p Threshold.
  bool isLarge(CostType Threshold) const {
    return TotalCost > Threshold && !Dependencies.empty();
  }
};
400
401/// Calculates how much overlap there is between \p A and \p B.
402/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
403/// and B have no shared elements. Kernels do not count in overlap calculation.
404static float calculateOverlap(const DenseSet<const Function *> &A,
405 const DenseSet<const Function *> &B) {
406 DenseSet<const Function *> Total;
407 for (const auto *F : A) {
408 if (!isEntryPoint(F))
409 Total.insert(V: F);
410 }
411
412 if (Total.empty())
413 return 0.0f;
414
415 unsigned NumCommon = 0;
416 for (const auto *F : B) {
417 if (isEntryPoint(F))
418 continue;
419
420 auto [It, Inserted] = Total.insert(V: F);
421 if (!Inserted)
422 ++NumCommon;
423 }
424
425 return static_cast<float>(NumCommon) / Total.size();
426}
427
428/// Performs all of the partitioning work on \p M.
429/// \param SML Log Helper
430/// \param M Module to partition.
431/// \param NumParts Number of partitions to create.
432/// \param ModuleCost Total cost of all functions in \p M.
433/// \param FnCosts Map of Function -> Cost
434/// \param WorkList Functions and their dependencies to process in order.
435/// \returns The created partitions (a vector of size \p NumParts )
436static std::vector<DenseSet<const Function *>>
437doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
438 CostType ModuleCost,
439 const DenseMap<const Function *, CostType> &FnCosts,
440 const SmallVector<FunctionWithDependencies> &WorkList) {
441
442 SML << "\n--Partitioning Starts--\n";
443
444 // Calculate a "large function threshold". When more than one function's total
445 // import cost exceeds this value, we will try to assign it to an existing
446 // partition to reduce the amount of duplication needed.
447 //
448 // e.g. let two functions X and Y have a import cost of ~10% of the module, we
449 // assign X to a partition as usual, but when we get to Y, we check if it's
450 // worth also putting it in Y's partition.
451 const CostType LargeFnThreshold =
452 LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
453 : std::numeric_limits<CostType>::max();
454
455 std::vector<DenseSet<const Function *>> Partitions;
456 Partitions.resize(new_size: NumParts);
457
458 // Assign functions to partitions, and try to keep the partitions more or
459 // less balanced. We do that through a priority queue sorted in reverse, so we
460 // can always look at the partition with the least content.
461 //
462 // There are some cases where we will be deliberately unbalanced though.
463 // - Large functions: we try to merge with existing partitions to reduce code
464 // duplication.
465 // - Functions with indirect or external calls always go in the first
466 // partition (P0).
467 auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
468 const std::pair<PartitionID, CostType> &b) {
469 // When two partitions have the same cost, assign to the one with the
470 // biggest ID first. This allows us to put things in P0 last, because P0 may
471 // have other stuff added later.
472 if (a.second == b.second)
473 return a.first < b.first;
474 return a.second > b.second;
475 };
476
477 // We can't use priority_queue here because we need to be able to access any
478 // element. This makes this a bit inefficient as we need to sort it again
479 // everytime we change it, but it's a very small array anyway (likely under 64
480 // partitions) so it's a cheap operation.
481 std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
482 for (unsigned I = 0; I < NumParts; ++I)
483 BalancingQueue.emplace_back(args&: I, args: 0);
484
485 // Helper function to handle assigning a function to a partition. This takes
486 // care of updating the balancing queue.
487 const auto AssignToPartition = [&](PartitionID PID,
488 const FunctionWithDependencies &FWD) {
489 auto &FnsInPart = Partitions[PID];
490 FnsInPart.insert(V: FWD.Fn);
491 FnsInPart.insert(I: FWD.Dependencies.begin(), E: FWD.Dependencies.end());
492
493 SML << "assign " << getName(V: *FWD.Fn) << " to P" << PID << "\n -> ";
494 if (!FWD.Dependencies.empty()) {
495 SML << FWD.Dependencies.size() << " dependencies added\n";
496 };
497
498 // Update the balancing queue. we scan backwards because in the common case
499 // the partition is at the end.
500 for (auto &[QueuePID, Cost] : reverse(C&: BalancingQueue)) {
501 if (QueuePID == PID) {
502 CostType NewCost = 0;
503 for (auto *Fn : Partitions[PID])
504 NewCost += FnCosts.at(Val: Fn);
505
506 SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
507 if (Cost) {
508 SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
509 << "% increase)";
510 }
511 SML << '\n';
512
513 Cost = NewCost;
514 }
515 }
516
517 sort(C&: BalancingQueue, Comp: ComparePartitions);
518 };
519
520 for (auto &CurFn : WorkList) {
521 // When a function has indirect calls, it must stay in the first partition
522 // alongside every reachable non-entry function. This is a nightmare case
523 // for splitting as it severely limits what we can do.
524 if (CurFn.HasIndirectCall) {
525 SML << "Function with indirect call(s): " << getName(V: *CurFn.Fn)
526 << " defaulting to P0\n";
527 AssignToPartition(0, CurFn);
528 continue;
529 }
530
531 // When a function has non duplicatable dependencies, we have to keep it in
532 // the first partition as well. This is a conservative approach, a
533 // finer-grained approach could keep track of which dependencies are
534 // non-duplicatable exactly and just make sure they're grouped together.
535 if (CurFn.HasNonDuplicatableDependecy) {
536 SML << "Function with externally visible dependency "
537 << getName(V: *CurFn.Fn) << " defaulting to P0\n";
538 AssignToPartition(0, CurFn);
539 continue;
540 }
541
542 // Be smart with large functions to avoid duplicating their dependencies.
543 if (CurFn.isLarge(Threshold: LargeFnThreshold)) {
544 assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
545 SML << "Large Function: " << getName(V: *CurFn.Fn)
546 << " - looking for partition with at least "
547 << format(Fmt: "%0.2f", Vals: LargeFnOverlapForMerge * 100) << "% overlap\n";
548
549 bool Assigned = false;
550 for (const auto &[PID, Fns] : enumerate(First&: Partitions)) {
551 float Overlap = calculateOverlap(A: CurFn.Dependencies, B: Fns);
552 SML << " => " << format(Fmt: "%0.2f", Vals: Overlap * 100) << "% overlap with P"
553 << PID << '\n';
554 if (Overlap > LargeFnOverlapForMerge) {
555 SML << " selecting P" << PID << '\n';
556 AssignToPartition(PID, CurFn);
557 Assigned = true;
558 }
559 }
560
561 if (Assigned)
562 continue;
563 }
564
565 // Normal "load-balancing", assign to partition with least pressure.
566 auto [PID, CurCost] = BalancingQueue.back();
567 AssignToPartition(PID, CurFn);
568 }
569
570 if (SML) {
571 for (const auto &[Idx, Part] : enumerate(First&: Partitions)) {
572 CostType Cost = 0;
573 for (auto *Fn : Part)
574 Cost += FnCosts.at(Val: Fn);
575 SML << "P" << Idx << " has a total cost of " << Cost << " ("
576 << format(Fmt: "%0.2f", Vals: (float(Cost) / ModuleCost) * 100)
577 << "% of source module)\n";
578 }
579
580 SML << "--Partitioning Done--\n\n";
581 }
582
583 // Check no functions were missed.
584#ifndef NDEBUG
585 DenseSet<const Function *> AllFunctions;
586 for (const auto &Part : Partitions)
587 AllFunctions.insert(Part.begin(), Part.end());
588
589 for (auto &Fn : M) {
590 if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
591 assert(AllFunctions.contains(&Fn) && "Missed a function?!");
592 }
593 }
594#endif
595
596 return Partitions;
597}
598
599static void externalize(GlobalValue &GV) {
600 if (GV.hasLocalLinkage()) {
601 GV.setLinkage(GlobalValue::ExternalLinkage);
602 GV.setVisibility(GlobalValue::HiddenVisibility);
603 }
604
605 // Unnamed entities must be named consistently between modules. setName will
606 // give a distinct name to each such entity.
607 if (!GV.hasName())
608 GV.setName("__llvmsplit_unnamed");
609}
610
611static bool hasDirectCaller(const Function &Fn) {
612 for (auto &U : Fn.uses()) {
613 if (auto *CB = dyn_cast<CallBase>(Val: U.getUser()); CB && CB->isCallee(U: &U))
614 return true;
615 }
616 return false;
617}
618
/// Driver for the whole splitting process: prepares \p M (externalization),
/// collects splitting roots and their costs, partitions them, then clones
/// each partition into a fresh module handed to \p ModuleCallback.
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to split. NOTE(review): M itself is mutated here
///        (linkage/visibility/name changes from externalization) even though
///        the callers receive clones — confirm this is intended alongside
///        run() returning PreservedAnalyses::all().
/// \param N Number of partition modules to create.
/// \param ModuleCallback Invoked once per created partition module, in order.
static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned N,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {

  SplitModuleLogger SML(M);

  CallGraph CG(M);

  // Externalize functions whose address are taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits a "undefined hidden symbol" linker
  // error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
  for (auto &Fn : M) {
    if (Fn.hasAddressTaken()) {
      if (Fn.hasLocalLinkage()) {
        SML << "[externalize] " << Fn.getName()
            << " because its address is taken\n";
      }
      externalize(GV&: Fn);
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turns helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        SML << "[externalize] GV " << GV.getName() << '\n';
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  DenseMap<const Function *, CostType> FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, CostMap&: FnCosts);

  // First, gather every kernel into the worklist.
  SmallVector<FunctionWithDependencies> WorkList;
  for (auto &Fn : M) {
    if (isEntryPoint(F: &Fn) && !Fn.isDeclaration())
      WorkList.emplace_back(Args&: SML, Args&: CG, Args&: FnCosts, Args: &Fn);
  }

  // Then, find missing functions that need to be considered as additional
  // roots. These can't be called in theory, but in practice we still have to
  // handle them to avoid linker errors.
  {
    DenseSet<const Function *> SeenFunctions;
    for (const auto &FWD : WorkList) {
      SeenFunctions.insert(V: FWD.Fn);
      SeenFunctions.insert(I: FWD.Dependencies.begin(), E: FWD.Dependencies.end());
    }

    for (auto &Fn : M) {
      // If this function is not part of any kernel's dependencies and isn't
      // directly called, consider it as a root.
      if (!Fn.isDeclaration() && !isEntryPoint(F: &Fn) &&
          !SeenFunctions.count(V: &Fn) && !hasDirectCaller(Fn)) {
        WorkList.emplace_back(Args&: SML, Args&: CG, Args&: FnCosts, Args: &Fn);
      }
    }
  }

  // Sort the worklist so the most expensive roots are seen first.
  sort(C&: WorkList, Comp: [&](auto &A, auto &B) {
    // Sort by total cost, and if the total cost is identical, sort
    // alphabetically.
    if (A.TotalCost == B.TotalCost)
      return A.Fn->getName() < B.Fn->getName();
    return A.TotalCost > B.TotalCost;
  });

  if (SML) {
    SML << "Worklist\n";
    for (const auto &FWD : WorkList) {
      SML << "[root] " << getName(V: *FWD.Fn) << " (totalCost:" << FWD.TotalCost
          << " indirect:" << FWD.HasIndirectCall
          << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
          << ")\n";
      // Sort function names before printing to ensure determinism.
      SmallVector<std::string> SortedDepNames;
      SortedDepNames.reserve(N: FWD.Dependencies.size());
      for (const auto *Dep : FWD.Dependencies)
        SortedDepNames.push_back(Elt: getName(V: *Dep));
      sort(C&: SortedDepNames);

      for (const auto &Name : SortedDepNames)
        SML << " [dependency] " << Name << '\n';
    }
  }

  // This performs all of the partitioning work.
  auto Partitions = doPartitioning(SML, M, NumParts: N, ModuleCost, FnCosts, WorkList);
  assert(Partitions.size() == N);

  // If we didn't externalize GVs, then local GVs need to be conservatively
  // imported into every module (including their initializers), and then cleaned
  // up afterwards.
  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
    // We conservatively import private/internal GVs into every module and clean
    // them up afterwards.
    const auto *Var = dyn_cast<GlobalVariable>(Val: GV);
    return Var && Var->hasLocalLinkage();
  };

  SML << "Creating " << N << " modules...\n";
  unsigned TotalFnImpls = 0;
  for (unsigned I = 0; I < N; ++I) {
    const auto &FnsInPart = Partitions[I];

    // Clone only this partition's functions (plus shared globals) into a new
    // module; VMap ties originals to their clones.
    ValueToValueMapTy VMap;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, ShouldCloneDefinition: [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(Val: GV))
            return FnsInPart.contains(V: Fn);

          if (NeedsConservativeImport(GV))
            return true;

          // Everything else goes in the first partition.
          return I == 0;
        }));

    // Clean-up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(Range: MPart->globals())) {
      if (NeedsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    // Count definitions (and kernels among them) for the summary log.
    unsigned NumAllFns = 0, NumKernels = 0;
    for (auto &Cur : *MPart) {
      if (!Cur.isDeclaration()) {
        ++NumAllFns;
        if (isEntryPoint(F: &Cur))
          ++NumKernels;
      }
    }
    TotalFnImpls += NumAllFns;
    SML << " - Module " << I << " with " << NumAllFns << " functions ("
        << NumKernels << " kernels)\n";
    ModuleCallback(std::move(MPart));
  }

  SML << TotalFnImpls << " function definitions across all modules ("
      << format(Fmt: "%0.2f", Vals: (float(TotalFnImpls) / FnCosts.size()) * 100)
      << "% of original module)\n";
}
775} // namespace
776
777PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
778 ModuleAnalysisManager &MAM) {
779 FunctionAnalysisManager &FAM =
780 MAM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
781 const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
782 return FAM.getResult<TargetIRAnalysis>(IR&: F);
783 };
784 splitAMDGPUModule(GetTTI: TTIGetter, M, N, ModuleCallback);
785 // We don't change the original module.
786 return PreservedAnalyses::all();
787}
788