//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
///  - SplitGraph & associated classes
///    - Graph representation of the module and of the dependencies that
///      matter for splitting.
///  - RecursiveSearchSplitting
///    - Core splitting algorithm.
///  - SplitProposal
///    - Represents a suggested solution for splitting the input module. These
///      solutions can be scored to determine the best one when multiple
///      solutions are available.
///  - Driver/pass "run" function glues everything together.
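///
/// As an illustrative example: if kernels K1 and K2 both call a non-copyable
/// helper H, the pass keeps K1, K2 and H in a single cluster so that only one
/// copy of H is emitted, whereas a copyable helper called by both kernels may
/// simply be duplicated into each kernel's partition.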

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cassert>
#include <cmath>
#include <memory>
#include <utility>
#include <vector>

#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif

#define DEBUG_TYPE "amdgpu-split-module"

namespace llvm {
namespace {

static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."),
    cl::init(8));

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));
static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local "
             "linkage; may cause globals to be duplicated which increases "
             "binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
    cl::desc(
        "disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

#ifndef NDEBUG
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
#endif

struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};

//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//

using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;
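// Note: InvalidPID is a sentinel meaning "no partition"; as an unsigned
// value, -1 wraps around to the maximum representable partition ID.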

/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object that prints the ratio (Num/Dem) as a
/// percentage, using "%0.2f".
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}

/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// Kernel functions and external functions fall into this category. If we were
/// to clone them, we would end up with multiple symbol definitions and a very
/// unhappy linker.
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() ||
         AMDGPU::isEntryFunctionCC(F.getCallingConv());
}

/// If \p GV has local linkage, make it external + hidden.
static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
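  // (LLVM uniques clashing names by appending a numeric suffix, so each
  // unnamed entity ends up as e.g. "__llvmsplit_unnamed.1".)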
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}

/// Cost analysis function. Calculates the cost of each function in \p M.
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param[out] CostMap Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal = Cost.isValid()
                               ? Cost.getValue()
                               : (CostType)TargetTransformInfo::TCC_Expensive;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  if (CostMap.empty())
    return 0;

  assert(ModuleCost);
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}

/// \return true if \p F can be indirectly called
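///
/// In practice this means: the function is visible outside the module, or its
/// address is observably taken (ignoring assume-like calls, llvm.used entries,
/// and direct calls through a casted function pointer).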
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}

//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//

/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the module,
/// but it is not guaranteed that the graph is strictly equal to the CG. It
/// currently always is, but it's designed in a way that would eventually allow
/// us to create abstract nodes, or nodes for different entities such as global
/// variables or any other meaningful constraint we must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    /// The nodes are related through a direct call. This is a "strong" edge as
    /// it means the Src will directly reference the Dst.
    DirectCall,
    /// The nodes are related through an indirect call.
    /// This is a "weaker" edge and is only considered when traversing the graph
    /// starting from a kernel. We need this edge for resource usage analysis.
    ///
    /// The reason why we have this edge in the first place is due to how
    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
    /// the resource usage of the kernel containing the indirect call is the
    /// max resource usage of all functions that can be indirectly called.
    IndirectCall,
  };

  /// An edge between two nodes. Edges are directional, and tagged with a
  /// "kind".
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src; ///< Source
    Node *Dst; ///< Destination
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

#ifndef NDEBUG
  bool verifyGraph() const;
#endif

  bool empty() const { return Nodes.empty(); }
  iterator_range<nodes_iterator> nodes() const {
    return {Nodes.begin(), Nodes.end()};
  }
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
  /// IDs).
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
  /// updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  // Create a new edge between two nodes and add it to both nodes.
  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Final list of nodes with stable ordering.
  SmallVector<Node *> Nodes;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible objects, so as a small optimization we
  // use a BumpPtrAllocator which avoids destructor calls but also makes
  // allocation faster.
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator");
  BumpPtrAllocator EdgesPool;
};

/// Nodes in the SplitGraph contain both incoming and outgoing edges.
/// Incoming edges have this node as their Dst, and Outgoing ones have this node
/// as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (auto *Fn = dyn_cast<Function>(&GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
  }

  /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
  /// nodes in the graph. This ID can be used as an index in a BitVector.
  unsigned getID() const { return ID; }

  const Function &getFunction() const { return cast<Function>(GV); }

  /// \returns the cost to import this component into a given module, not
  /// accounting for any dependencies that may need to be imported as well.
  CostType getIndividualCost() const { return IndividualCost; }

  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this is an entry point in the graph. Entry points are
  /// defined as follows: if you take all entry points in the graph, and iterate
  /// their dependencies, you are guaranteed to visit all nodes in the graph at
  /// least once.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return !IncomingEdges.empty(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return !OutgoingEdges.empty(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visit all children of this node in a recursive fashion. Also visits Self.
  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
  /// DirectCall edges.
  ///
  /// \param Visitor Visitor Function.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Adds the dependencies of this node in \p BV by setting the bit
  /// corresponding to each node.
  ///
  /// Implemented using \ref visitAllDependencies, hence it follows the same
  /// rules regarding dependencies traversal.
  ///
  /// \param[out] BV The bitvector where the bits should be set.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
  // together)
  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};

void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();
  // FIXME: If this can access SplitGraph in the future, use a BitVector
  // instead.
  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}

/// Checks if \p I has MD_callees metadata and, if it does, parses it and puts
/// the functions in \p Callees.
///
/// \returns true if there was metadata and it was parsed correctly. false if
/// there was no MD or if it contained unknown entries and parsing failed.
/// If this returns false, \p Callees will contain incomplete information
/// and must not be used.
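///
/// As a sketch, the kind of IR this parses (illustrative):
///   %res = call i32 %fptr(i32 %x), !callees !0
///   ...
///   !0 = !{ptr @a, ptr @b}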
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}

void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // FIXME(?): Is the callgraph really worth using if we have to iterate the
  // function again whenever it fails to give us enough information?

  // We build the graph by just iterating all functions in the module and
  // working on their direct callees. At the end, all nodes should be linked
  // together as expected.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    // Look at direct callees and create the necessary edges in the graph.
    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // Keep track of this function if it contains an indirect call and/or if it
    // can be indirectly called.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(Fn)) {
        // Look at all calls without a direct callee.
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        // Inline assembly can be ignored here, as we do not treat it as an
        // indirect call.
        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << "    found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;
        // If we failed to parse any !callees MD, or some was missing,
        // the entire KnownCallees list is now unreliable.
        KnownCallees.clear();

        // Everything else is handled conservatively. If we fall into the
        // conservative case don't bother analyzing further.
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << "    indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    Node &N = getNode(Cache, Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Post-process functions with indirect calls.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }

  // Now, find all entry points.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    // Functions with an Entry CC are always graph entry points too.
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    // This can be another entry point if it's not reachable by a kernel.
    // TODO: We could sort all of the possible new entries in a stable order
    // (e.g. by cost), then consume them one by one until
    // NodesReachableByKernels is all 1s. It'd allow us to avoid
    // considering some nodes as non-entries in some specific cases.
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }

#ifndef NDEBUG
  assert(verifyGraph());
#endif
}

#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check we respect the promise of Node::isGraphEntryPoint.
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
#endif

CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  assert(&getNode(N->getID()) == N);
  return *N;
}

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}

//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//

/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies, resulting in multiple
/// proposals, and choose the best one out of them.
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }

  // Find the cheapest partition (lowest cost). In case of ties, always returns
  // the highest partition number.
  unsigned findCheapestPartition() const;

  /// Calculate the CodeSize and Bottleneck scores.
  void calculateScores();

#ifndef NDEBUG
  void verifyCompleteness() const;
#endif

  /// Only available after \ref calculateScores is called.
  ///
  /// A positive number indicating the relative code size of this proposal,
  /// e.g. 1.2 means this proposal adds roughly 20% code size by duplicating
  /// some functions across partitions.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// A perfect score would be 1.0 (no duplication); anything well beyond that
  /// indicates heavy code duplication.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// Only available after \ref calculateScores is called.
  ///
  /// A number between [0, 1] which indicates how big of a bottleneck is
  /// expected from the largest partition.
  ///
  /// A score of 1.0 means the biggest partition is as big as the source module,
  /// so build time will be equal to or greater than the build time of the
  /// initial input.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// This is one of the metrics used to estimate this proposal's build time.
  double getBottleneckScore() const { return BottleneckScore; }

private:
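  /// Recomputes the cost of every partition and keeps \ref TotalCost in sync.
  /// Note that \p PID is currently unused: all partitions are rescored.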
  void updateScore(unsigned PID) {
    assert(SG);
    for (auto &[PCost, Nodes] : Partitions) {
      TotalCost -= PCost;
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  /// \see getCodeSizeScore
  double CodeSizeScore = 0.0;
  /// \see getBottleneckScore
  double BottleneckScore = 0.0;
  /// Aggregated cost of all partitions
  CostType TotalCost = 0;

  const SplitGraph *SG = nullptr;
  std::string Name;

  std::vector<std::pair<CostType, BitVector>> Partitions;
};

void SplitProposal::print(raw_ostream &OS) const {
  assert(SG);

  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}

unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
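    // Note: "<=" ensures that on ties the highest partition number wins, as
    // documented above.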
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}

void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  assert(SG);
  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}

#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif

//===-- RecursiveSearchStrategy -------------------------------------------===//

/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple
/// possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we haven't
/// reached maximum search depth, we recurse and explore both options and their
/// consequences. Both branches will yield a proposal, and the driver will grade
/// both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice. Most
/// of the time we will just use the least-pressured (cheapest) partition, but
/// if a cluster is particularly big and there is a good amount of overlap with
/// an existing partition, we will choose that partition instead.
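///
/// Note that every branch point doubles the number of explored paths, so with
/// a maximum depth of N this can end up submitting on the order of 2^N
/// proposals.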
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  /// Collects all graph entry points' clusters and sorts them so the most
  /// expensive clusters are viewed first. This will merge clusters together if
  /// they share a non-copyable dependency.
  void setupWorkList();

  /// Recursive function that assigns the worklist item at \p Idx into a
  /// partition of \p SP.
  ///
  /// \p Depth is the current search depth. When this value is equal to
  /// \ref MaxDepth, we can no longer recurse.
  ///
  /// This function only recurses if there is more than one possible assignment,
  /// otherwise it is iterative to avoid creating a call stack that is as big as
  /// \ref WorkList.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \return A pair: first element is the PID of the partition that has the
  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
  /// found with at least one element in common. The second element is the
  /// aggregated cost of all dependencies in common between \p Entry and that
  /// partition.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  // A Cluster is considered large when its cost, excluding entry points,
  // exceeds this value.
  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // Arbitrary max value as a safeguard. Anything above 10 will already be
  // slow, this is just a max value to prevent extreme resource exhaustion or
  // unbounded run time.
  if (MaxDepth > 16)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");
  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}

void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(/*Depth=*/0, /*Idx=*/0, SP);
  }
}

void RecursiveSearchSplitting::setupWorkList() {
  // e.g. if A and B are two worklist items, and they both call a non-copyable
  // dependency C, this does:
  //    A=C
  //    B=C
  //    => NodeEC will create a single group (A, B, C) and we create a new
  //    WorkList entry for that group.

  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate costs and other useful information.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

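  // Sort the worklist so the most expensive clusters come first; break ties on
  // cost excluding entry points, then on the number of non-entry nodes, then
  // on cluster size, so the order stays deterministic.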
  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << "  - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}

void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    //
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // Default option is to do load-balancing, AKA assign to least pressured
    // partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // Explore assigning to the kernel that contains the most dependencies in
    // common.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // We can choose to explore only one path if we only have one valid path,
    // or if we reached maximum search depth and can no longer branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID) // no similar PID found
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      // We have to choose one path. Use a heuristic to guess which one will be
      // more appropriate.
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Check if the amount of code in common makes it worth it.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge) {
          // For debug, just print "L", so we'll see "L3=P3" for instance, which
          // will mean we reached max depth and chose P3 based on this
          // heuristic.
          LLVM_DEBUG(dbgs() << 'L');
          SinglePIDToTry = MostSimilarPID;
        }
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 2: Explore candidates.

    // When we only explore one possible path, and thus branch depth doesn't
    // increase, do not recurse, iterate instead.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      // Only one path to explore, don't clone SP, don't increase depth.
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // We explore multiple paths: recurse at increased depth, then stop this
    // function.

    LLVM_DEBUG(dbgs() << '\n');

    // lb = load balancing = put in cheapest partition
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    // ms = most similar = put in partition with the most in common
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  // Step 3: If we assigned all WorkList items, submit the proposal.

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  LLVM_DEBUG(dbgs() << '\n');
  SubmitProposal(SP);
}

std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
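  // A cluster with no non-entry nodes cannot overlap with any partition: entry
  // points belong to exactly one worklist entry.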
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  // We take the partition that is the most similar using Cost as a metric.
  // So we take the set of nodes in common, compute their aggregated cost, and
  // pick the partition with the highest cost in common.
  unsigned ChosenPID = InvalidPID;
  CostType ChosenCost = 0;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    BitVector BV = SP[PID];
    BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?!

    if (BV.none())
      continue;

    const CostType Cost = SG.calculateCost(BV);

    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
        (ChosenCost == Cost && PID > ChosenPID)) {
      ChosenPID = PID;
      ChosenCost = Cost;
    }
  }

  return {ChosenPID, ChosenCost};
}

//===----------------------------------------------------------------------===//
// DOTGraph Printing Support
//===----------------------------------------------------------------------===//

const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;

} // namespace

template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  using ChildIteratorType = SplitGraphEdgeDstIterator;

  using EdgeRef = const SplitGraph::Edge *;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getGraphName(const SplitGraph &SG) {
    return SG.getModule().getName().str();
  }

  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
    return N->getName().str();
  }

  static std::string getNodeDescription(const SplitGraph::Node *N,
                                        const SplitGraph &SG) {
    std::string Result;
    if (N->isEntryFunctionCC())
      Result += "entry-fn-cc ";
    if (N->isNonCopyable())
      Result += "non-copyable ";
    Result += "cost:" + std::to_string(N->getIndividualCost());
    return Result;
  }

  static std::string getNodeAttributes(const SplitGraph::Node *N,
                                       const SplitGraph &SG) {
    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
  }

  static std::string getEdgeAttributes(const SplitGraph::Node *N,
                                       SplitGraphEdgeDstIterator EI,
                                       const SplitGraph &SG) {
    switch ((*EI.getCurrent())->Kind) {
    case SplitGraph::EdgeKind::DirectCall:
      return "";
    case SplitGraph::EdgeKind::IndirectCall:
      return "style=\"dashed\"";
    }
    llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
  }
};

//===----------------------------------------------------------------------===//
// Driver
//===----------------------------------------------------------------------===//

namespace {

// If we didn't externalize GVs, then local GVs need to be conservatively
// imported into every module (including their initializers), and then cleaned
// up afterwards.
static bool needsConservativeImport(const GlobalValue *GV) {
  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
    return Var->hasLocalLinkage();
  return isa<GlobalAlias>(GV);
}

/// Prints a summary of the partition \p N, represented by module \p M, to \p
/// OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) {
  OS << "*** Partition P" << N << " ***\n";

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration())
      OS << " - [function] " << Fn.getName() << "\n";
  }

  for (const auto &GV : M.globals()) {
    if (GV.hasInitializer())
      OS << " - [global] " << GV.getName() << "\n";
  }

  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
     << "% of the source\n";
}

static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");

  LLVM_DEBUG({
    New.verifyCompleteness();
    if (DebugProposalSearch)
      New.print(dbgs());
  });

  const double CurBScore = Best.getBottleneckScore();
  const double CurCSScore = Best.getCodeSizeScore();
  const double NewBScore = New.getBottleneckScore();
  const double NewCSScore = New.getCodeSizeScore();

  // TODO: Improve this
  // We can probably lower the precision of the comparison at first,
  // e.g. if we have
  //   - (Current): BScore: 0.489 CSCore 1.105
  //   - (New):     BScore: 0.475 CSCore 1.305
  // Currently we'd choose the new one because the bottleneck score is
  // lower, but the new one duplicates more code. It may be worth it to
  // discard the new proposal as the impact on build time is negligible.

  // Compare them
  bool IsBest = false;
  if (NewBScore < CurBScore)
    IsBest = true;
  else if (NewBScore == CurBScore)
    IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.

  if (IsBest)
    Best = std::move(New);

  LLVM_DEBUG(if (DebugProposalSearch) {
    if (IsBest)
      dbgs() << "[search] new best proposal!\n";
    else
      dbgs() << "[search] discarding - not profitable\n";
  });
}

/// Trivial helper to create an identical copy of \p M.
static std::unique_ptr<Module> cloneAll(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}

/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgOutput if requested.
static void writeDOTGraph(const SplitGraph &SG) {
  if (ModuleDotCfgOutput.empty())
    return;

  std::error_code EC;
  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
  if (EC) {
    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
           << "' - DOTGraph will not be printed\n";
    return;
  }
  WriteGraph(OS, SG, /*ShortName=*/false,
             /*Title=*/SG.getModule().getName());
}
1374 | |
1375 | static void splitAMDGPUModule( |
1376 | GetTTIFn GetTTI, Module &M, unsigned NumParts, |
1377 | function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { |
1378 | CallGraph CG(M); |
1379 | |
1380 | // Externalize functions whose address are taken. |
1381 | // |
1382 | // This is needed because partitioning is purely based on calls, but sometimes |
1383 | // a kernel/function may just look at the address of another local function |
1384 | // and not do anything (no calls). After partitioning, that local function may |
1385 | // end up in a different module (so it's just a declaration in the module |
1386 | // where its address is taken), which emits a "undefined hidden symbol" linker |
1387 | // error. |
1388 | // |
1389 | // Additionally, it guides partitioning to not duplicate this function if it's |
1390 | // called directly at some point. |
1391 | // |
1392 | // TODO: Could we be smarter about this ? This makes all functions whose |
1393 | // addresses are taken non-copyable. We should probably model this type of |
1394 | // constraint in the graph and use it to guide splitting, instead of |
1395 | // externalizing like this. Maybe non-copyable should really mean "keep one |
1396 | // visible copy, then internalize all other copies" for some functions? |
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      // TODO: Should aliases count? Probably not, but they're so rare I'm not
      // sure it's worth fixing.
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
                   dbgs() << " because its address is taken\n");
        externalize(Fn);
      }
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
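  // For instance (illustrative, not from a real module), a large `internal
  // global [4096 x i32]` lookup table referenced by kernels in several
  // partitions would otherwise be cloned, initializer included, into each of
  // those partitions; once externalized, a single partition keeps the
  // definition and the others merely reference it.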
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      // Note: externalize is deliberately called on every GV, not just local
      // ones; see the helper above. It only relinks local GVs, but it also
      // names unnamed GVs so they can be matched across partitions.
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);

  // Build the SplitGraph, which represents the module's functions and models
  // their dependencies accurately.
  SplitGraph SG(M, FnCosts, ModuleCost);
  SG.buildGraph(CG);

  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });

  writeDOTGraph(SG);

  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");

  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(*Proposal, std::move(SP));
  };

  // TODO: It would be very easy to create new strategies by just adding a base
  // class to RecursiveSearchSplitting and abstracting it away.
  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG(Proposal->print(dbgs()));

  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(PartitionSummariesOutput, EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // One module will import all GlobalValues that are not Functions and are
  // not subject to conservative import.
  bool ImportAllGVs = true;
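  // Illustrative walkthrough (hypothetical module): with NumParts = 3, an
  // externalized global @lut is cloned with its definition into the first
  // non-empty partition only (ImportAllGVs is cleared right after it is
  // created), and later partitions get a mere declaration of it. GVs for
  // which needsConservativeImport returns true (e.g. aliases, per the FIXME
  // below) are cloned into every partition, then erased again wherever they
  // end up unused.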

  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");
    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");

    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(&SG.getNode(NodeID).getFunction());

    // Don't create empty modules.
    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "[split] P" << PID
                        << " is empty, not creating module\n");
      continue;
    }

    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            if (FnsInPart.contains(Fn)) {
              PartCost += SG.getCost(*Fn);
              return true;
            }
            return false;
          }

          // Everything else goes in the first non-empty module we create,
          // except GVs needing conservative import, which go in every module.
          return ImportAllGVs || needsConservativeImport(GV);
        }));

    ImportAllGVs = false;

    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
    // bugs are possible.

    // Clean up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->global_values())) {
      if (needsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);

    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  SplitModuleTimer SMT(
      "total", "total pass runtime (incl. potentially waiting for lockfile)");

  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  bool Done = false;
#ifndef NDEBUG
  if (UseLockFile) {
    SmallString<128> LockFilePath;
    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
                      << "'\n");

    while (true) {
      llvm::LockFileManager Lock(LockFilePath.str());
      bool Owned;
      if (Error Err = Lock.tryLock().moveInto(Owned)) {
        consumeError(std::move(Err));
        LLVM_DEBUG(
            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                      "output may be mangled by other processes\n");
      } else if (!Owned) {
        switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) {
        case WaitForUnlockResult::Success:
          break;
        case WaitForUnlockResult::OwnerDied:
          continue; // Try again to get the lock.
        case WaitForUnlockResult::Timeout:
          LLVM_DEBUG(
              dbgs()
              << "[amdgpu-split-module] unable to acquire lockfile, debug "
                 "output may be mangled by other processes\n");
          Lock.unsafeMaybeUnlock();
          break; // Give up.
        }
      }

      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
      Done = true;
      break;
    }
  }
#endif

  if (!Done)
    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);

  // We can change linkage/visibilities in the input, so consider that nothing
  // is preserved, just to be safe. This pass runs last anyway.
  return PreservedAnalyses::none();
}
} // namespace llvm