| 1 | //===- AMDGPUSplitModule.cpp ----------------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file Implements a module splitting algorithm designed to support the |
| 10 | /// FullLTO --lto-partitions option for parallel codegen. |
| 11 | /// |
| 12 | /// The role of this module splitting pass is the same as |
| 13 | /// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions |
| 14 | /// across a set of N partitions to allow for parallel codegen. |
| 15 | /// |
| 16 | /// The similarities mostly end here, as this pass achieves load-balancing in a |
| 17 | /// more elaborate fashion which is targeted towards AMDGPU modules. It can take |
| 18 | /// advantage of the structure of AMDGPU modules (which are mostly |
| 19 | /// self-contained) to allow for more efficient splitting without affecting |
/// codegen negatively, or causing inaccurate resource usage analysis.
| 21 | /// |
| 22 | /// High-level pass overview: |
| 23 | /// - SplitGraph & associated classes |
| 24 | /// - Graph representation of the module and of the dependencies that |
| 25 | /// matter for splitting. |
| 26 | /// - RecursiveSearchSplitting |
| 27 | /// - Core splitting algorithm. |
| 28 | /// - SplitProposal |
| 29 | /// - Represents a suggested solution for splitting the input module. These |
| 30 | /// solutions can be scored to determine the best one when multiple |
| 31 | /// solutions are available. |
| 32 | /// - Driver/pass "run" function glues everything together. |
| 33 | |
| 34 | #include "AMDGPUSplitModule.h" |
| 35 | #include "AMDGPUTargetMachine.h" |
| 36 | #include "Utils/AMDGPUBaseInfo.h" |
| 37 | #include "llvm/ADT/DenseMap.h" |
| 38 | #include "llvm/ADT/EquivalenceClasses.h" |
| 39 | #include "llvm/ADT/GraphTraits.h" |
| 40 | #include "llvm/ADT/SmallVector.h" |
| 41 | #include "llvm/ADT/StringExtras.h" |
| 42 | #include "llvm/ADT/StringRef.h" |
| 43 | #include "llvm/Analysis/CallGraph.h" |
| 44 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 45 | #include "llvm/IR/Function.h" |
| 46 | #include "llvm/IR/GlobalAlias.h" |
| 47 | #include "llvm/IR/InstIterator.h" |
| 48 | #include "llvm/IR/Instruction.h" |
| 49 | #include "llvm/IR/Module.h" |
| 50 | #include "llvm/IR/Value.h" |
| 51 | #include "llvm/Support/Allocator.h" |
| 52 | #include "llvm/Support/Casting.h" |
| 53 | #include "llvm/Support/DOTGraphTraits.h" |
| 54 | #include "llvm/Support/Debug.h" |
| 55 | #include "llvm/Support/GraphWriter.h" |
| 56 | #include "llvm/Support/Path.h" |
| 57 | #include "llvm/Support/Timer.h" |
| 58 | #include "llvm/Support/raw_ostream.h" |
| 59 | #include "llvm/Transforms/Utils/Cloning.h" |
| 60 | #include <cassert> |
| 61 | #include <cmath> |
| 62 | #include <memory> |
| 63 | #include <utility> |
| 64 | #include <vector> |
| 65 | |
| 66 | #ifndef NDEBUG |
| 67 | #include "llvm/Support/LockFileManager.h" |
| 68 | #endif |
| 69 | |
| 70 | #define DEBUG_TYPE "amdgpu-split-module" |
| 71 | |
| 72 | namespace llvm { |
| 73 | namespace { |
| 74 | |
| 75 | static cl::opt<unsigned> MaxDepth( |
| 76 | "amdgpu-module-splitting-max-depth" , |
| 77 | cl::desc( |
| 78 | "maximum search depth. 0 forces a greedy approach. " |
| 79 | "warning: the algorithm is up to O(2^N), where N is the max depth." ), |
| 80 | cl::init(Val: 8)); |
| 81 | |
| 82 | static cl::opt<float> LargeFnFactor( |
| 83 | "amdgpu-module-splitting-large-threshold" , cl::init(Val: 2.0f), cl::Hidden, |
| 84 | cl::desc( |
| 85 | "when max depth is reached and we can no longer branch out, this " |
| 86 | "value determines if a function is worth merging into an already " |
| 87 | "existing partition to reduce code duplication. This is a factor " |
| 88 | "of the ideal partition size, e.g. 2.0 means we consider the " |
| 89 | "function for merging if its cost (including its callees) is 2x the " |
| 90 | "size of an ideal partition." )); |
| 91 | |
| 92 | static cl::opt<float> LargeFnOverlapForMerge( |
| 93 | "amdgpu-module-splitting-merge-threshold" , cl::init(Val: 0.7f), cl::Hidden, |
| 94 | cl::desc("when a function is considered for merging into a partition that " |
| 95 | "already contains some of its callees, do the merge if at least " |
| 96 | "n% of the code it can reach is already present inside the " |
| 97 | "partition; e.g. 0.7 means only merge >70%" )); |
| 98 | |
| 99 | static cl::opt<bool> NoExternalizeGlobals( |
| 100 | "amdgpu-module-splitting-no-externalize-globals" , cl::Hidden, |
| 101 | cl::desc("disables externalization of global variable with local linkage; " |
| 102 | "may cause globals to be duplicated which increases binary size" )); |
| 103 | |
| 104 | static cl::opt<bool> NoExternalizeOnAddrTaken( |
| 105 | "amdgpu-module-splitting-no-externalize-address-taken" , cl::Hidden, |
| 106 | cl::desc( |
| 107 | "disables externalization of functions whose addresses are taken" )); |
| 108 | |
| 109 | static cl::opt<std::string> |
| 110 | ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg" , |
| 111 | cl::Hidden, |
| 112 | cl::desc("output file to write out the dotgraph " |
| 113 | "representation of the input module" )); |
| 114 | |
| 115 | static cl::opt<std::string> PartitionSummariesOutput( |
| 116 | "amdgpu-module-splitting-print-partition-summaries" , cl::Hidden, |
| 117 | cl::desc("output file to write out a summary of " |
| 118 | "the partitions created for each module" )); |
| 119 | |
| 120 | #ifndef NDEBUG |
| 121 | static cl::opt<bool> |
| 122 | UseLockFile("amdgpu-module-splitting-serial-execution" , cl::Hidden, |
| 123 | cl::desc("use a lock file so only one process in the system " |
| 124 | "can run this pass at once. useful to avoid mangled " |
| 125 | "debug output in multithreaded environments." )); |
| 126 | |
| 127 | static cl::opt<bool> |
| 128 | DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search" , |
| 129 | cl::Hidden, |
| 130 | cl::desc("print all proposals received and whether " |
| 131 | "they were rejected or accepted" )); |
| 132 | #endif |
| 133 | |
/// Convenience RAII wrapper around NamedRegionTimer: times a region of this
/// pass under the "AMDGPU Module Splitting" group, gated on the standard
/// TimePassesIsEnabled flag.
struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting" ,
                         TimePassesIsEnabled) {}
};
| 139 | |
| 140 | //===----------------------------------------------------------------------===// |
| 141 | // Utils |
| 142 | //===----------------------------------------------------------------------===// |
| 143 | |
// Raw integer type used for all code-size cost accounting in this pass.
using CostType = InstructionCost::CostType;
// Per-function cost table produced by calculateFunctionCosts.
using FunctionsCostMap = DenseMap<const Function *, CostType>;
// Abstract TTI getter so this file doesn't depend on how TTI is obtained.
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
// Sentinel "no partition" ID (all bits set).
static constexpr unsigned InvalidPID = -1;
| 148 | |
| 149 | /// \param Num numerator |
| 150 | /// \param Dem denominator |
| 151 | /// \returns a printable object to print (Num/Dem) using "%0.2f". |
| 152 | static auto formatRatioOf(CostType Num, CostType Dem) { |
| 153 | CostType DemOr1 = Dem ? Dem : 1; |
| 154 | return format(Fmt: "%0.2f" , Vals: (static_cast<double>(Num) / DemOr1) * 100); |
| 155 | } |
| 156 | |
| 157 | /// Checks whether a given function is non-copyable. |
| 158 | /// |
| 159 | /// Non-copyable functions cannot be cloned into multiple partitions, and only |
| 160 | /// one copy of the function can be present across all partitions. |
| 161 | /// |
| 162 | /// Kernel functions and external functions fall into this category. If we were |
| 163 | /// to clone them, we would end up with multiple symbol definitions and a very |
| 164 | /// unhappy linker. |
| 165 | static bool isNonCopyable(const Function &F) { |
| 166 | return F.hasExternalLinkage() || !F.isDefinitionExact() || |
| 167 | AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()); |
| 168 | } |
| 169 | |
| 170 | /// If \p GV has local linkage, make it external + hidden. |
| 171 | static void externalize(GlobalValue &GV) { |
| 172 | if (GV.hasLocalLinkage()) { |
| 173 | GV.setLinkage(GlobalValue::ExternalLinkage); |
| 174 | GV.setVisibility(GlobalValue::HiddenVisibility); |
| 175 | } |
| 176 | |
| 177 | // Unnamed entities must be named consistently between modules. setName will |
| 178 | // give a distinct name to each such entity. |
| 179 | if (!GV.hasName()) |
| 180 | GV.setName("__llvmsplit_unnamed" ); |
| 181 | } |
| 182 | |
/// Cost analysis function. Calculates the cost of each function in \p M
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts" , "cost analysis" );

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n" );
  CostType ModuleCost = 0;
  // Only read inside the LLVM_DEBUG block below, hence maybe_unused so NDEBUG
  // builds don't warn.
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    // Declarations have no body to cost.
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        // TCK_CodeSize: partitions are balanced by code size, not speed.
        auto Cost =
            TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal = Cost.isValid()
                               ? Cost.getValue()
                               : (CostType)TargetTransformInfo::TCC_Expensive;
        assert((FnCost + CostVal) >= FnCost && "Overflow!" );
        FnCost += CostVal;
      }
    }

    // Every function with a body must have a non-zero cost.
    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!" );
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(CC: Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  if (CostMap.empty())
    return 0;

  assert(ModuleCost);
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << "" << KernelCost << " ("
           << format("%0.2f" , (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f" , (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n" ;
  });

  return ModuleCost;
}
| 243 | |
| 244 | /// \return true if \p F can be indirectly called |
| 245 | static bool canBeIndirectlyCalled(const Function &F) { |
| 246 | if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(CC: F.getCallingConv())) |
| 247 | return false; |
| 248 | return !F.hasLocalLinkage() || |
| 249 | F.hasAddressTaken(/*PutOffender=*/nullptr, |
| 250 | /*IgnoreCallbackUses=*/false, |
| 251 | /*IgnoreAssumeLikeCalls=*/true, |
| 252 | /*IgnoreLLVMUsed=*/IngoreLLVMUsed: true, |
| 253 | /*IgnoreARCAttachedCall=*/false, |
| 254 | /*IgnoreCastedDirectCall=*/true); |
| 255 | } |
| 256 | |
| 257 | //===----------------------------------------------------------------------===// |
| 258 | // Graph-based Module Representation |
| 259 | //===----------------------------------------------------------------------===// |
| 260 | |
| 261 | /// AMDGPUSplitModule's view of the source Module, as a graph of all components |
| 262 | /// that can be split into different modules. |
| 263 | /// |
| 264 | /// The most trivial instance of this graph is just the CallGraph of the module, |
| 265 | /// but it is not guaranteed that the graph is strictly equal to the CG. It |
| 266 | /// currently always is but it's designed in a way that would eventually allow |
| 267 | /// us to create abstract nodes, or nodes for different entities such as global |
| 268 | /// variables or any other meaningful constraint we must consider. |
| 269 | /// |
| 270 | /// The graph is only mutable by this class, and is generally not modified |
| 271 | /// after \ref SplitGraph::buildGraph runs. No consumers of the graph can |
| 272 | /// mutate it. |
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    /// The nodes are related through a direct call. This is a "strong" edge as
    /// it means the Src will directly reference the Dst.
    DirectCall,
    /// The nodes are related through an indirect call.
    /// This is a "weaker" edge and is only considered when traversing the graph
    /// starting from a kernel. We need this edge for resource usage analysis.
    ///
    /// The reason why we have this edge in the first place is due to how
    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
    /// the resource usage of the kernel containing the indirect call is the
    /// max resource usage of all functions that can be indirectly called.
    IndirectCall,
  };

  /// An edge between two nodes. Edges are directional, and tagged with a
  /// "kind".
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src; ///< Source
    Node *Dst; ///< Destination
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

#ifndef NDEBUG
  bool verifyGraph() const;
#endif

  bool empty() const { return Nodes.empty(); }
  iterator_range<nodes_iterator> nodes() const { return Nodes; }
  // Node IDs are indices into Nodes (see Node::getID).
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  // Returns an all-zeros BitVector with one bit per node, indexed by node ID.
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(Val: &F); }

  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
  /// IDs).
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
  /// updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  // Create a new edge between two nodes and add it to both nodes.
  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Final list of nodes with stable ordering. A node's ID is its index here.
  SmallVector<Node *> Nodes;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible objects, so as a small optimization we
  // use a BumpPtrAllocator which avoids destructor calls but also makes
  // allocation faster.
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator" );
  BumpPtrAllocator EdgesPool;
};
| 359 | |
| 360 | /// Nodes in the SplitGraph contain both incoming, and outgoing edges. |
| 361 | /// Incoming edges have this node as their Dst, and Outgoing ones have this node |
| 362 | /// as their Src. |
| 363 | /// |
| 364 | /// Edge objects are shared by both nodes in Src/Dst. They provide immediate |
| 365 | /// feedback on how two nodes are related, and in which direction they are |
| 366 | /// related, which is valuable information to make splitting decisions. |
| 367 | /// |
| 368 | /// Nodes are fundamentally abstract, and any consumers of the graph should |
| 369 | /// treat them as such. While a node will be a function most of the time, we |
| 370 | /// could also create nodes for any other reason. In the future, we could have |
| 371 | /// single nodes for multiple functions, or nodes for GVs, etc. |
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (auto *Fn = dyn_cast<Function>(Val: &GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(CC: Fn->getCallingConv());
  }

  /// An 0-indexed ID for the node. The maximum ID (exclusive) is the number of
  /// nodes in the graph. This ID can be used as an index in a BitVector.
  unsigned getID() const { return ID; }

  // NOTE: only valid when this node wraps a Function (currently always true);
  // cast asserts otherwise.
  const Function &getFunction() const { return cast<Function>(Val: GV); }

  /// \returns the cost to import this component into a given module, not
  /// accounting for any dependencies that may need to be imported as well.
  CostType getIndividualCost() const { return IndividualCost; }

  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this is an entry point in the graph. Entry points are
  /// defined as follows: if you take all entry points in the graph, and iterate
  /// their dependencies, you are guaranteed to visit all nodes in the graph at
  /// least once.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(Range: IncomingEdges, P: [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(Range: OutgoingEdges, P: [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  // Indirect-call edges are only followed when traversing from a kernel.
  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visit all children of this node in a recursive fashion. Also visits Self.
  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
  /// DirectCall edges.
  ///
  /// \param Visitor Visitor Function.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Adds the dependencies of this node in \p BV by setting the bit
  /// corresponding to each node.
  ///
  /// Implemented using \ref visitAllDependencies, hence it follows the same
  /// rules regarding dependencies traversal.
  ///
  /// \param[out] BV The bitvector where the bits should be set.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies(Visitor: [&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  // Packed flags; see the corresponding accessors above.
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
  // together)
  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};
| 458 | |
| 459 | void SplitGraph::Node::visitAllDependencies( |
| 460 | std::function<void(const Node &)> Visitor) const { |
| 461 | const bool FollowIndirect = shouldFollowIndirectCalls(); |
| 462 | // FIXME: If this can access SplitGraph in the future, use a BitVector |
| 463 | // instead. |
| 464 | DenseSet<const Node *> Seen; |
| 465 | SmallVector<const Node *, 8> WorkList({this}); |
| 466 | while (!WorkList.empty()) { |
| 467 | const Node *CurN = WorkList.pop_back_val(); |
| 468 | if (auto [It, Inserted] = Seen.insert(V: CurN); !Inserted) |
| 469 | continue; |
| 470 | |
| 471 | Visitor(*CurN); |
| 472 | |
| 473 | for (const Edge *E : CurN->outgoing_edges()) { |
| 474 | if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall) |
| 475 | continue; |
| 476 | WorkList.push_back(Elt: E->Dst); |
| 477 | } |
| 478 | } |
| 479 | } |
| 480 | |
| 481 | /// Checks if \p I has MD_callees and if it does, parse it and put the function |
| 482 | /// in \p Callees. |
| 483 | /// |
| 484 | /// \returns true if there was metadata and it was parsed correctly. false if |
| 485 | /// there was no MD or if it contained unknown entries and parsing failed. |
| 486 | /// If this returns false, \p Callees will contain incomplete information |
| 487 | /// and must not be used. |
| 488 | static bool handleCalleesMD(const Instruction &I, |
| 489 | SetVector<Function *> &Callees) { |
| 490 | auto *MD = I.getMetadata(KindID: LLVMContext::MD_callees); |
| 491 | if (!MD) |
| 492 | return false; |
| 493 | |
| 494 | for (const auto &Op : MD->operands()) { |
| 495 | Function *Callee = mdconst::extract_or_null<Function>(MD: Op); |
| 496 | if (!Callee) |
| 497 | return false; |
| 498 | Callees.insert(X: Callee); |
| 499 | } |
| 500 | |
| 501 | return true; |
| 502 | } |
| 503 | |
// Builds the node/edge representation of module M from the CallGraph, then
// marks graph entry points (kernels, plus functions not reachable from any
// kernel through direct calls).
void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph" , "graph construction" );
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n" );

  // FIXME(?): Is the callgraph really worth using if we have to iterate the
  // function again whenever it fails to give us enough information?

  // We build the graph by just iterating all functions in the module and
  // working on their direct callees. At the end, all nodes should be linked
  // together as expected.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    // Look at direct callees and create the necessary edges in the graph.
    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(X: Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // Keep track of this function if it contains an indirect call and/or if it
    // can be indirectly called.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for " ;
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n" );

      // Re-scan the instructions: the callgraph only says "calls external",
      // so we look for the actual indirect call sites ourselves.
      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(F: Fn)) {
        // look at all calls without a direct callee.
        const auto *CB = dyn_cast<CallBase>(Val: &Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
        // true.
        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << " found inline assembly\n" );
          continue;
        }

        if (handleCalleesMD(I: Inst, Callees&: KnownCallees))
          continue;
        // If we failed to parse any !callees MD, or some was missing,
        // the entire KnownCallees list is now unreliable.
        KnownCallees.clear();

        // Everything else is handled conservatively. If we fall into the
        // conservative case don't bother analyzing further.
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << " indirect call found\n" );
        FnsWithIndirectCalls.push_back(Elt: &Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(R&: KnownCallees);
    }

    Node &N = getNode(Cache, GV: Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(Src&: N, Dst&: getNode(Cache, GV: *Callee), EK: EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(F: Fn))
      IndirectlyCallableFns.push_back(Elt: &Fn);
  }

  // Post-process functions with indirect calls: conservatively connect each of
  // them to every function that can be indirectly called.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, GV: *Fn);
      Node &Dst = getNode(Cache, GV: *Candidate);
      createEdge(Src, Dst, EK: EdgeKind::IndirectCall);
    }
  }

  // Now, find all entry points.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    // Functions with an Entry CC are always graph entry points too.
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(BV&: NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EK: EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(Elt: N);
  }

  for (Node *N : CandidateEntryPoints) {
    // This can be another entry point if it's not reachable by a kernel
    // TODO: We could sort all of the possible new entries in a stable order
    // (e.g. by cost), then consume them one by one until
    // NodesReachableByKernels is all 1s. It'd allow us to avoid
    // considering some nodes as non-entries in some specific cases.
    if (!NodesReachableByKernels.test(Idx: N->getID()))
      N->markAsGraphEntry();
  }

#ifndef NDEBUG
  assert(verifyGraph());
#endif
}
| 618 | |
| 619 | #ifndef NDEBUG |
// Debug-only structural validation of the graph: checks node ID/ordering
// invariants, edge symmetry, kernel/declaration constraints, and that every
// node is reachable from the graph's entry points.
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n" ;
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n" ;
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n" ;
      return false;
    }

    // Each incoming edge must point back at this node and be registered in its
    // source's outgoing list.
    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n" ;
        return false;
      }
    }

    // Mirror check for outgoing edges.
    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n" ;
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n" ;
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n" ;
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n" ;
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n" ;
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n" ;
    return false;
  }

  // Check we respect the promise of Node::isGraphEntryPoint: collect
  // everything reachable from the entry points.
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n" ;
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n" ;
    return false;
  }

  return true;
}
| 711 | #endif |
| 712 | |
| 713 | CostType SplitGraph::calculateCost(const BitVector &BV) const { |
| 714 | CostType Cost = 0; |
| 715 | for (unsigned NodeID : BV.set_bits()) |
| 716 | Cost += getNode(ID: NodeID).getIndividualCost(); |
| 717 | return Cost; |
| 718 | } |
| 719 | |
| 720 | SplitGraph::Node & |
| 721 | SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache, |
| 722 | const GlobalValue &GV) { |
| 723 | auto &N = Cache[&GV]; |
| 724 | if (N) |
| 725 | return *N; |
| 726 | |
| 727 | CostType Cost = 0; |
| 728 | bool NonCopyable = false; |
| 729 | if (const Function *Fn = dyn_cast<Function>(Val: &GV)) { |
| 730 | NonCopyable = isNonCopyable(F: *Fn); |
| 731 | Cost = CostMap.at(Val: Fn); |
| 732 | } |
| 733 | N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable); |
| 734 | Nodes.push_back(Elt: N); |
| 735 | assert(&getNode(N->getID()) == N); |
| 736 | return *N; |
| 737 | } |
| 738 | |
| 739 | const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst, |
| 740 | EdgeKind EK) { |
| 741 | const Edge *E = new (EdgesPool.Allocate<Edge>(Num: 1)) Edge(&Src, &Dst, EK); |
| 742 | Src.OutgoingEdges.push_back(Elt: E); |
| 743 | Dst.IncomingEdges.push_back(Elt: E); |
| 744 | return *E; |
| 745 | } |
| 746 | |
| 747 | //===----------------------------------------------------------------------===// |
| 748 | // Split Proposals |
| 749 | //===----------------------------------------------------------------------===// |
| 750 | |
| 751 | /// Represents a module splitting proposal. |
| 752 | /// |
| 753 | /// Proposals are made of N BitVectors, one for each partition, where each bit |
| 754 | /// set indicates that the node is present and should be copied inside that |
| 755 | /// partition. |
| 756 | /// |
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies resulting in multiple
/// proposals, and choose the best one out of them.
| 760 | class SplitProposal { |
| 761 | public: |
| 762 | SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) { |
| 763 | Partitions.resize(new_size: MaxPartitions, x: {0, SG.createNodesBitVector()}); |
| 764 | } |
| 765 | |
| 766 | void setName(StringRef NewName) { Name = NewName; } |
| 767 | StringRef getName() const { return Name; } |
| 768 | |
| 769 | const BitVector &operator[](unsigned PID) const { |
| 770 | return Partitions[PID].second; |
| 771 | } |
| 772 | |
| 773 | void add(unsigned PID, const BitVector &BV) { |
| 774 | Partitions[PID].second |= BV; |
| 775 | updateScore(PID); |
| 776 | } |
| 777 | |
| 778 | void print(raw_ostream &OS) const; |
| 779 | LLVM_DUMP_METHOD void dump() const { print(OS&: dbgs()); } |
| 780 | |
| 781 | // Find the cheapest partition (lowest cost). In case of ties, always returns |
| 782 | // the highest partition number. |
| 783 | unsigned findCheapestPartition() const; |
| 784 | |
| 785 | /// Calculate the CodeSize and Bottleneck scores. |
| 786 | void calculateScores(); |
| 787 | |
| 788 | #ifndef NDEBUG |
| 789 | void verifyCompleteness() const; |
| 790 | #endif |
| 791 | |
| 792 | /// Only available after \ref calculateScores is called. |
| 793 | /// |
| 794 | /// A positive number indicating the % of code duplication that this proposal |
| 795 | /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by |
| 796 | /// duplicating some functions across partitions. |
| 797 | /// |
| 798 | /// Value is always rounded up to 3 decimal places. |
| 799 | /// |
| 800 | /// A perfect score would be 0.0, and anything approaching 1.0 is very bad. |
| 801 | double getCodeSizeScore() const { return CodeSizeScore; } |
| 802 | |
| 803 | /// Only available after \ref calculateScores is called. |
| 804 | /// |
| 805 | /// A number between [0, 1] which indicates how big of a bottleneck is |
| 806 | /// expected from the largest partition. |
| 807 | /// |
| 808 | /// A score of 1.0 means the biggest partition is as big as the source module, |
| 809 | /// so build time will be equal to or greater than the build time of the |
| 810 | /// initial input. |
| 811 | /// |
| 812 | /// Value is always rounded up to 3 decimal places. |
| 813 | /// |
| 814 | /// This is one of the metrics used to estimate this proposal's build time. |
| 815 | double getBottleneckScore() const { return BottleneckScore; } |
| 816 | |
| 817 | private: |
| 818 | void updateScore(unsigned PID) { |
| 819 | assert(SG); |
| 820 | for (auto &[PCost, Nodes] : Partitions) { |
| 821 | TotalCost -= PCost; |
| 822 | PCost = SG->calculateCost(BV: Nodes); |
| 823 | TotalCost += PCost; |
| 824 | } |
| 825 | } |
| 826 | |
| 827 | /// \see getCodeSizeScore |
| 828 | double CodeSizeScore = 0.0; |
| 829 | /// \see getBottleneckScore |
| 830 | double BottleneckScore = 0.0; |
| 831 | /// Aggregated cost of all partitions |
| 832 | CostType TotalCost = 0; |
| 833 | |
| 834 | const SplitGraph *SG = nullptr; |
| 835 | std::string Name; |
| 836 | |
| 837 | std::vector<std::pair<CostType, BitVector>> Partitions; |
| 838 | }; |
| 839 | |
| 840 | void SplitProposal::print(raw_ostream &OS) const { |
| 841 | assert(SG); |
| 842 | |
| 843 | OS << "[proposal] " << Name << ", total cost:" << TotalCost |
| 844 | << ", code size score:" << format(Fmt: "%0.3f" , Vals: CodeSizeScore) |
| 845 | << ", bottleneck score:" << format(Fmt: "%0.3f" , Vals: BottleneckScore) << '\n'; |
| 846 | for (const auto &[PID, Part] : enumerate(First: Partitions)) { |
| 847 | const auto &[Cost, NodeIDs] = Part; |
| 848 | OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost |
| 849 | << '|' << formatRatioOf(Num: Cost, Dem: SG->getModuleCost()) << "%\n" ; |
| 850 | } |
| 851 | } |
| 852 | |
| 853 | unsigned SplitProposal::findCheapestPartition() const { |
| 854 | assert(!Partitions.empty()); |
| 855 | CostType CurCost = std::numeric_limits<CostType>::max(); |
| 856 | unsigned CurPID = InvalidPID; |
| 857 | for (const auto &[Idx, Part] : enumerate(First: Partitions)) { |
| 858 | if (Part.first <= CurCost) { |
| 859 | CurPID = Idx; |
| 860 | CurCost = Part.first; |
| 861 | } |
| 862 | } |
| 863 | assert(CurPID != InvalidPID); |
| 864 | return CurPID; |
| 865 | } |
| 866 | |
| 867 | void SplitProposal::calculateScores() { |
| 868 | if (Partitions.empty()) |
| 869 | return; |
| 870 | |
| 871 | assert(SG); |
| 872 | CostType LargestPCost = 0; |
| 873 | for (auto &[PCost, Nodes] : Partitions) { |
| 874 | if (PCost > LargestPCost) |
| 875 | LargestPCost = PCost; |
| 876 | } |
| 877 | |
| 878 | CostType ModuleCost = SG->getModuleCost(); |
| 879 | CodeSizeScore = double(TotalCost) / ModuleCost; |
| 880 | assert(CodeSizeScore >= 0.0); |
| 881 | |
| 882 | BottleneckScore = double(LargestPCost) / ModuleCost; |
| 883 | |
| 884 | CodeSizeScore = std::ceil(x: CodeSizeScore * 100.0) / 100.0; |
| 885 | BottleneckScore = std::ceil(x: BottleneckScore * 100.0) / 100.0; |
| 886 | } |
| 887 | |
| 888 | #ifndef NDEBUG |
| 889 | void SplitProposal::verifyCompleteness() const { |
| 890 | if (Partitions.empty()) |
| 891 | return; |
| 892 | |
| 893 | BitVector Result = Partitions[0].second; |
| 894 | for (const auto &P : drop_begin(Partitions)) |
| 895 | Result |= P.second; |
| 896 | assert(Result.all() && "some nodes are missing from this proposal!" ); |
| 897 | } |
| 898 | #endif |
| 899 | |
| 900 | //===-- RecursiveSearchStrategy -------------------------------------------===// |
| 901 | |
| 902 | /// Partitioning algorithm. |
| 903 | /// |
/// This is a recursive search algorithm that can explore multiple possibilities.
| 905 | /// |
| 906 | /// When a cluster of nodes can go into more than one partition, and we haven't |
| 907 | /// reached maximum search depth, we recurse and explore both options and their |
| 908 | /// consequences. Both branches will yield a proposal, and the driver will grade |
| 909 | /// both and choose the best one. |
| 910 | /// |
| 911 | /// If max depth is reached, we will use some heuristics to make a choice. Most |
| 912 | /// of the time we will just use the least-pressured (cheapest) partition, but |
| 913 | /// if a cluster is particularly big and there is a good amount of overlap with |
| 914 | /// an existing partition, we will choose that partition instead. |
class RecursiveSearchSplitting {
public:
  /// Callback invoked once for every complete proposal produced by the search.
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  /// Builds the worklist, then runs the search; every proposal found is
  /// handed to \ref SubmitProposal.
  void run();

private:
  /// A cluster of nodes that must be assigned to a partition as one unit,
  /// with pre-computed cost information about the cluster.
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    // Number of nodes in \ref Cluster that are not graph entry points.
    unsigned NumNonEntryNodes = 0;
    // Aggregated cost of every node in \ref Cluster.
    CostType TotalCost = 0;
    // \ref TotalCost minus the cost of graph entry points.
    CostType CostExcludingGraphEntryPoints = 0;
    // Set bit = node (by ID) is part of this cluster.
    BitVector Cluster;
  };

  /// Collects all graph entry points's clusters and sort them so the most
  /// expensive clusters are viewed first. This will merge clusters together if
  /// they share a non-copyable dependency.
  void setupWorkList();

  /// Recursive function that assigns the worklist item at \p Idx into a
  /// partition of \p SP.
  ///
  /// \p Depth is the current search depth. When this value is equal to
  /// \ref MaxDepth, we can no longer recurse.
  ///
  /// This function only recurses if there is more than one possible assignment,
  /// otherwise it is iterative to avoid creating a call stack that is as big as
  /// \ref WorkList.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \return A pair: first element is the PID of the partition that has the
  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
  /// found with at least one element in common. The second element is the
  /// aggregated cost of all dependencies in common between \p Entry and that
  /// partition.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  // A Cluster is considered large when its cost, excluding entry points,
  // exceeds this value.
  CostType LargeClusterThreshold = 0;
  // Used to number proposals; incremented every time one is submitted.
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};
| 968 | |
| 969 | RecursiveSearchSplitting::RecursiveSearchSplitting( |
| 970 | const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal) |
| 971 | : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) { |
| 972 | // arbitrary max value as a safeguard. Anything above 10 will already be |
| 973 | // slow, this is just a max value to prevent extreme resource exhaustion or |
| 974 | // unbounded run time. |
| 975 | if (MaxDepth > 16) |
| 976 | report_fatal_error(reason: "[amdgpu-split-module] search depth of " + |
| 977 | Twine(MaxDepth) + " is too high!" ); |
| 978 | LargeClusterThreshold = |
| 979 | (LargeFnFactor != 0.0) |
| 980 | ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor)) |
| 981 | : std::numeric_limits<CostType>::max(); |
| 982 | LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at " |
| 983 | << LargeClusterThreshold << "\n" ); |
| 984 | } |
| 985 | |
void RecursiveSearchSplitting::run() {
  {
    // Scoped so the timer stops as soon as the worklist is ready.
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    // Kick off the search at depth 0 on the first worklist entry with an
    // empty proposal; completed proposals are reported via SubmitProposal.
    SplitProposal SP(SG, NumParts);
    pickPartition(/*BranchDepth=*/Depth: 0, /*Idx=*/0, SP: std::move(SP));
  }
}
| 998 | |
void RecursiveSearchSplitting::setupWorkList() {
  // Merge the clusters of entry points that share a non-copyable dependency.
  // e.g. if A and B are two worklist items, and they both call a non-copyable
  // dependency C, this does:
  //    A=C
  //    B=C
  //    => NodeEC will create a single group (A, B, C) and we create a new
  //    WorkList entry for that group.

  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    // Only entry points seed equivalence classes; other nodes are pulled in
    // below as dependencies.
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(Data: N->getID());
    N->visitAllDependencies(Visitor: [&](const SplitGraph::Node &Dep) {
      // A shared non-copyable dependency forces its users into one cluster.
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(V1: N->getID(), V2: Dep.getID());
    });
  }

  // Build one WorkListEntry per equivalence class: the union of the
  // dependency sets of every entry point in the class.
  for (const auto &Node : NodeEC) {
    // Visit each class once, through its leader.
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(ECV: *Node)) {
      const SplitGraph::Node &N = SG.getNode(ID: M);
      if (N.isGraphEntryPoint())
        N.getDependencies(BV&: Cluster);
    }
    WorkList.emplace_back(Args: std::move(Cluster));
  }

  // Calculate costs and other useful information.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(ID: NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  // Most expensive clusters first; ties broken by successively finer metrics.
  // stable_sort keeps construction order for fully-equal entries, so the
  // output is deterministic.
  stable_sort(Range&: WorkList, C: [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << "  - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}
| 1071 | |
void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  // Iterate while there is only one sensible assignment; recurse (and return)
  // only when two candidate partitions are worth exploring.
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    //
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // Default option is to do load-balancing, AKA assign to least pressured
    // partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // Explore assigning to the kernel that contains the most dependencies in
    // common.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // We can choose to explore only one path if we only have one valid path,
    // or if we reached maximum search depth and can no longer branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID) // no similar PID found
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      // We have to choose one path. Use a heuristic to guess which one will be
      // more appropriate.
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Check if the amount of code in common makes it worth it.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge) {
          // For debug, just print "L", so we'll see "L3=P3" for instance, which
          // will mean we reached max depth and chose P3 based on this
          // heuristic.
          LLVM_DEBUG(dbgs() << 'L');
          SinglePIDToTry = MostSimilarPID;
        }
        // Note: if the overlap ratio is too small, SinglePIDToTry stays
        // InvalidPID and both branches are still explored below despite
        // Depth >= MaxDepth.
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 2: Explore candidates.

    // When we only explore one possible path, and thus branch depth doesn't
    // increase, do not recurse, iterate instead.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      // Only one path to explore, don't clone SP, don't increase depth.
      SP.add(PID: SinglePIDToTry, BV: Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // We explore multiple paths: recurse at increased depth, then stop this
    // function.

    LLVM_DEBUG(dbgs() << '\n');

    // lb = load balancing = put in cheapest partition
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(PID: CheapestPID, BV: Cluster);
      pickPartition(Depth: Depth + 1, Idx: Idx + 1, SP: std::move(BranchSP));
    }

    // ms = most similar = put in partition with the most in common
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(PID: MostSimilarPID, BV: Cluster);
      pickPartition(Depth: Depth + 1, Idx: Idx + 1, SP: std::move(BranchSP));
    }

    // Both branches submitted their own proposals; this frame is done.
    return;
  }

  // Step 3: If we assigned all WorkList items, submit the proposal.

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(val: Depth) + ") #" +
             std::to_string(val: NumProposalsSubmitted++));
  LLVM_DEBUG(dbgs() << '\n');
  SubmitProposal(std::move(SP));
}
| 1167 | |
| 1168 | std::pair<unsigned, CostType> |
| 1169 | RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry, |
| 1170 | const SplitProposal &SP) { |
| 1171 | if (!Entry.NumNonEntryNodes) |
| 1172 | return {InvalidPID, 0}; |
| 1173 | |
| 1174 | // We take the partition that is the most similar using Cost as a metric. |
| 1175 | // So we take the set of nodes in common, compute their aggregated cost, and |
| 1176 | // pick the partition with the highest cost in common. |
| 1177 | unsigned ChosenPID = InvalidPID; |
| 1178 | CostType ChosenCost = 0; |
| 1179 | for (unsigned PID = 0; PID < NumParts; ++PID) { |
| 1180 | BitVector BV = SP[PID]; |
| 1181 | BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?! |
| 1182 | |
| 1183 | if (BV.none()) |
| 1184 | continue; |
| 1185 | |
| 1186 | const CostType Cost = SG.calculateCost(BV); |
| 1187 | |
| 1188 | if (ChosenPID == InvalidPID || ChosenCost < Cost || |
| 1189 | (ChosenCost == Cost && PID > ChosenPID)) { |
| 1190 | ChosenPID = PID; |
| 1191 | ChosenCost = Cost; |
| 1192 | } |
| 1193 | } |
| 1194 | |
| 1195 | return {ChosenPID, ChosenCost}; |
| 1196 | } |
| 1197 | |
| 1198 | //===----------------------------------------------------------------------===// |
| 1199 | // DOTGraph Printing Support |
| 1200 | //===----------------------------------------------------------------------===// |
| 1201 | |
/// Maps an edge to its destination node. Used below to build an iterator that
/// walks a node's children (the destinations of its outgoing edges).
const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

/// Iterator over a node's outgoing edges that yields each edge's destination
/// node instead of the edge itself.
using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;
| 1208 | |
| 1209 | } // namespace |
| 1210 | |
/// GraphTraits adapter so generic graph utilities (e.g. WriteGraph below) can
/// traverse a SplitGraph.
template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  // Children are the destination nodes of a node's outgoing edges.
  using ChildIteratorType = SplitGraphEdgeDstIterator;

  using EdgeRef = const SplitGraph::Edge *;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};
| 1235 | |
| 1236 | template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits { |
| 1237 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} |
| 1238 | |
| 1239 | static std::string getGraphName(const SplitGraph &SG) { |
| 1240 | return SG.getModule().getName().str(); |
| 1241 | } |
| 1242 | |
| 1243 | std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) { |
| 1244 | return N->getName().str(); |
| 1245 | } |
| 1246 | |
| 1247 | static std::string getNodeDescription(const SplitGraph::Node *N, |
| 1248 | const SplitGraph &SG) { |
| 1249 | std::string Result; |
| 1250 | if (N->isEntryFunctionCC()) |
| 1251 | Result += "entry-fn-cc " ; |
| 1252 | if (N->isNonCopyable()) |
| 1253 | Result += "non-copyable " ; |
| 1254 | Result += "cost:" + std::to_string(val: N->getIndividualCost()); |
| 1255 | return Result; |
| 1256 | } |
| 1257 | |
| 1258 | static std::string getNodeAttributes(const SplitGraph::Node *N, |
| 1259 | const SplitGraph &SG) { |
| 1260 | return N->hasAnyIncomingEdges() ? "" : "color=\"red\"" ; |
| 1261 | } |
| 1262 | |
| 1263 | static std::string getEdgeAttributes(const SplitGraph::Node *N, |
| 1264 | SplitGraphEdgeDstIterator EI, |
| 1265 | const SplitGraph &SG) { |
| 1266 | |
| 1267 | switch ((*EI.getCurrent())->Kind) { |
| 1268 | case SplitGraph::EdgeKind::DirectCall: |
| 1269 | return "" ; |
| 1270 | case SplitGraph::EdgeKind::IndirectCall: |
| 1271 | return "style=\"dashed\"" ; |
| 1272 | } |
| 1273 | llvm_unreachable("Unknown SplitGraph::EdgeKind enum" ); |
| 1274 | } |
| 1275 | }; |
| 1276 | |
| 1277 | //===----------------------------------------------------------------------===// |
| 1278 | // Driver |
| 1279 | //===----------------------------------------------------------------------===// |
| 1280 | |
| 1281 | namespace { |
| 1282 | |
| 1283 | // If we didn't externalize GVs, then local GVs need to be conservatively |
| 1284 | // imported into every module (including their initializers), and then cleaned |
| 1285 | // up afterwards. |
| 1286 | static bool needsConservativeImport(const GlobalValue *GV) { |
| 1287 | if (const auto *Var = dyn_cast<GlobalVariable>(Val: GV)) |
| 1288 | return Var->hasLocalLinkage(); |
| 1289 | if (const auto *GA = dyn_cast<GlobalAlias>(Val: GV)) |
| 1290 | return GA->hasLocalLinkage(); |
| 1291 | return false; |
| 1292 | } |
| 1293 | |
| 1294 | /// Prints a summary of the partition \p N, represented by module \p M, to \p |
| 1295 | /// OS. |
| 1296 | static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M, |
| 1297 | unsigned PartCost, unsigned ModuleCost) { |
| 1298 | OS << "*** Partition P" << N << " ***\n" ; |
| 1299 | |
| 1300 | for (const auto &Fn : M) { |
| 1301 | if (!Fn.isDeclaration()) |
| 1302 | OS << " - [function] " << Fn.getName() << "\n" ; |
| 1303 | } |
| 1304 | |
| 1305 | for (const auto &GV : M.globals()) { |
| 1306 | if (GV.hasInitializer()) |
| 1307 | OS << " - [global] " << GV.getName() << "\n" ; |
| 1308 | } |
| 1309 | |
| 1310 | OS << "Partition contains " << formatRatioOf(Num: PartCost, Dem: ModuleCost) |
| 1311 | << "% of the source\n" ; |
| 1312 | } |
| 1313 | |
| 1314 | static void evaluateProposal(SplitProposal &Best, SplitProposal New) { |
| 1315 | SplitModuleTimer SMT("proposal_evaluation" , "proposal ranking algorithm" ); |
| 1316 | |
| 1317 | LLVM_DEBUG({ |
| 1318 | New.verifyCompleteness(); |
| 1319 | if (DebugProposalSearch) |
| 1320 | New.print(dbgs()); |
| 1321 | }); |
| 1322 | |
| 1323 | const double CurBScore = Best.getBottleneckScore(); |
| 1324 | const double CurCSScore = Best.getCodeSizeScore(); |
| 1325 | const double NewBScore = New.getBottleneckScore(); |
| 1326 | const double NewCSScore = New.getCodeSizeScore(); |
| 1327 | |
| 1328 | // TODO: Improve this |
| 1329 | // We can probably lower the precision of the comparison at first |
| 1330 | // e.g. if we have |
| 1331 | // - (Current): BScore: 0.489 CSCore 1.105 |
| 1332 | // - (New): BScore: 0.475 CSCore 1.305 |
| 1333 | // Currently we'd choose the new one because the bottleneck score is |
| 1334 | // lower, but the new one duplicates more code. It may be worth it to |
| 1335 | // discard the new proposal as the impact on build time is negligible. |
| 1336 | |
| 1337 | // Compare them |
| 1338 | bool IsBest = false; |
| 1339 | if (NewBScore < CurBScore) |
| 1340 | IsBest = true; |
| 1341 | else if (NewBScore == CurBScore) |
| 1342 | IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker. |
| 1343 | |
| 1344 | if (IsBest) |
| 1345 | Best = std::move(New); |
| 1346 | |
| 1347 | LLVM_DEBUG(if (DebugProposalSearch) { |
| 1348 | if (IsBest) |
| 1349 | dbgs() << "[search] new best proposal!\n" ; |
| 1350 | else |
| 1351 | dbgs() << "[search] discarding - not profitable\n" ; |
| 1352 | }); |
| 1353 | } |
| 1354 | |
| 1355 | /// Trivial helper to create an identical copy of \p M. |
| 1356 | static std::unique_ptr<Module> cloneAll(const Module &M) { |
| 1357 | ValueToValueMapTy VMap; |
| 1358 | return CloneModule(M, VMap, ShouldCloneDefinition: [&](const GlobalValue *GV) { return true; }); |
| 1359 | } |
| 1360 | |
| 1361 | /// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested. |
| 1362 | static void writeDOTGraph(const SplitGraph &SG) { |
| 1363 | if (ModuleDotCfgOutput.empty()) |
| 1364 | return; |
| 1365 | |
| 1366 | std::error_code EC; |
| 1367 | raw_fd_ostream OS(ModuleDotCfgOutput, EC); |
| 1368 | if (EC) { |
| 1369 | errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput |
| 1370 | << "' - DOTGraph will not be printed\n" ; |
| 1371 | } |
| 1372 | WriteGraph(O&: OS, G: SG, /*ShortName=*/ShortNames: false, |
| 1373 | /*Title=*/SG.getModule().getName()); |
| 1374 | } |
| 1375 | |
/// Driver: builds the dependency graph for \p M, searches for the best way to
/// split it into \p NumParts partitions, then clones one module per non-empty
/// partition and hands each clone to \p ModuleCallback.
static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  CallGraph CG(M);

  // Externalize functions whose address are taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits a "undefined hidden symbol" linker
  // error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
  //
  // TODO: Could we be smarter about this ? This makes all functions whose
  // addresses are taken non-copyable. We should probably model this type of
  // constraint in the graph and use it to guide splitting, instead of
  // externalizing like this. Maybe non-copyable should really mean "keep one
  // visible copy, then internalize all other copies" for some functions?
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
                   dbgs() << " because its address is taken\n");
        externalize(GV&: Fn);
      }
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turns helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      // NOTE(review): only the debug print is gated on local linkage;
      // externalize() is invoked for *every* global. Presumably externalize()
      // itself is a no-op for non-local GVs (and may have other duties such
      // as naming unnamed globals) — its body isn't visible here, TODO
      // confirm.
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      externalize(GV);
    }
  }

  // Aliases, unlike globals above, are only externalized when local.
  for (auto &GA : M.aliases()) {
    if (GA.hasLocalLinkage()) {
      LLVM_DEBUG(dbgs() << "[externalize] alias " << GA.getName() << '\n');
      externalize(GV&: GA);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, CostMap&: FnCosts);

  // Build the SplitGraph, which represents the module's functions and models
  // their dependencies accurately.
  SplitGraph SG(M, FnCosts, ModuleCost);
  SG.buildGraph(CG);

  // An empty graph means there is nothing to partition: emit a single clone
  // of the whole module.
  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });

  writeDOTGraph(SG);

  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");

  // Collect the best-scoring proposal across all strategies; the first
  // submission seeds the comparison.
  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(Best&: *Proposal, New: std::move(SP));
  };

  // TODO: It would be very easy to create new strategies by just adding a base
  // class to RecursiveSearchSplitting and abstracting it away.
  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  // With no proposal, fall back to a single full clone (no splitting).
  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG(Proposal->print(dbgs()););

  // Optionally open the partition-summary output file; failures only disable
  // summaries, they do not abort splitting.
  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(args&: PartitionSummariesOutput, args&: EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // One module will import all GlobalValues that are not Functions
  // and are not subject to conservative import.
  bool ImportAllGVs = true;

  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");
    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");

    // Resolve the partition's node IDs into the set of functions it owns.
    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(V: &SG.getNode(ID: NodeID).getFunction());

    // Don't create empty modules.
    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "[split] P" << PID
                        << " is empty, not creating module\n");
      continue;
    }

    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, ShouldCloneDefinition: [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(Val: GV)) {
            if (FnsInPart.contains(V: Fn)) {
              PartCost += SG.getCost(F: *Fn);
              return true;
            }
            return false;
          }

          // Aliases should not be separated from their underlying object.
          if (const auto *GA = dyn_cast<GlobalAlias>(Val: GV)) {
            if (const auto *Fn = dyn_cast<Function>(Val: GA->getAliaseeObject()))
              return FnsInPart.contains(V: Fn);
          }

          // Everything else goes in the first non-empty module we create.
          return ImportAllGVs || needsConservativeImport(GV);
        }));

    // Only the first non-empty partition gets all remaining GVs.
    ImportAllGVs = false;

    // Clean-up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(Range: MPart->global_values())) {
      if (needsConservativeImport(GV: &GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(OS&: *SummariesOS, N: PID, M: *MPart, PartCost, ModuleCost);

    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
| 1548 | } // namespace |
| 1549 | |
/// Pass entry point: splits \p M into up to \c N partitions by delegating to
/// splitAMDGPUModule, which invokes \c ModuleCallback once per module created.
///
/// \param M   The module to split.
/// \param MAM Module analysis manager, used to reach the per-function analysis
///            manager for TargetTransformInfo results.
/// \return PreservedAnalyses::none() — splitting may alter linkage/visibility
///         of globals in the input module (see comment at the bottom).
PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  // Times the whole pass, including any time spent blocked on the debug
  // lockfile below.
  SplitModuleTimer SMT(
      "total" , "total pass runtime (incl. potentially waiting for lockfile)" );

  // Lazily fetch per-function TTI through the function analysis manager and
  // hand it to the splitting logic (presumably for per-function cost
  // estimation — the callee is outside this view).
  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(IR&: F);
  };

  bool Done = false;
#ifndef NDEBUG
  // Debug-build-only aid: when UseLockFile is set, serialize concurrent runs
  // of this pass behind a lockfile in the system temp directory so that
  // LLVM_DEBUG output from multiple processes is not interleaved.
  if (UseLockFile) {
    SmallString<128> LockFilePath;
    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
    sys::path::append(LockFilePath, "amdgpu-split-module-debug" );
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
                      << "'\n" );

    while (true) {
      llvm::LockFileManager Lock(LockFilePath.str());
      bool Owned;
      if (Error Err = Lock.tryLock().moveInto(Owned)) {
        // Lock attempt itself failed; run unserialized (debug output may
        // interleave with other processes).
        consumeError(std::move(Err));
        LLVM_DEBUG(
            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                      "output may be mangled by other processes\n" );
      } else if (!Owned) {
        // Another process holds the lock; wait up to 90 seconds for release.
        switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) {
        case WaitForUnlockResult::Success:
          break;
        case WaitForUnlockResult::OwnerDied:
          continue; // try again to get the lock.
        case WaitForUnlockResult::Timeout:
          // Timed out waiting; force-release the stale lock and proceed
          // without serialization.
          LLVM_DEBUG(
              dbgs()
              << "[amdgpu-split-module] unable to acquire lockfile, debug "
                 "output may be mangled by other processes\n" );
          Lock.unsafeUnlock();
          break; // give up
        }
      }

      // Reached with the lock held, or after deciding to proceed without it.
      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
      Done = true;
      break;
    }
  }
#endif

  // Release builds (or UseLockFile disabled): split directly.
  if (!Done)
    splitAMDGPUModule(GetTTI: TTIGetter, M, NumParts: N, ModuleCallback);

  // We can change linkage/visibilities in the input, consider that nothing is
  // preserved just to be safe. This pass runs last anyway.
  return PreservedAnalyses::none();
}
| 1608 | } // namespace llvm |
| 1609 | |