//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
///   - SplitGraph & associated classes
///     - Graph representation of the module and of the dependencies that
///       matter for splitting.
///   - RecursiveSearchSplitting
///     - Core splitting algorithm.
///   - SplitProposal
///     - Represents a suggested solution for splitting the input module. These
///       solutions can be scored to determine the best one when multiple
///       solutions are available.
///   - Driver/pass "run" function glues everything together.

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cassert>
#include <cmath>
#include <memory>
#include <utility>
#include <vector>

#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif

#define DEBUG_TYPE "amdgpu-split-module"

namespace llvm {
namespace {

static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."),
    cl::init(8));

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local "
             "linkage; may cause globals to be duplicated which increases "
             "binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
    cl::desc(
        "disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

#ifndef NDEBUG
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
#endif

struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};

//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//

using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;

/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object that prints the ratio (Num/Dem) as a
/// percentage, using "%0.2f".
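///
/// Minimal usage sketch (values purely illustrative); callers typically
/// append a '%' sign:
/// \code
///   OS << formatRatioOf(50, 200) << "%\n"; // prints "25.00%"
/// \endcode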
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}

/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// Kernel functions and external functions fall into this category. If we were
/// to clone them, we would end up with multiple symbol definitions and a very
/// unhappy linker.
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() ||
         AMDGPU::isEntryFunctionCC(F.getCallingConv());
}

/// If \p GV has local linkage, make it external + hidden.
static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}

/// Cost analysis function. Calculates the cost of each function in \p M.
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal = Cost.isValid()
                               ? Cost.getValue()
                               : (CostType)TargetTransformInfo::TCC_Expensive;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  if (CostMap.empty())
    return 0;

  assert(ModuleCost);
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}

/// \return true if \p F can be indirectly called
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}

//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//

/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the
/// module, but it is not guaranteed that the graph is strictly equal to the
/// CG. It currently always is, but it's designed in a way that would
/// eventually allow us to create abstract nodes, or nodes for different
/// entities such as global variables or any other meaningful constraint we
/// must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    /// The nodes are related through a direct call. This is a "strong" edge
    /// as it means the Src will directly reference the Dst.
    DirectCall,
    /// The nodes are related through an indirect call.
    /// This is a "weaker" edge and is only considered when traversing the
    /// graph starting from a kernel. We need this edge for resource usage
    /// analysis.
    ///
    /// The reason why we have this edge in the first place is due to how
    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
    /// the resource usage of the kernel containing the indirect call is the
    /// max resource usage of all functions that can be indirectly called.
    IndirectCall,
  };

  /// An edge between two nodes. Edges are directional, and tagged with a
  /// "kind".
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src; ///< Source
    Node *Dst; ///< Destination
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

#ifndef NDEBUG
  bool verifyGraph() const;
#endif

  bool empty() const { return Nodes.empty(); }
  const iterator_range<nodes_iterator> nodes() const {
    return {Nodes.begin(), Nodes.end()};
  }
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
  /// IDs).
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it
  /// and updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  // Create a new edge between two nodes and add it to both nodes.
  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Final list of nodes with stable ordering.
  SmallVector<Node *> Nodes;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible objects, so as a small optimization we
  // use a BumpPtrAllocator which avoids destructor calls but also makes
  // allocation faster.
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator");
  BumpPtrAllocator EdgesPool;
};

/// Nodes in the SplitGraph contain both incoming and outgoing edges.
/// Incoming edges have this node as their Dst, and Outgoing ones have this
/// node as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (auto *Fn = dyn_cast<Function>(&GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
  }

  /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
  /// nodes in the graph. This ID can be used as an index in a BitVector.
  unsigned getID() const { return ID; }

  const Function &getFunction() const { return cast<Function>(GV); }

  /// \returns the cost to import this component into a given module, not
  /// accounting for any dependencies that may need to be imported as well.
  CostType getIndividualCost() const { return IndividualCost; }

  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this is an entry point in the graph. Entry points are
  /// defined as follows: if you take all entry points in the graph, and
  /// iterate their dependencies, you are guaranteed to visit all nodes in the
  /// graph at least once.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visit all children of this node in a recursive fashion. Also visits
  /// this node. If \ref shouldFollowIndirectCalls returns false, then this
  /// only follows DirectCall edges.
  ///
  /// \param Visitor Visitor Function.
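  ///
  /// Minimal usage sketch (assuming a SplitGraph \c SG and a node \c N),
  /// mirroring what \ref getDependencies does:
  /// \code
  ///   BitVector Reachable = SG.createNodesBitVector();
  ///   N.visitAllDependencies(
  ///       [&](const SplitGraph::Node &Dep) { Reachable.set(Dep.getID()); });
  /// \endcode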
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Adds the dependencies of this node in \p BV by setting the bit
  /// corresponding to each node.
  ///
  /// Implemented using \ref visitAllDependencies, hence it follows the same
  /// rules regarding dependencies traversal.
  ///
  /// \param[out] BV The bitvector where the bits should be set.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
  // together)
  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};

void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();
  // FIXME: If this can access SplitGraph in the future, use a BitVector
  // instead.
  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}

/// Checks if \p I has MD_callees and if it does, parses it and puts the
/// functions it lists in \p Callees.
///
/// \returns true if there was metadata and it was parsed correctly. false if
/// there was no MD or if it contained unknown entries and parsing failed.
/// If this returns false, \p Callees will contain incomplete information
/// and must not be used.
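///
/// For illustration, IR of this shape is handled (function names are
/// hypothetical):
/// \code
///   call void %fptr(), !callees !0
///   ...
///   !0 = !{ptr @a, ptr @b}
/// \endcode
/// Here, @a and @b are inserted into \p Callees and true is returned.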
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}

void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // FIXME(?): Is the callgraph really worth using if we have to iterate the
  // function again whenever it fails to give us enough information?

  // We build the graph by just iterating all functions in the module and
  // working on their direct callees. At the end, all nodes should be linked
  // together as expected.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    // Look at direct callees and create the necessary edges in the graph.
    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // Keep track of this function if it contains an indirect call and/or if
    // it can be indirectly called.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(Fn)) {
        // Look at all calls without a direct callee.
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        // Inline assembly can be ignored here; it is not treated as an
        // indirect call.
        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << "    found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;
        // If we failed to parse any !callees MD, or some was missing,
        // the entire KnownCallees list is now unreliable.
        KnownCallees.clear();

        // Everything else is handled conservatively. If we fall into the
        // conservative case don't bother analyzing further.
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << "    indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    Node &N = getNode(Cache, Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Post-process functions with indirect calls.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }

  // Now, find all entry points.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    // Functions with an Entry CC are always graph entry points too.
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    // This can be another entry point if it's not reachable by a kernel.
    // TODO: We could sort all of the possible new entries in a stable order
    // (e.g. by cost), then consume them one by one until
    // NodesReachableByKernels is all 1s. It'd allow us to avoid
    // considering some nodes as non-entries in some specific cases.
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }

#ifndef NDEBUG
  assert(verifyGraph());
#endif
}

#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check we respect the promise of Node::isGraphEntryPoint.
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry "
              "points!\n";
    return false;
  }

  return true;
}
#endif

CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  assert(&getNode(N->getID()) == N);
  return *N;
}

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}

//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//

/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies, collect the resulting
/// proposals, and choose the best one.
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }

  // Find the cheapest partition (lowest cost). In case of ties, always
  // returns the highest partition number.
  unsigned findCheapestPartition() const;

  /// Calculate the CodeSize and Bottleneck scores.
  void calculateScores();

#ifndef NDEBUG
  void verifyCompleteness() const;
#endif

  /// Only available after \ref calculateScores is called.
  ///
  /// A positive number indicating how much code duplication this proposal
  /// creates, as the ratio of the aggregated partition cost over the source
  /// module's cost. e.g. 1.2 means this proposal adds roughly 20% code size
  /// by duplicating some functions across partitions.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// A perfect score would be 1.0 (no duplication); the further above 1.0,
  /// the worse the duplication.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// Only available after \ref calculateScores is called.
  ///
  /// A number between [0, 1] which indicates how big of a bottleneck is
  /// expected from the largest partition.
  ///
  /// A score of 1.0 means the biggest partition is as big as the source
  /// module, so build time will be equal to or greater than the build time of
  /// the initial input.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// This is one of the metrics used to estimate this proposal's build time.
  double getBottleneckScore() const { return BottleneckScore; }

private:
  void updateScore(unsigned PID) {
    assert(SG);
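    // Note: this recomputes the cost of every partition, not just \p PID's;
    // TotalCost stays in sync by subtracting each stale cost and adding the
    // freshly computed one.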
    for (auto &[PCost, Nodes] : Partitions) {
      TotalCost -= PCost;
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  /// \see getCodeSizeScore
  double CodeSizeScore = 0.0;
  /// \see getBottleneckScore
  double BottleneckScore = 0.0;
  /// Aggregated cost of all partitions
  CostType TotalCost = 0;

  const SplitGraph *SG = nullptr;
  std::string Name;

  std::vector<std::pair<CostType, BitVector>> Partitions;
};

void SplitProposal::print(raw_ostream &OS) const {
  assert(SG);

  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}

unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}

void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  assert(SG);
  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
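  // Worked example (numbers purely illustrative): with ModuleCost = 1000,
  // TotalCost = 1100 and LargestPCost = 480, we get CodeSizeScore = 1.10
  // (roughly 10% of the code is duplicated across partitions) and
  // BottleneckScore = 0.48 (the largest partition holds about half of the
  // module's code).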
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}

#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif

//===-- RecursiveSearchStrategy -------------------------------------------===//

/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple
/// possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we
/// haven't reached maximum search depth, we recurse and explore both options
/// and their consequences. Both branches will yield a proposal, and the
/// driver will grade both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice.
/// Most of the time we will just use the least-pressured (cheapest)
/// partition, but if a cluster is particularly big and there is a good amount
/// of overlap with an existing partition, we will choose that partition
/// instead.
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  /// Collects all graph entry points' clusters and sorts them so the most
  /// expensive clusters are viewed first. This will merge clusters together
  /// if they share a non-copyable dependency.
  void setupWorkList();

  /// Recursive function that assigns the worklist item at \p Idx into a
  /// partition of \p SP.
  ///
  /// \p Depth is the current search depth. When this value is equal to
  /// \ref MaxDepth, we can no longer recurse.
  ///
  /// This function only recurses if there is more than one possible
  /// assignment, otherwise it is iterative to avoid creating a call stack
  /// that is as big as \ref WorkList.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \return A pair: first element is the PID of the partition that has the
  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
  /// found with at least one element in common. The second element is the
  /// aggregated cost of all dependencies in common between \p Entry and that
  /// partition.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  // A Cluster is considered large when its cost, excluding entry points,
  // exceeds this value.
  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // Arbitrary max value as a safeguard. Anything above 10 will already be
  // slow; this is just a max value to prevent extreme resource exhaustion or
  // unbounded run time.
  if (MaxDepth > 16)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");
  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}

void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(/*Depth=*/0, /*Idx=*/0, SP);
  }
}

void RecursiveSearchSplitting::setupWorkList() {
  // e.g. if A and B are two worklist items, and they both call a non-copyable
  // dependency C, this does:
  //    A=C
  //    B=C
  //    => NodeEC will create a single group (A, B, C) and we create a new
  //    WorkList entry for that group.

  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate costs and other useful information.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << " - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}

void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    //
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // Default option is to do load-balancing, AKA assign to the least
    // pressured partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // Explore assigning to the kernel that contains the most dependencies in
    // common.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // We can choose to explore only one path if we only have one valid path,
    // or if we reached maximum search depth and can no longer branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID) // no similar PID found
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      // We have to choose one path. Use a heuristic to guess which one will
      // be more appropriate.
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Check if the amount of code in common makes it worth it.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge) {
          // For debug, just print "L", so we'll see "L3=P3" for instance,
          // which will mean we reached max depth and chose P3 based on this
          // heuristic.
          LLVM_DEBUG(dbgs() << 'L');
          SinglePIDToTry = MostSimilarPID;
        }
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 2: Explore candidates.

    // When we only explore one possible path, and thus branch depth doesn't
    // increase, do not recurse; iterate instead.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      // Only one path to explore, don't clone SP, don't increase depth.
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // We explore multiple paths: recurse at increased depth, then stop this
    // function.

    LLVM_DEBUG(dbgs() << '\n');

    // lb = load balancing = put in cheapest partition
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    // ms = most similar = put in partition with the most in common
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  // Step 3: If we assigned all WorkList items, submit the proposal.

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  LLVM_DEBUG(dbgs() << '\n');
  SubmitProposal(SP);
}

std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  // We take the partition that is the most similar using Cost as a metric.
  // So we take the set of nodes in common, compute their aggregated cost, and
  // pick the partition with the highest cost in common.
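  // e.g. if the cluster shares nodes costing 40 with P0 and nodes costing 75
  // with P2, then {P2, 75} is returned (ties go to the higher PID).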
| 1178 | unsigned ChosenPID = InvalidPID; |
| 1179 | CostType ChosenCost = 0; |
| 1180 | for (unsigned PID = 0; PID < NumParts; ++PID) { |
| 1181 | BitVector BV = SP[PID]; |
| 1182 | BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?! |
| 1183 | |
| 1184 | if (BV.none()) |
| 1185 | continue; |
| 1186 | |
| 1187 | const CostType Cost = SG.calculateCost(BV); |
| 1188 | |
| 1189 | if (ChosenPID == InvalidPID || ChosenCost < Cost || |
| 1190 | (ChosenCost == Cost && PID > ChosenPID)) { |
| 1191 | ChosenPID = PID; |
| 1192 | ChosenCost = Cost; |
| 1193 | } |
| 1194 | } |
| 1195 | |
| 1196 | return {ChosenPID, ChosenCost}; |
| 1197 | } |
| 1198 | |
| 1199 | //===----------------------------------------------------------------------===// |
| 1200 | // DOTGraph Printing Support |
| 1201 | //===----------------------------------------------------------------------===// |
| 1202 | |
| 1203 | const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) { |
| 1204 | return E->Dst; |
| 1205 | } |
| 1206 | |
| 1207 | using SplitGraphEdgeDstIterator = |
| 1208 | mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>; |
| 1209 | |
| 1210 | } // namespace |
| 1211 | |
| 1212 | template <> struct GraphTraits<SplitGraph> { |
| 1213 | using NodeRef = const SplitGraph::Node *; |
| 1214 | using nodes_iterator = SplitGraph::nodes_iterator; |
| 1215 | using ChildIteratorType = SplitGraphEdgeDstIterator; |
| 1216 | |
| 1217 | using EdgeRef = const SplitGraph::Edge *; |
| 1218 | using ChildEdgeIteratorType = SplitGraph::edges_iterator; |
| 1219 | |
| 1220 | static NodeRef getEntryNode(NodeRef N) { return N; } |
| 1221 | |
| 1222 | static ChildIteratorType child_begin(NodeRef Ref) { |
| 1223 | return {Ref->outgoing_edges().begin(), mapEdgeToDst}; |
| 1224 | } |
| 1225 | static ChildIteratorType child_end(NodeRef Ref) { |
| 1226 | return {Ref->outgoing_edges().end(), mapEdgeToDst}; |
| 1227 | } |
| 1228 | |
| 1229 | static nodes_iterator nodes_begin(const SplitGraph &G) { |
| 1230 | return G.nodes().begin(); |
| 1231 | } |
| 1232 | static nodes_iterator nodes_end(const SplitGraph &G) { |
| 1233 | return G.nodes().end(); |
| 1234 | } |
| 1235 | }; |
| 1236 | |
| 1237 | template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits { |
| 1238 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} |
| 1239 | |
| 1240 | static std::string getGraphName(const SplitGraph &SG) { |
| 1241 | return SG.getModule().getName().str(); |
| 1242 | } |
| 1243 | |
| 1244 | std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) { |
| 1245 | return N->getName().str(); |
| 1246 | } |
| 1247 | |
| 1248 | static std::string getNodeDescription(const SplitGraph::Node *N, |
| 1249 | const SplitGraph &SG) { |
| 1250 | std::string Result; |
| 1251 | if (N->isEntryFunctionCC()) |
| 1252 | Result += "entry-fn-cc " ; |
| 1253 | if (N->isNonCopyable()) |
| 1254 | Result += "non-copyable " ; |
| 1255 | Result += "cost:" + std::to_string(val: N->getIndividualCost()); |
| 1256 | return Result; |
| 1257 | } |
| 1258 | |
| 1259 | static std::string getNodeAttributes(const SplitGraph::Node *N, |
| 1260 | const SplitGraph &SG) { |
| 1261 | return N->hasAnyIncomingEdges() ? "" : "color=\"red\"" ; |
| 1262 | } |
| 1263 | |
| 1264 | static std::string getEdgeAttributes(const SplitGraph::Node *N, |
| 1265 | SplitGraphEdgeDstIterator EI, |
| 1266 | const SplitGraph &SG) { |
| 1267 | |
| 1268 | switch ((*EI.getCurrent())->Kind) { |
| 1269 | case SplitGraph::EdgeKind::DirectCall: |
| 1270 | return "" ; |
| 1271 | case SplitGraph::EdgeKind::IndirectCall: |
| 1272 | return "style=\"dashed\"" ; |
| 1273 | } |
| 1274 | llvm_unreachable("Unknown SplitGraph::EdgeKind enum" ); |
| 1275 | } |
| 1276 | }; |
| 1277 | |
| 1278 | //===----------------------------------------------------------------------===// |
| 1279 | // Driver |
| 1280 | //===----------------------------------------------------------------------===// |
| 1281 | |
| 1282 | namespace { |
| 1283 | |
| 1284 | // If we didn't externalize GVs, then local GVs need to be conservatively |
| 1285 | // imported into every module (including their initializers), and then cleaned |
| 1286 | // up afterwards. |
| 1287 | static bool needsConservativeImport(const GlobalValue *GV) { |
| 1288 | if (const auto *Var = dyn_cast<GlobalVariable>(Val: GV)) |
| 1289 | return Var->hasLocalLinkage(); |
| 1290 | return isa<GlobalAlias>(Val: GV); |
| 1291 | } |
| 1292 | |
| 1293 | /// Prints a summary of the partition \p N, represented by module \p M, to \p |
| 1294 | /// OS. |
| 1295 | static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M, |
| 1296 | unsigned PartCost, unsigned ModuleCost) { |
| 1297 | OS << "*** Partition P" << N << " ***\n" ; |
| 1298 | |
| 1299 | for (const auto &Fn : M) { |
| 1300 | if (!Fn.isDeclaration()) |
| 1301 | OS << " - [function] " << Fn.getName() << "\n" ; |
| 1302 | } |
| 1303 | |
| 1304 | for (const auto &GV : M.globals()) { |
| 1305 | if (GV.hasInitializer()) |
| 1306 | OS << " - [global] " << GV.getName() << "\n" ; |
| 1307 | } |
| 1308 | |
| 1309 | OS << "Partition contains " << formatRatioOf(Num: PartCost, Dem: ModuleCost) |
| 1310 | << "% of the source\n" ; |
| 1311 | } |
| 1312 | |
| 1313 | static void evaluateProposal(SplitProposal &Best, SplitProposal New) { |
| 1314 | SplitModuleTimer SMT("proposal_evaluation" , "proposal ranking algorithm" ); |
| 1315 | |
| 1316 | LLVM_DEBUG({ |
| 1317 | New.verifyCompleteness(); |
| 1318 | if (DebugProposalSearch) |
| 1319 | New.print(dbgs()); |
| 1320 | }); |
| 1321 | |
| 1322 | const double CurBScore = Best.getBottleneckScore(); |
| 1323 | const double CurCSScore = Best.getCodeSizeScore(); |
| 1324 | const double NewBScore = New.getBottleneckScore(); |
| 1325 | const double NewCSScore = New.getCodeSizeScore(); |
| 1326 | |
| 1327 | // TODO: Improve this |
| 1328 | // We can probably lower the precision of the comparison at first |
| 1329 | // e.g. if we have |
| 1330 | // - (Current): BScore: 0.489 CSCore 1.105 |
| 1331 | // - (New): BScore: 0.475 CSCore 1.305 |
| 1332 | // Currently we'd choose the new one because the bottleneck score is |
| 1333 | // lower, but the new one duplicates more code. It may be worth it to |
| 1334 | // discard the new proposal as the impact on build time is negligible. |
| 1335 | |
| 1336 | // Compare them |
| 1337 | bool IsBest = false; |
| 1338 | if (NewBScore < CurBScore) |
| 1339 | IsBest = true; |
| 1340 | else if (NewBScore == CurBScore) |
| 1341 | IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker. |
| 1342 | |
| 1343 | if (IsBest) |
| 1344 | Best = std::move(New); |
| 1345 | |
| 1346 | LLVM_DEBUG(if (DebugProposalSearch) { |
| 1347 | if (IsBest) |
| 1348 | dbgs() << "[search] new best proposal!\n" ; |
| 1349 | else |
| 1350 | dbgs() << "[search] discarding - not profitable\n" ; |
| 1351 | }); |
| 1352 | } |
| 1353 | |
| 1354 | /// Trivial helper to create an identical copy of \p M. |
| 1355 | static std::unique_ptr<Module> cloneAll(const Module &M) { |
| 1356 | ValueToValueMapTy VMap; |
| 1357 | return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
| 1358 | } |
| 1359 | |
| 1360 | /// Writes \p SG as a DOTGraph to \ref ModuleDotCfgOutput if requested.
| 1361 | static void writeDOTGraph(const SplitGraph &SG) { |
| 1362 | if (ModuleDotCfgOutput.empty()) |
| 1363 | return; |
| 1364 | |
| 1365 | std::error_code EC; |
| 1366 | raw_fd_ostream OS(ModuleDotCfgOutput, EC); |
| 1367 | if (EC) { |
| 1368 | errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput |
| 1369 | << "' - DOTGraph will not be printed\n" ; |
| 1370 | } |
| 1371 | WriteGraph(O&: OS, G: SG, /*ShortName=*/ShortNames: false, |
| 1372 | /*Title=*/SG.getModule().getName()); |
| 1373 | } |
| 1374 | |
| 1375 | static void splitAMDGPUModule( |
| 1376 | GetTTIFn GetTTI, Module &M, unsigned NumParts, |
| 1377 | function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { |
| 1378 | CallGraph CG(M); |
| 1379 | |
| 1380 | // Externalize functions whose addresses are taken.
| 1381 | // |
| 1382 | // This is needed because partitioning is purely based on calls, but sometimes |
| 1383 | // a kernel/function may just look at the address of another local function |
| 1384 | // and not do anything (no calls). After partitioning, that local function may |
| 1385 | // end up in a different module (so it's just a declaration in the module |
| 1386 | // where its address is taken), which emits an "undefined hidden symbol" linker
| 1387 | // error. |
| 1388 | // |
| 1389 | // Additionally, it guides partitioning to not duplicate this function if it's |
| 1390 | // called directly at some point. |
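| | //
| | // For instance (hypothetical IR), "@table = internal global [1 x ptr] [ptr @f]"
| | // references @f without creating any call edge, so @f and @table's users could
| | // otherwise be assigned to different partitions.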
| 1391 | // |
| 1392 | // TODO: Could we be smarter about this? This makes all functions whose
| 1393 | // addresses are taken non-copyable. We should probably model this type of |
| 1394 | // constraint in the graph and use it to guide splitting, instead of |
| 1395 | // externalizing like this. Maybe non-copyable should really mean "keep one |
| 1396 | // visible copy, then internalize all other copies" for some functions? |
| 1397 | if (!NoExternalizeOnAddrTaken) { |
| 1398 | for (auto &Fn : M) { |
| 1399 | // TODO: Should aliases count? Probably not but they're so rare I'm not |
| 1400 | // sure it's worth fixing. |
| 1401 | if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) { |
| 1402 | LLVM_DEBUG(dbgs() << "[externalize] " ; Fn.printAsOperand(dbgs()); |
| 1403 | dbgs() << " because its address is taken\n" ); |
| 1404 | externalize(GV&: Fn); |
| 1405 | } |
| 1406 | } |
| 1407 | } |
| 1408 | |
| 1409 | // Externalize local GVs, which avoids duplicating their initializers, which |
| 1410 | // in turn helps keep code size in check.
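| | // For example, a large internal constant table referenced from several
| | // partitions would otherwise have its initializer cloned into each of them.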
| 1411 | if (!NoExternalizeGlobals) { |
| 1412 | for (auto &GV : M.globals()) { |
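| | // Note that externalize is deliberately called on every GV, not just the
| | // local ones being logged: besides changing linkage of local GVs, it also
| | // gives unnamed values a name that stays consistent across partitions.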
| 1413 | if (GV.hasLocalLinkage()) |
| 1414 | LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n'); |
| 1415 | externalize(GV); |
| 1416 | } |
| 1417 | } |
| 1418 | |
| 1419 | // Start by calculating the cost of every function in the module, as well as |
| 1420 | // the module's overall cost. |
| 1421 | FunctionsCostMap FnCosts; |
| 1422 | const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
| 1423 | |
| 1424 | // Build the SplitGraph, which represents the module's functions and models |
| 1425 | // their dependencies accurately. |
| 1426 | SplitGraph SG(M, FnCosts, ModuleCost); |
| 1427 | SG.buildGraph(CG); |
| 1428 | |
| 1429 | if (SG.empty()) { |
| 1430 | LLVM_DEBUG( |
| 1431 | dbgs() |
| 1432 | << "[!] no nodes in graph, input is empty - no splitting possible\n" ); |
| 1433 | ModuleCallback(cloneAll(M)); |
| 1434 | return; |
| 1435 | } |
| 1436 | |
| 1437 | LLVM_DEBUG({ |
| 1438 | dbgs() << "[graph] nodes:\n" ; |
| 1439 | for (const SplitGraph::Node *N : SG.nodes()) { |
| 1440 | dbgs() << " - [" << N->getID() << "]: " << N->getName() << " " |
| 1441 | << (N->isGraphEntryPoint() ? "(entry)" : "" ) << " " |
| 1442 | << (N->isNonCopyable() ? "(noncopyable)" : "" ) << "\n" ; |
| 1443 | } |
| 1444 | }); |
| 1445 | |
| 1446 | writeDOTGraph(SG); |
| 1447 | |
| 1448 | LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n" ); |
| 1449 | |
| 1450 | std::optional<SplitProposal> Proposal; |
| 1451 | const auto EvaluateProposal = [&](SplitProposal SP) { |
| 1452 | SP.calculateScores(); |
| 1453 | if (!Proposal) |
| 1454 | Proposal = std::move(SP); |
| 1455 | else |
| 1456 | evaluateProposal(*Proposal, std::move(SP));
| 1457 | }; |
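| | // Only the best proposal seen so far is retained: each candidate is scored
| | // once, then either becomes the new best or is discarded immediately.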
| 1458 | |
| 1459 | // TODO: It would be very easy to create new strategies by just adding a base |
| 1460 | // class to RecursiveSearchSplitting and abstracting it away. |
| 1461 | RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run(); |
| 1462 | LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: " |
| 1463 | << Proposal->getName() << "\n";);
| 1464 | |
| 1465 | if (!Proposal) { |
| 1466 | LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n" ); |
| 1467 | ModuleCallback(cloneAll(M)); |
| 1468 | return; |
| 1469 | } |
| 1470 | |
| 1471 | LLVM_DEBUG(Proposal->print(dbgs());); |
| 1472 | |
| 1473 | std::optional<raw_fd_ostream> SummariesOS; |
| 1474 | if (!PartitionSummariesOutput.empty()) { |
| 1475 | std::error_code EC; |
| 1476 | SummariesOS.emplace(PartitionSummariesOutput, EC);
| 1477 | if (EC)
| 1478 | errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
| 1479 | << "' - Partition summaries will not be printed\n";
| 1480 | } |
| 1481 | |
| 1482 | // One module will import all GlobalValues that are not Functions |
| 1483 | // and are not subject to conservative import. |
| 1484 | bool ImportAllGVs = true; |
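| | // The flag is cleared right after the first non-empty partition is cloned
| | // below, so exactly one output module owns those shared GlobalValues.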
| 1485 | |
| 1486 | for (unsigned PID = 0; PID < NumParts; ++PID) { |
| 1487 | SplitModuleTimer SMT2("modules_creation" , |
| 1488 | "creating modules for each partition" ); |
| 1489 | LLVM_DEBUG(dbgs() << "[split] creating new modules\n" ); |
| 1490 | |
| 1491 | DenseSet<const Function *> FnsInPart; |
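| | // (*Proposal)[PID] is the bit vector of node IDs assigned to partition PID.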
| 1492 | for (unsigned NodeID : (*Proposal)[PID].set_bits()) |
| 1493 | FnsInPart.insert(&SG.getNode(NodeID).getFunction());
| 1494 | |
| 1495 | // Don't create empty modules. |
| 1496 | if (FnsInPart.empty()) { |
| 1497 | LLVM_DEBUG(dbgs() << "[split] P" << PID |
| 1498 | << " is empty, not creating module\n" ); |
| 1499 | continue; |
| 1500 | } |
| 1501 | |
| 1502 | ValueToValueMapTy VMap; |
| 1503 | CostType PartCost = 0; |
| 1504 | std::unique_ptr<Module> MPart( |
| 1505 | CloneModule(M, VMap, [&](const GlobalValue *GV) {
| 1506 | // Functions go in their assigned partition.
| 1507 | if (const auto *Fn = dyn_cast<Function>(GV)) {
| 1508 | if (FnsInPart.contains(Fn)) {
| 1509 | PartCost += SG.getCost(*Fn);
| 1510 | return true; |
| 1511 | } |
| 1512 | return false; |
| 1513 | } |
| 1514 | |
| 1515 | // Everything else goes in the first non-empty module we create. |
| 1516 | return ImportAllGVs || needsConservativeImport(GV); |
| 1517 | })); |
| 1518 | |
| 1519 | ImportAllGVs = false; |
| 1520 | |
| 1521 | // FIXME: Aliases aren't seen often, and their handling isn't perfect so |
| 1522 | // bugs are possible. |
| 1523 | |
| 1524 | // Clean up conservatively imported GVs without any users.
| 1525 | for (auto &GV : make_early_inc_range(MPart->global_values())) {
| 1526 | if (needsConservativeImport(&GV) && GV.use_empty())
| 1527 | GV.eraseFromParent(); |
| 1528 | } |
| 1529 | |
| 1530 | if (SummariesOS) |
| 1531 | printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
| 1532 | |
| 1533 | LLVM_DEBUG( |
| 1534 | printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost)); |
| 1535 | |
| 1536 | ModuleCallback(std::move(MPart)); |
| 1537 | } |
| 1538 | } |
| 1539 | } // namespace |
| 1540 | |
| 1541 | PreservedAnalyses AMDGPUSplitModulePass::run(Module &M, |
| 1542 | ModuleAnalysisManager &MAM) { |
| 1543 | SplitModuleTimer SMT( |
| 1544 | "total" , "total pass runtime (incl. potentially waiting for lockfile)" ); |
| 1545 | |
| 1546 | FunctionAnalysisManager &FAM = |
| 1547 | MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
| 1548 | const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
| 1549 | return FAM.getResult<TargetIRAnalysis>(F);
| 1550 | }; |
| 1551 | |
| 1552 | bool Done = false; |
| 1553 | #ifndef NDEBUG |
| 1554 | if (UseLockFile) { |
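| | // The lockfile serializes debug output across concurrently running
| | // processes so interleaved dbgs() streams stay readable.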
| 1555 | SmallString<128> LockFilePath; |
| 1556 | sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath); |
| 1557 | sys::path::append(LockFilePath, "amdgpu-split-module-debug");
| 1558 | LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
| 1559 | << "'\n");
| 1560 | |
| 1561 | while (true) { |
| 1562 | llvm::LockFileManager Lock(LockFilePath.str()); |
| 1563 | bool Owned; |
| 1564 | if (Error Err = Lock.tryLock().moveInto(Owned)) { |
| 1565 | consumeError(std::move(Err)); |
| 1566 | LLVM_DEBUG( |
| 1567 | dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug " |
| 1568 | "output may be mangled by other processes\n" ); |
| 1569 | } else if (!Owned) { |
| 1570 | switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) { |
| 1571 | case WaitForUnlockResult::Success: |
| 1572 | break; |
| 1573 | case WaitForUnlockResult::OwnerDied: |
| 1574 | continue; // try again to get the lock. |
| 1575 | case WaitForUnlockResult::Timeout: |
| 1576 | LLVM_DEBUG( |
| 1577 | dbgs() |
| 1578 | << "[amdgpu-split-module] unable to acquire lockfile, debug " |
| 1579 | "output may be mangled by other processes\n" ); |
| 1580 | Lock.unsafeMaybeUnlock(); |
| 1581 | break; // give up |
| 1582 | } |
| 1583 | } |
| 1584 | |
| 1585 | splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); |
| 1586 | Done = true; |
| 1587 | break; |
| 1588 | } |
| 1589 | } |
| 1590 | #endif |
| 1591 | |
| 1592 | if (!Done) |
| 1593 | splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
| 1594 | |
| 1595 | // We can change linkage/visibilities in the input, so consider that nothing
| 1596 | // is preserved, just to be safe. This pass runs last anyway.
| 1597 | return PreservedAnalyses::none(); |
| 1598 | } |
| 1599 | } // namespace llvm |
| 1600 | |