| 1 | //===- CallGraphSort.cpp --------------------------------------------------===// | 
|---|
| 2 | // | 
|---|
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|---|
| 4 | // See https://llvm.org/LICENSE.txt for license information. | 
|---|
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|---|
| 6 | // | 
|---|
| 7 | //===----------------------------------------------------------------------===// | 
|---|
| 8 | /// | 
|---|
| 9 | /// The file is responsible for sorting sections using LLVM call graph profile | 
|---|
| 10 | /// data by placing frequently executed code sections together. The goal of the | 
|---|
| 11 | /// placement is to improve the runtime performance of the final executable by | 
|---|
| 12 | /// arranging code sections so that i-TLB misses and i-cache misses are reduced. | 
|---|
| 13 | /// | 
|---|
| 14 | /// The algorithm first builds a call graph based on the profile data and then | 
|---|
| 15 | /// iteratively merges "chains" (ordered lists) of input sections which will be | 
|---|
| 16 | /// laid out as a unit. There are two implementations for deciding how to | 
|---|
| 17 | /// merge a pair of chains: | 
|---|
| 18 | ///  - a simpler one, referred to as Call-Chain Clustering (C^3), that follows | 
|---|
| 19 | ///    "Optimizing Function Placement for Large-Scale Data-Center Applications" | 
|---|
| 20 | /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf | 
|---|
| 21 | /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which | 
|---|
| 22 | ///   typically produces layouts with higher locality, and hence, yields fewer | 
|---|
| 23 | ///   instruction cache misses on large binaries. | 
|---|
| 24 | //===----------------------------------------------------------------------===// | 
|---|
| 25 |  | 
|---|
| 26 | #include "CallGraphSort.h" | 
|---|
| 27 | #include "InputFiles.h" | 
|---|
| 28 | #include "InputSection.h" | 
|---|
| 29 | #include "Symbols.h" | 
|---|
| 30 | #include "llvm/Support/FileSystem.h" | 
|---|
| 31 | #include "llvm/Transforms/Utils/CodeLayout.h" | 
|---|
| 32 |  | 
|---|
| 33 | #include <numeric> | 
|---|
| 34 |  | 
|---|
| 35 | using namespace llvm; | 
|---|
| 36 | using namespace lld; | 
|---|
| 37 | using namespace lld::elf; | 
|---|
| 38 |  | 
|---|
| 39 | namespace { | 
|---|
| 40 | struct Edge { | 
|---|
| 41 | int from; | 
|---|
| 42 | uint64_t weight; | 
|---|
| 43 | }; | 
|---|
| 44 |  | 
|---|
| 45 | struct Cluster { | 
|---|
| 46 | Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {} | 
|---|
| 47 |  | 
|---|
| 48 | double getDensity() const { | 
|---|
| 49 | if (size == 0) | 
|---|
| 50 | return 0; | 
|---|
| 51 | return double(weight) / double(size); | 
|---|
| 52 | } | 
|---|
| 53 |  | 
|---|
| 54 | int next; | 
|---|
| 55 | int prev; | 
|---|
| 56 | uint64_t size; | 
|---|
| 57 | uint64_t weight = 0; | 
|---|
| 58 | uint64_t initialWeight = 0; | 
|---|
| 59 | Edge bestPred = {.from: -1, .weight: 0}; | 
|---|
| 60 | }; | 
|---|
| 61 |  | 
|---|
| 62 | /// Implementation of the Call-Chain Clustering (C^3). The goal of this | 
|---|
| 63 | /// algorithm is to improve runtime performance of the executable by arranging | 
|---|
| 64 | /// code sections such that page table and i-cache misses are minimized. | 
|---|
| 65 | /// | 
|---|
| 66 | /// Definitions: | 
|---|
| 67 | /// * Cluster | 
|---|
| 68 | ///   * An ordered list of input sections which are laid out as a unit. At the | 
|---|
| 69 | ///     beginning of the algorithm each input section has its own cluster and | 
|---|
| 70 | ///     the weight of the cluster is the sum of the weight of all incoming | 
|---|
| 71 | ///     edges. | 
|---|
| 72 | /// * Call-Chain Clustering (C³) Heuristic | 
|---|
| 73 | ///   * Defines when and how clusters are combined. Pick the highest weighted | 
|---|
| 74 | ///     input section then add it to its most likely predecessor if it wouldn't | 
|---|
| 75 | ///     penalize it too much. | 
|---|
| 76 | /// * Density | 
|---|
| 77 | ///   * The weight of the cluster divided by the size of the cluster. This is a | 
|---|
| 78 | ///     proxy for the amount of execution time spent per byte of the cluster. | 
|---|
| 79 | /// | 
|---|
| 80 | /// It does so given a call graph profile by the following: | 
|---|
| 81 | /// * Build a weighted call graph from the call graph profile | 
|---|
| 82 | /// * Sort input sections by weight | 
|---|
| 83 | /// * For each input section starting with the highest weight | 
|---|
| 84 | ///   * Find its most likely predecessor cluster | 
|---|
| 85 | ///   * Check if the combined cluster would be too large, or would have too low | 
|---|
| 86 | ///     a density. | 
|---|
| 87 | ///   * If not, then combine the clusters. | 
|---|
| 88 | /// * Sort non-empty clusters by density | 
|---|
| 89 | class CallGraphSort { | 
|---|
| 90 | public: | 
|---|
| 91 | CallGraphSort(Ctx &); | 
|---|
| 92 |  | 
|---|
| 93 | DenseMap<const InputSectionBase *, int> run(); | 
|---|
| 94 |  | 
|---|
| 95 | private: | 
|---|
| 96 | Ctx &ctx; | 
|---|
| 97 | std::vector<Cluster> clusters; | 
|---|
| 98 | std::vector<const InputSectionBase *> sections; | 
|---|
| 99 | }; | 
|---|
| 100 |  | 
|---|
| 101 | // Maximum amount the combined cluster density can be worse than the original | 
|---|
| 102 | // cluster to consider merging. | 
|---|
| 103 | constexpr int MAX_DENSITY_DEGRADATION = 8; | 
|---|
| 104 |  | 
|---|
| 105 | // Maximum cluster size in bytes. | 
|---|
| 106 | constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; | 
|---|
| 107 | } // end anonymous namespace | 
|---|
| 108 |  | 
|---|
| 109 | using SectionPair = | 
|---|
| 110 | std::pair<const InputSectionBase *, const InputSectionBase *>; | 
|---|
| 111 |  | 
|---|
| 112 | // Take the edge list in ctx.arg.callGraphProfile, resolve symbol names to | 
|---|
| 113 | // Symbols, and generate a graph between InputSections with the provided | 
|---|
| 114 | // weights. | 
|---|
| 115 | CallGraphSort::CallGraphSort(Ctx &ctx) : ctx(ctx) { | 
|---|
| 116 | MapVector<SectionPair, uint64_t> &profile = ctx.arg.callGraphProfile; | 
|---|
| 117 | DenseMap<const InputSectionBase *, int> secToCluster; | 
|---|
| 118 |  | 
|---|
| 119 | auto getOrCreateNode = [&](const InputSectionBase *isec) -> int { | 
|---|
| 120 | auto res = secToCluster.try_emplace(Key: isec, Args: clusters.size()); | 
|---|
| 121 | if (res.second) { | 
|---|
| 122 | sections.push_back(x: isec); | 
|---|
| 123 | clusters.emplace_back(args: clusters.size(), args: isec->getSize()); | 
|---|
| 124 | } | 
|---|
| 125 | return res.first->second; | 
|---|
| 126 | }; | 
|---|
| 127 |  | 
|---|
| 128 | // Create the graph. | 
|---|
| 129 | for (std::pair<SectionPair, uint64_t> &c : profile) { | 
|---|
| 130 | const auto *fromSB = cast<InputSectionBase>(Val: c.first.first); | 
|---|
| 131 | const auto *toSB = cast<InputSectionBase>(Val: c.first.second); | 
|---|
| 132 | uint64_t weight = c.second; | 
|---|
| 133 |  | 
|---|
| 134 | // Ignore edges between input sections belonging to different output | 
|---|
| 135 | // sections.  This is done because otherwise we would end up with clusters | 
|---|
| 136 | // containing input sections that can't actually be placed adjacently in the | 
|---|
| 137 | // output.  This messes with the cluster size and density calculations.  We | 
|---|
| 138 | // would also end up moving input sections in other output sections without | 
|---|
| 139 | // moving them closer to what calls them. | 
|---|
| 140 | if (fromSB->getOutputSection() != toSB->getOutputSection()) | 
|---|
| 141 | continue; | 
|---|
| 142 |  | 
|---|
| 143 | int from = getOrCreateNode(fromSB); | 
|---|
| 144 | int to = getOrCreateNode(toSB); | 
|---|
| 145 |  | 
|---|
| 146 | clusters[to].weight += weight; | 
|---|
| 147 |  | 
|---|
| 148 | if (from == to) | 
|---|
| 149 | continue; | 
|---|
| 150 |  | 
|---|
| 151 | // Remember the best edge. | 
|---|
| 152 | Cluster &toC = clusters[to]; | 
|---|
| 153 | if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) { | 
|---|
| 154 | toC.bestPred.from = from; | 
|---|
| 155 | toC.bestPred.weight = weight; | 
|---|
| 156 | } | 
|---|
| 157 | } | 
|---|
| 158 | for (Cluster &c : clusters) | 
|---|
| 159 | c.initialWeight = c.weight; | 
|---|
| 160 | } | 
|---|
| 161 |  | 
|---|
| 162 | // It's bad to merge clusters which would degrade the density too much. | 
|---|
| 163 | static bool isNewDensityBad(Cluster &a, Cluster &b) { | 
|---|
| 164 | double newDensity = double(a.weight + b.weight) / double(a.size + b.size); | 
|---|
| 165 | return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION; | 
|---|
| 166 | } | 
|---|
| 167 |  | 
|---|
| 168 | // Find the leader of V's belonged cluster (represented as an equivalence | 
|---|
| 169 | // class). We apply union-find path-halving technique (simple to implement) in | 
|---|
| 170 | // the meantime as it decreases depths and the time complexity. | 
|---|
| 171 | static int getLeader(int *leaders, int v) { | 
|---|
| 172 | while (leaders[v] != v) { | 
|---|
| 173 | leaders[v] = leaders[leaders[v]]; | 
|---|
| 174 | v = leaders[v]; | 
|---|
| 175 | } | 
|---|
| 176 | return v; | 
|---|
| 177 | } | 
|---|
| 178 |  | 
|---|
| 179 | static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx, | 
|---|
| 180 | Cluster &from, int fromIdx) { | 
|---|
| 181 | int tail1 = into.prev, tail2 = from.prev; | 
|---|
| 182 | into.prev = tail2; | 
|---|
| 183 | cs[tail2].next = intoIdx; | 
|---|
| 184 | from.prev = tail1; | 
|---|
| 185 | cs[tail1].next = fromIdx; | 
|---|
| 186 | into.size += from.size; | 
|---|
| 187 | into.weight += from.weight; | 
|---|
| 188 | from.size = 0; | 
|---|
| 189 | from.weight = 0; | 
|---|
| 190 | } | 
|---|
| 191 |  | 
|---|
| 192 | // Group InputSections into clusters using the Call-Chain Clustering heuristic | 
|---|
| 193 | // then sort the clusters by density. | 
|---|
| 194 | DenseMap<const InputSectionBase *, int> CallGraphSort::run() { | 
|---|
| 195 | std::vector<int> sorted(clusters.size()); | 
|---|
| 196 | std::unique_ptr<int[]> leaders(new int[clusters.size()]); | 
|---|
| 197 |  | 
|---|
| 198 | std::iota(first: leaders.get(), last: leaders.get() + clusters.size(), value: 0); | 
|---|
| 199 | std::iota(first: sorted.begin(), last: sorted.end(), value: 0); | 
|---|
| 200 | llvm::stable_sort(Range&: sorted, C: [&](int a, int b) { | 
|---|
| 201 | return clusters[a].getDensity() > clusters[b].getDensity(); | 
|---|
| 202 | }); | 
|---|
| 203 |  | 
|---|
| 204 | for (int l : sorted) { | 
|---|
| 205 | // The cluster index is the same as the index of its leader here because | 
|---|
| 206 | // clusters[L] has not been merged into another cluster yet. | 
|---|
| 207 | Cluster &c = clusters[l]; | 
|---|
| 208 |  | 
|---|
| 209 | // Don't consider merging if the edge is unlikely. | 
|---|
| 210 | if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight) | 
|---|
| 211 | continue; | 
|---|
| 212 |  | 
|---|
| 213 | int predL = getLeader(leaders: leaders.get(), v: c.bestPred.from); | 
|---|
| 214 | if (l == predL) | 
|---|
| 215 | continue; | 
|---|
| 216 |  | 
|---|
| 217 | Cluster *predC = &clusters[predL]; | 
|---|
| 218 | if (c.size + predC->size > MAX_CLUSTER_SIZE) | 
|---|
| 219 | continue; | 
|---|
| 220 |  | 
|---|
| 221 | if (isNewDensityBad(a&: *predC, b&: c)) | 
|---|
| 222 | continue; | 
|---|
| 223 |  | 
|---|
| 224 | leaders[l] = predL; | 
|---|
| 225 | mergeClusters(cs&: clusters, into&: *predC, intoIdx: predL, from&: c, fromIdx: l); | 
|---|
| 226 | } | 
|---|
| 227 |  | 
|---|
| 228 | // Sort remaining non-empty clusters by density. | 
|---|
| 229 | sorted.clear(); | 
|---|
| 230 | for (int i = 0, e = (int)clusters.size(); i != e; ++i) | 
|---|
| 231 | if (clusters[i].size > 0) | 
|---|
| 232 | sorted.push_back(x: i); | 
|---|
| 233 | llvm::stable_sort(Range&: sorted, C: [&](int a, int b) { | 
|---|
| 234 | return clusters[a].getDensity() > clusters[b].getDensity(); | 
|---|
| 235 | }); | 
|---|
| 236 |  | 
|---|
| 237 | DenseMap<const InputSectionBase *, int> orderMap; | 
|---|
| 238 | int curOrder = -clusters.size(); | 
|---|
| 239 | for (int leader : sorted) { | 
|---|
| 240 | for (int i = leader;;) { | 
|---|
| 241 | orderMap[sections[i]] = curOrder++; | 
|---|
| 242 | i = clusters[i].next; | 
|---|
| 243 | if (i == leader) | 
|---|
| 244 | break; | 
|---|
| 245 | } | 
|---|
| 246 | } | 
|---|
| 247 | if (!ctx.arg.printSymbolOrder.empty()) { | 
|---|
| 248 | std::error_code ec; | 
|---|
| 249 | raw_fd_ostream os(ctx.arg.printSymbolOrder, ec, sys::fs::OF_None); | 
|---|
| 250 | if (ec) { | 
|---|
| 251 | ErrAlways(ctx) << "cannot open "<< ctx.arg.printSymbolOrder << ": " | 
|---|
| 252 | << ec.message(); | 
|---|
| 253 | return orderMap; | 
|---|
| 254 | } | 
|---|
| 255 |  | 
|---|
| 256 | // Print the symbols ordered by C3, in the order of increasing curOrder | 
|---|
| 257 | // Instead of sorting all the orderMap, just repeat the loops above. | 
|---|
| 258 | for (int leader : sorted) | 
|---|
| 259 | for (int i = leader;;) { | 
|---|
| 260 | // Search all the symbols in the file of the section | 
|---|
| 261 | // and find out a Defined symbol with name that is within the section. | 
|---|
| 262 | for (Symbol *sym : sections[i]->file->getSymbols()) | 
|---|
| 263 | if (!sym->isSection()) // Filter out section-type symbols here. | 
|---|
| 264 | if (auto *d = dyn_cast<Defined>(Val: sym)) | 
|---|
| 265 | if (sections[i] == d->section) | 
|---|
| 266 | os << sym->getName() << "\n"; | 
|---|
| 267 | i = clusters[i].next; | 
|---|
| 268 | if (i == leader) | 
|---|
| 269 | break; | 
|---|
| 270 | } | 
|---|
| 271 | } | 
|---|
| 272 |  | 
|---|
| 273 | return orderMap; | 
|---|
| 274 | } | 
|---|
| 275 |  | 
|---|
| 276 | // Sort sections by the profile data using the Cache-Directed Sort algorithm. | 
|---|
| 277 | // The placement is done by optimizing the locality by co-locating frequently | 
|---|
| 278 | // executed code sections together. | 
|---|
| 279 | static DenseMap<const InputSectionBase *, int> | 
|---|
| 280 | computeCacheDirectedSortOrder(Ctx &ctx) { | 
|---|
| 281 | SmallVector<uint64_t, 0> funcSizes; | 
|---|
| 282 | SmallVector<uint64_t, 0> funcCounts; | 
|---|
| 283 | SmallVector<codelayout::EdgeCount, 0> callCounts; | 
|---|
| 284 | SmallVector<uint64_t, 0> callOffsets; | 
|---|
| 285 | SmallVector<const InputSectionBase *, 0> sections; | 
|---|
| 286 | DenseMap<const InputSectionBase *, size_t> secToTargetId; | 
|---|
| 287 |  | 
|---|
| 288 | auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { | 
|---|
| 289 | auto res = secToTargetId.try_emplace(Key: inSec, Args: sections.size()); | 
|---|
| 290 | if (res.second) { | 
|---|
| 291 | // inSec does not appear before in the graph. | 
|---|
| 292 | sections.push_back(Elt: inSec); | 
|---|
| 293 | funcSizes.push_back(Elt: inSec->getSize()); | 
|---|
| 294 | funcCounts.push_back(Elt: 0); | 
|---|
| 295 | } | 
|---|
| 296 | return res.first->second; | 
|---|
| 297 | }; | 
|---|
| 298 |  | 
|---|
| 299 | // Create the graph. | 
|---|
| 300 | for (std::pair<SectionPair, uint64_t> &c : ctx.arg.callGraphProfile) { | 
|---|
| 301 | const InputSectionBase *fromSB = cast<InputSectionBase>(Val: c.first.first); | 
|---|
| 302 | const InputSectionBase *toSB = cast<InputSectionBase>(Val: c.first.second); | 
|---|
| 303 | // Ignore edges between input sections belonging to different sections. | 
|---|
| 304 | if (fromSB->getOutputSection() != toSB->getOutputSection()) | 
|---|
| 305 | continue; | 
|---|
| 306 |  | 
|---|
| 307 | uint64_t weight = c.second; | 
|---|
| 308 | // Ignore edges with zero weight. | 
|---|
| 309 | if (weight == 0) | 
|---|
| 310 | continue; | 
|---|
| 311 |  | 
|---|
| 312 | size_t from = getOrCreateNode(fromSB); | 
|---|
| 313 | size_t to = getOrCreateNode(toSB); | 
|---|
| 314 | // Ignore self-edges (recursive calls). | 
|---|
| 315 | if (from == to) | 
|---|
| 316 | continue; | 
|---|
| 317 |  | 
|---|
| 318 | callCounts.push_back(Elt: {.src: from, .dst: to, .count: weight}); | 
|---|
| 319 | // Assume that the jump is at the middle of the input section. The profile | 
|---|
| 320 | // data does not contain jump offsets. | 
|---|
| 321 | callOffsets.push_back(Elt: (funcSizes[from] + 1) / 2); | 
|---|
| 322 | funcCounts[to] += weight; | 
|---|
| 323 | } | 
|---|
| 324 |  | 
|---|
| 325 | // Run the layout algorithm. | 
|---|
| 326 | std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout( | 
|---|
| 327 | FuncSizes: funcSizes, FuncCounts: funcCounts, CallCounts: callCounts, CallOffsets: callOffsets); | 
|---|
| 328 |  | 
|---|
| 329 | // Create the final order. | 
|---|
| 330 | DenseMap<const InputSectionBase *, int> orderMap; | 
|---|
| 331 | int curOrder = -sortedSections.size(); | 
|---|
| 332 | for (uint64_t secIdx : sortedSections) | 
|---|
| 333 | orderMap[sections[secIdx]] = curOrder++; | 
|---|
| 334 |  | 
|---|
| 335 | return orderMap; | 
|---|
| 336 | } | 
|---|
| 337 |  | 
|---|
| 338 | // Sort sections by the profile data provided by --callgraph-profile-file. | 
|---|
| 339 | // | 
|---|
| 340 | // This first builds a call graph based on the profile data then merges sections | 
|---|
| 341 | // according either to the C³ or Cache-Directed-Sort ordering algorithm. | 
|---|
| 342 | DenseMap<const InputSectionBase *, int> | 
|---|
| 343 | elf::computeCallGraphProfileOrder(Ctx &ctx) { | 
|---|
| 344 | if (ctx.arg.callGraphProfileSort == CGProfileSortKind::Cdsort) | 
|---|
| 345 | return computeCacheDirectedSortOrder(ctx); | 
|---|
| 346 | return CallGraphSort(ctx).run(); | 
|---|
| 347 | } | 
|---|
| 348 |  | 
|---|