1 | //===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Loops should be simplified before this analysis. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Analysis/BlockFrequencyInfoImpl.h" |
14 | #include "llvm/ADT/APInt.h" |
15 | #include "llvm/ADT/DenseMap.h" |
16 | #include "llvm/ADT/SCCIterator.h" |
17 | #include "llvm/ADT/SmallString.h" |
18 | #include "llvm/Config/llvm-config.h" |
19 | #include "llvm/IR/Function.h" |
20 | #include "llvm/Support/BlockFrequency.h" |
21 | #include "llvm/Support/BranchProbability.h" |
22 | #include "llvm/Support/Compiler.h" |
23 | #include "llvm/Support/Debug.h" |
24 | #include "llvm/Support/MathExtras.h" |
25 | #include "llvm/Support/ScaledNumber.h" |
26 | #include "llvm/Support/raw_ostream.h" |
27 | #include <algorithm> |
28 | #include <cassert> |
29 | #include <cstddef> |
30 | #include <cstdint> |
31 | #include <iterator> |
32 | #include <list> |
33 | #include <numeric> |
34 | #include <optional> |
35 | #include <utility> |
36 | #include <vector> |
37 | |
38 | using namespace llvm; |
39 | using namespace llvm::bfi_detail; |
40 | |
41 | #define DEBUG_TYPE "block-freq" |
42 | |
43 | namespace llvm { |
44 | cl::opt<bool> CheckBFIUnknownBlockQueries( |
45 | "check-bfi-unknown-block-queries" , |
46 | cl::init(Val: false), cl::Hidden, |
47 | cl::desc("Check if block frequency is queried for an unknown block " |
48 | "for debugging missed BFI updates" )); |
49 | |
50 | cl::opt<bool> UseIterativeBFIInference( |
51 | "use-iterative-bfi-inference" , cl::Hidden, |
52 | cl::desc("Apply an iterative post-processing to infer correct BFI counts" )); |
53 | |
54 | cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock( |
55 | "iterative-bfi-max-iterations-per-block" , cl::init(Val: 1000), cl::Hidden, |
56 | cl::desc("Iterative inference: maximum number of update iterations " |
57 | "per block" )); |
58 | |
59 | cl::opt<double> IterativeBFIPrecision( |
60 | "iterative-bfi-precision" , cl::init(Val: 1e-12), cl::Hidden, |
61 | cl::desc("Iterative inference: delta convergence precision; smaller values " |
62 | "typically lead to better results at the cost of worsen runtime" )); |
63 | } // namespace llvm |
64 | |
65 | ScaledNumber<uint64_t> BlockMass::toScaled() const { |
66 | if (isFull()) |
67 | return ScaledNumber<uint64_t>(1, 0); |
68 | return ScaledNumber<uint64_t>(getMass() + 1, -64); |
69 | } |
70 | |
71 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
72 | LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); } |
73 | #endif |
74 | |
75 | static char getHexDigit(int N) { |
76 | assert(N < 16); |
77 | if (N < 10) |
78 | return '0' + N; |
79 | return 'a' + N - 10; |
80 | } |
81 | |
82 | raw_ostream &BlockMass::print(raw_ostream &OS) const { |
83 | for (int Digits = 0; Digits < 16; ++Digits) |
84 | OS << getHexDigit(N: Mass >> (60 - Digits * 4) & 0xf); |
85 | return OS; |
86 | } |
87 | |
88 | namespace { |
89 | |
90 | using BlockNode = BlockFrequencyInfoImplBase::BlockNode; |
91 | using Distribution = BlockFrequencyInfoImplBase::Distribution; |
92 | using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList; |
93 | using Scaled64 = BlockFrequencyInfoImplBase::Scaled64; |
94 | using LoopData = BlockFrequencyInfoImplBase::LoopData; |
95 | using Weight = BlockFrequencyInfoImplBase::Weight; |
96 | using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData; |
97 | |
98 | /// Dithering mass distributer. |
99 | /// |
100 | /// This class splits up a single mass into portions by weight, dithering to |
101 | /// spread out error. No mass is lost. The dithering precision depends on the |
102 | /// precision of the product of \a BlockMass and \a BranchProbability. |
103 | /// |
104 | /// The distribution algorithm follows. |
105 | /// |
106 | /// 1. Initialize by saving the sum of the weights in \a RemWeight and the |
107 | /// mass to distribute in \a RemMass. |
108 | /// |
109 | /// 2. For each portion: |
110 | /// |
111 | /// 1. Construct a branch probability, P, as the portion's weight divided |
112 | /// by the current value of \a RemWeight. |
113 | /// 2. Calculate the portion's mass as \a RemMass times P. |
114 | /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting |
115 | /// the current portion's weight and mass. |
116 | struct DitheringDistributer { |
117 | uint32_t RemWeight; |
118 | BlockMass RemMass; |
119 | |
120 | DitheringDistributer(Distribution &Dist, const BlockMass &Mass); |
121 | |
122 | BlockMass takeMass(uint32_t Weight); |
123 | }; |
124 | |
125 | } // end anonymous namespace |
126 | |
127 | DitheringDistributer::DitheringDistributer(Distribution &Dist, |
128 | const BlockMass &Mass) { |
129 | Dist.normalize(); |
130 | RemWeight = Dist.Total; |
131 | RemMass = Mass; |
132 | } |
133 | |
134 | BlockMass DitheringDistributer::takeMass(uint32_t Weight) { |
135 | assert(Weight && "invalid weight" ); |
136 | assert(Weight <= RemWeight); |
137 | BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); |
138 | |
139 | // Decrement totals (dither). |
140 | RemWeight -= Weight; |
141 | RemMass -= Mass; |
142 | return Mass; |
143 | } |
144 | |
145 | void Distribution::add(const BlockNode &Node, uint64_t Amount, |
146 | Weight::DistType Type) { |
147 | assert(Amount && "invalid weight of 0" ); |
148 | uint64_t NewTotal = Total + Amount; |
149 | |
150 | // Check for overflow. It should be impossible to overflow twice. |
151 | bool IsOverflow = NewTotal < Total; |
152 | assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow" ); |
153 | DidOverflow |= IsOverflow; |
154 | |
155 | // Update the total. |
156 | Total = NewTotal; |
157 | |
158 | // Save the weight. |
159 | Weights.push_back(Elt: Weight(Type, Node, Amount)); |
160 | } |
161 | |
162 | static void combineWeight(Weight &W, const Weight &OtherW) { |
163 | assert(OtherW.TargetNode.isValid()); |
164 | if (!W.Amount) { |
165 | W = OtherW; |
166 | return; |
167 | } |
168 | assert(W.Type == OtherW.Type); |
169 | assert(W.TargetNode == OtherW.TargetNode); |
170 | assert(OtherW.Amount && "Expected non-zero weight" ); |
171 | if (W.Amount > W.Amount + OtherW.Amount) |
172 | // Saturate on overflow. |
173 | W.Amount = UINT64_MAX; |
174 | else |
175 | W.Amount += OtherW.Amount; |
176 | } |
177 | |
178 | static void combineWeightsBySorting(WeightList &Weights) { |
179 | // Sort so edges to the same node are adjacent. |
180 | llvm::sort(C&: Weights, Comp: [](const Weight &L, const Weight &R) { |
181 | return L.TargetNode < R.TargetNode; |
182 | }); |
183 | |
184 | // Combine adjacent edges. |
185 | WeightList::iterator O = Weights.begin(); |
186 | for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; |
187 | ++O, (I = L)) { |
188 | *O = *I; |
189 | |
190 | // Find the adjacent weights to the same node. |
191 | for (++L; L != E && I->TargetNode == L->TargetNode; ++L) |
192 | combineWeight(W&: *O, OtherW: *L); |
193 | } |
194 | |
195 | // Erase extra entries. |
196 | Weights.erase(CS: O, CE: Weights.end()); |
197 | } |
198 | |
199 | static void combineWeightsByHashing(WeightList &Weights) { |
200 | // Collect weights into a DenseMap. |
201 | using HashTable = DenseMap<BlockNode::IndexType, Weight>; |
202 | |
203 | HashTable Combined(NextPowerOf2(A: 2 * Weights.size())); |
204 | for (const Weight &W : Weights) |
205 | combineWeight(W&: Combined[W.TargetNode.Index], OtherW: W); |
206 | |
207 | // Check whether anything changed. |
208 | if (Weights.size() == Combined.size()) |
209 | return; |
210 | |
211 | // Fill in the new weights. |
212 | Weights.clear(); |
213 | Weights.reserve(N: Combined.size()); |
214 | for (const auto &I : Combined) |
215 | Weights.push_back(Elt: I.second); |
216 | } |
217 | |
218 | static void combineWeights(WeightList &Weights) { |
219 | // Use a hash table for many successors to keep this linear. |
220 | if (Weights.size() > 128) { |
221 | combineWeightsByHashing(Weights); |
222 | return; |
223 | } |
224 | |
225 | combineWeightsBySorting(Weights); |
226 | } |
227 | |
228 | static uint64_t shiftRightAndRound(uint64_t N, int Shift) { |
229 | assert(Shift >= 0); |
230 | assert(Shift < 64); |
231 | if (!Shift) |
232 | return N; |
233 | return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); |
234 | } |
235 | |
236 | void Distribution::normalize() { |
237 | // Early exit for termination nodes. |
238 | if (Weights.empty()) |
239 | return; |
240 | |
241 | // Only bother if there are multiple successors. |
242 | if (Weights.size() > 1) |
243 | combineWeights(Weights); |
244 | |
245 | // Early exit when combined into a single successor. |
246 | if (Weights.size() == 1) { |
247 | Total = 1; |
248 | Weights.front().Amount = 1; |
249 | return; |
250 | } |
251 | |
252 | // Determine how much to shift right so that the total fits into 32-bits. |
253 | // |
254 | // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 |
255 | // for each weight can cause a 32-bit overflow. |
256 | int Shift = 0; |
257 | if (DidOverflow) |
258 | Shift = 33; |
259 | else if (Total > UINT32_MAX) |
260 | Shift = 33 - llvm::countl_zero(Val: Total); |
261 | |
262 | // Early exit if nothing needs to be scaled. |
263 | if (!Shift) { |
264 | // If we didn't overflow then combineWeights() shouldn't have changed the |
265 | // sum of the weights, but let's double-check. |
266 | assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), |
267 | [](uint64_t Sum, const Weight &W) { |
268 | return Sum + W.Amount; |
269 | }) && |
270 | "Expected total to be correct" ); |
271 | return; |
272 | } |
273 | |
274 | // Recompute the total through accumulation (rather than shifting it) so that |
275 | // it's accurate after shifting and any changes combineWeights() made above. |
276 | Total = 0; |
277 | |
278 | // Sum the weights to each node and shift right if necessary. |
279 | for (Weight &W : Weights) { |
280 | // Scale down below UINT32_MAX. Since Shift is larger than necessary, we |
281 | // can round here without concern about overflow. |
282 | assert(W.TargetNode.isValid()); |
283 | W.Amount = std::max(UINT64_C(1), b: shiftRightAndRound(N: W.Amount, Shift)); |
284 | assert(W.Amount <= UINT32_MAX); |
285 | |
286 | // Update the total. |
287 | Total += W.Amount; |
288 | } |
289 | assert(Total <= UINT32_MAX); |
290 | } |
291 | |
292 | void BlockFrequencyInfoImplBase::clear() { |
293 | // Swap with a default-constructed std::vector, since std::vector<>::clear() |
294 | // does not actually clear heap storage. |
295 | std::vector<FrequencyData>().swap(x&: Freqs); |
296 | IsIrrLoopHeader.clear(); |
297 | std::vector<WorkingData>().swap(x&: Working); |
298 | Loops.clear(); |
299 | } |
300 | |
301 | /// Clear all memory not needed downstream. |
302 | /// |
303 | /// Releases all memory not used downstream. In particular, saves Freqs. |
304 | static void cleanup(BlockFrequencyInfoImplBase &BFI) { |
305 | std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); |
306 | SparseBitVector<> (std::move(BFI.IsIrrLoopHeader)); |
307 | BFI.clear(); |
308 | BFI.Freqs = std::move(SavedFreqs); |
309 | BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); |
310 | } |
311 | |
312 | bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, |
313 | const LoopData *OuterLoop, |
314 | const BlockNode &Pred, |
315 | const BlockNode &Succ, |
316 | uint64_t Weight) { |
317 | if (!Weight) |
318 | Weight = 1; |
319 | |
320 | auto = [&OuterLoop](const BlockNode &Node) { |
321 | return OuterLoop && OuterLoop->isHeader(Node); |
322 | }; |
323 | |
324 | BlockNode Resolved = Working[Succ.Index].getResolvedNode(); |
325 | |
326 | #ifndef NDEBUG |
327 | auto debugSuccessor = [&](const char *Type) { |
328 | dbgs() << " =>" |
329 | << " [" << Type << "] weight = " << Weight; |
330 | if (!isLoopHeader(Resolved)) |
331 | dbgs() << ", succ = " << getBlockName(Succ); |
332 | if (Resolved != Succ) |
333 | dbgs() << ", resolved = " << getBlockName(Resolved); |
334 | dbgs() << "\n" ; |
335 | }; |
336 | (void)debugSuccessor; |
337 | #endif |
338 | |
339 | if (isLoopHeader(Resolved)) { |
340 | LLVM_DEBUG(debugSuccessor("backedge" )); |
341 | Dist.addBackedge(Node: Resolved, Amount: Weight); |
342 | return true; |
343 | } |
344 | |
345 | if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { |
346 | LLVM_DEBUG(debugSuccessor(" exit " )); |
347 | Dist.addExit(Node: Resolved, Amount: Weight); |
348 | return true; |
349 | } |
350 | |
351 | if (Resolved < Pred) { |
352 | if (!isLoopHeader(Pred)) { |
353 | // If OuterLoop is an irreducible loop, we can't actually handle this. |
354 | assert((!OuterLoop || !OuterLoop->isIrreducible()) && |
355 | "unhandled irreducible control flow" ); |
356 | |
357 | // Irreducible backedge. Abort. |
358 | LLVM_DEBUG(debugSuccessor("abort!!!" )); |
359 | return false; |
360 | } |
361 | |
362 | // If "Pred" is a loop header, then this isn't really a backedge; rather, |
363 | // OuterLoop must be irreducible. These false backedges can come only from |
364 | // secondary loop headers. |
365 | assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && |
366 | "unhandled irreducible control flow" ); |
367 | } |
368 | |
369 | LLVM_DEBUG(debugSuccessor(" local " )); |
370 | Dist.addLocal(Node: Resolved, Amount: Weight); |
371 | return true; |
372 | } |
373 | |
374 | bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( |
375 | const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { |
376 | // Copy the exit map into Dist. |
377 | for (const auto &I : Loop.Exits) |
378 | if (!addToDist(Dist, OuterLoop, Pred: Loop.getHeader(), Succ: I.first, |
379 | Weight: I.second.getMass())) |
380 | // Irreducible backedge. |
381 | return false; |
382 | |
383 | return true; |
384 | } |
385 | |
386 | /// Compute the loop scale for a loop. |
387 | void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { |
388 | // Compute loop scale. |
389 | LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n" ); |
390 | |
391 | // Infinite loops need special handling. If we give the back edge an infinite |
392 | // mass, they may saturate all the other scales in the function down to 1, |
393 | // making all the other region temperatures look exactly the same. Choose an |
394 | // arbitrary scale to avoid these issues. |
395 | // |
396 | // FIXME: An alternate way would be to select a symbolic scale which is later |
397 | // replaced to be the maximum of all computed scales plus 1. This would |
398 | // appropriately describe the loop as having a large scale, without skewing |
399 | // the final frequency computation. |
400 | const Scaled64 InfiniteLoopScale(1, 12); |
401 | |
402 | // LoopScale == 1 / ExitMass |
403 | // ExitMass == HeadMass - BackedgeMass |
404 | BlockMass TotalBackedgeMass; |
405 | for (auto &Mass : Loop.BackedgeMass) |
406 | TotalBackedgeMass += Mass; |
407 | BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; |
408 | |
409 | // Block scale stores the inverse of the scale. If this is an infinite loop, |
410 | // its exit mass will be zero. In this case, use an arbitrary scale for the |
411 | // loop scale. |
412 | Loop.Scale = |
413 | ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse(); |
414 | |
415 | LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" |
416 | << BlockMass::getFull() << " - " << TotalBackedgeMass |
417 | << ")\n" |
418 | << " - scale = " << Loop.Scale << "\n" ); |
419 | } |
420 | |
421 | /// Package up a loop. |
422 | void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { |
423 | LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n" ); |
424 | |
425 | // Clear the subloop exits to prevent quadratic memory usage. |
426 | for (const BlockNode &M : Loop.Nodes) { |
427 | if (auto *Loop = Working[M.Index].getPackagedLoop()) |
428 | Loop->Exits.clear(); |
429 | LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n" ); |
430 | } |
431 | Loop.IsPackaged = true; |
432 | } |
433 | |
434 | #ifndef NDEBUG |
435 | static void debugAssign(const BlockFrequencyInfoImplBase &BFI, |
436 | const DitheringDistributer &D, const BlockNode &T, |
437 | const BlockMass &M, const char *Desc) { |
438 | dbgs() << " => assign " << M << " (" << D.RemMass << ")" ; |
439 | if (Desc) |
440 | dbgs() << " [" << Desc << "]" ; |
441 | if (T.isValid()) |
442 | dbgs() << " to " << BFI.getBlockName(T); |
443 | dbgs() << "\n" ; |
444 | } |
445 | #endif |
446 | |
447 | void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, |
448 | LoopData *OuterLoop, |
449 | Distribution &Dist) { |
450 | BlockMass Mass = Working[Source.Index].getMass(); |
451 | LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n" ); |
452 | |
453 | // Distribute mass to successors as laid out in Dist. |
454 | DitheringDistributer D(Dist, Mass); |
455 | |
456 | for (const Weight &W : Dist.Weights) { |
457 | // Check for a local edge (non-backedge and non-exit). |
458 | BlockMass Taken = D.takeMass(Weight: W.Amount); |
459 | if (W.Type == Weight::Local) { |
460 | Working[W.TargetNode.Index].getMass() += Taken; |
461 | LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); |
462 | continue; |
463 | } |
464 | |
465 | // Backedges and exits only make sense if we're processing a loop. |
466 | assert(OuterLoop && "backedge or exit outside of loop" ); |
467 | |
468 | // Check for a backedge. |
469 | if (W.Type == Weight::Backedge) { |
470 | OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(B: W.TargetNode)] += Taken; |
471 | LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back" )); |
472 | continue; |
473 | } |
474 | |
475 | // This must be an exit. |
476 | assert(W.Type == Weight::Exit); |
477 | OuterLoop->Exits.push_back(Elt: std::make_pair(x: W.TargetNode, y&: Taken)); |
478 | LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit" )); |
479 | } |
480 | } |
481 | |
482 | static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, |
483 | const Scaled64 &Min, const Scaled64 &Max) { |
484 | // Scale the Factor to a size that creates integers. If possible scale |
485 | // integers so that Max == UINT64_MAX so that they can be best differentiated. |
486 | // Is is possible that the range between min and max cannot be accurately |
487 | // represented in a 64bit integer without either loosing precision for small |
488 | // values (so small unequal numbers all map to 1) or saturaturing big numbers |
489 | // loosing precision for big numbers (so unequal big numbers may map to |
490 | // UINT64_MAX). We choose to loose precision for small numbers. |
491 | const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT; |
492 | // Users often add up multiple BlockFrequency values or multiply them with |
493 | // things like instruction costs. Leave some room to avoid saturating |
494 | // operations reaching UIN64_MAX too early. |
495 | const unsigned Slack = 10; |
496 | Scaled64 ScalingFactor = Scaled64(1, MaxBits - Slack) / Max; |
497 | |
498 | // Translate the floats to integers. |
499 | LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max |
500 | << ", factor = " << ScalingFactor << "\n" ); |
501 | (void)Min; |
502 | for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { |
503 | Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor; |
504 | BFI.Freqs[Index].Integer = std::max(UINT64_C(1), b: Scaled.toInt<uint64_t>()); |
505 | LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " |
506 | << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled |
507 | << ", int = " << BFI.Freqs[Index].Integer << "\n" ); |
508 | } |
509 | } |
510 | |
511 | /// Unwrap a loop package. |
512 | /// |
513 | /// Visits all the members of a loop, adjusting their BlockData according to |
514 | /// the loop's pseudo-node. |
515 | static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { |
516 | LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) |
517 | << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale |
518 | << "\n" ); |
519 | Loop.Scale *= Loop.Mass.toScaled(); |
520 | Loop.IsPackaged = false; |
521 | LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n" ); |
522 | |
523 | // Propagate the head scale through the loop. Since members are visited in |
524 | // RPO, the head scale will be updated by the loop scale first, and then the |
525 | // final head scale will be used for updated the rest of the members. |
526 | for (const BlockNode &N : Loop.Nodes) { |
527 | const auto &Working = BFI.Working[N.Index]; |
528 | Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale |
529 | : BFI.Freqs[N.Index].Scaled; |
530 | Scaled64 New = Loop.Scale * F; |
531 | LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " |
532 | << New << "\n" ); |
533 | F = New; |
534 | } |
535 | } |
536 | |
537 | void BlockFrequencyInfoImplBase::unwrapLoops() { |
538 | // Set initial frequencies from loop-local masses. |
539 | for (size_t Index = 0; Index < Working.size(); ++Index) |
540 | Freqs[Index].Scaled = Working[Index].Mass.toScaled(); |
541 | |
542 | for (LoopData &Loop : Loops) |
543 | unwrapLoop(BFI&: *this, Loop); |
544 | } |
545 | |
546 | void BlockFrequencyInfoImplBase::finalizeMetrics() { |
547 | // Unwrap loop packages in reverse post-order, tracking min and max |
548 | // frequencies. |
549 | auto Min = Scaled64::getLargest(); |
550 | auto Max = Scaled64::getZero(); |
551 | for (size_t Index = 0; Index < Working.size(); ++Index) { |
552 | // Update min/max scale. |
553 | Min = std::min(a: Min, b: Freqs[Index].Scaled); |
554 | Max = std::max(a: Max, b: Freqs[Index].Scaled); |
555 | } |
556 | |
557 | // Convert to integers. |
558 | convertFloatingToInteger(BFI&: *this, Min, Max); |
559 | |
560 | // Clean up data structures. |
561 | cleanup(BFI&: *this); |
562 | |
563 | // Print out the final stats. |
564 | LLVM_DEBUG(dump()); |
565 | } |
566 | |
567 | BlockFrequency |
568 | BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { |
569 | if (!Node.isValid()) { |
570 | #ifndef NDEBUG |
571 | if (CheckBFIUnknownBlockQueries) { |
572 | SmallString<256> Msg; |
573 | raw_svector_ostream OS(Msg); |
574 | OS << "*** Detected BFI query for unknown block " << getBlockName(Node); |
575 | report_fatal_error(OS.str()); |
576 | } |
577 | #endif |
578 | return BlockFrequency(0); |
579 | } |
580 | return BlockFrequency(Freqs[Node.Index].Integer); |
581 | } |
582 | |
583 | std::optional<uint64_t> |
584 | BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F, |
585 | const BlockNode &Node, |
586 | bool AllowSynthetic) const { |
587 | return getProfileCountFromFreq(F, Freq: getBlockFreq(Node), AllowSynthetic); |
588 | } |
589 | |
590 | std::optional<uint64_t> BlockFrequencyInfoImplBase::getProfileCountFromFreq( |
591 | const Function &F, BlockFrequency Freq, bool AllowSynthetic) const { |
592 | auto EntryCount = F.getEntryCount(AllowSynthetic); |
593 | if (!EntryCount) |
594 | return std::nullopt; |
595 | // Use 128 bit APInt to do the arithmetic to avoid overflow. |
596 | APInt BlockCount(128, EntryCount->getCount()); |
597 | APInt BlockFreq(128, Freq.getFrequency()); |
598 | APInt EntryFreq(128, getEntryFreq().getFrequency()); |
599 | BlockCount *= BlockFreq; |
600 | // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned |
601 | // lshr by 1 gives EntryFreq/2. |
602 | BlockCount = (BlockCount + EntryFreq.lshr(shiftAmt: 1)).udiv(RHS: EntryFreq); |
603 | return BlockCount.getLimitedValue(); |
604 | } |
605 | |
606 | bool |
607 | BlockFrequencyInfoImplBase::(const BlockNode &Node) { |
608 | if (!Node.isValid()) |
609 | return false; |
610 | return IsIrrLoopHeader.test(Idx: Node.Index); |
611 | } |
612 | |
613 | Scaled64 |
614 | BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { |
615 | if (!Node.isValid()) |
616 | return Scaled64::getZero(); |
617 | return Freqs[Node.Index].Scaled; |
618 | } |
619 | |
620 | void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, |
621 | BlockFrequency Freq) { |
622 | assert(Node.isValid() && "Expected valid node" ); |
623 | assert(Node.Index < Freqs.size() && "Expected legal index" ); |
624 | Freqs[Node.Index].Integer = Freq.getFrequency(); |
625 | } |
626 | |
627 | std::string |
628 | BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { |
629 | return {}; |
630 | } |
631 | |
632 | std::string |
633 | BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { |
634 | return getBlockName(Node: Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*" ); |
635 | } |
636 | |
637 | void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { |
638 | Start = OuterLoop.getHeader(); |
639 | Nodes.reserve(n: OuterLoop.Nodes.size()); |
640 | for (auto N : OuterLoop.Nodes) |
641 | addNode(Node: N); |
642 | indexNodes(); |
643 | } |
644 | |
645 | void IrreducibleGraph::addNodesInFunction() { |
646 | Start = 0; |
647 | for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) |
648 | if (!BFI.Working[Index].isPackaged()) |
649 | addNode(Node: Index); |
650 | indexNodes(); |
651 | } |
652 | |
653 | void IrreducibleGraph::indexNodes() { |
654 | for (auto &I : Nodes) |
655 | Lookup[I.Node.Index] = &I; |
656 | } |
657 | |
658 | void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, |
659 | const BFIBase::LoopData *OuterLoop) { |
660 | if (OuterLoop && OuterLoop->isHeader(Node: Succ)) |
661 | return; |
662 | auto L = Lookup.find(Val: Succ.Index); |
663 | if (L == Lookup.end()) |
664 | return; |
665 | IrrNode &SuccIrr = *L->second; |
666 | Irr.Edges.push_back(x: &SuccIrr); |
667 | SuccIrr.Edges.push_front(x: &Irr); |
668 | ++SuccIrr.NumIn; |
669 | } |
670 | |
671 | namespace llvm { |
672 | |
673 | template <> struct GraphTraits<IrreducibleGraph> { |
674 | using GraphT = bfi_detail::IrreducibleGraph; |
675 | using NodeRef = const GraphT::IrrNode *; |
676 | using ChildIteratorType = GraphT::IrrNode::iterator; |
677 | |
678 | static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; } |
679 | static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } |
680 | static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } |
681 | }; |
682 | |
683 | } // end namespace llvm |
684 | |
685 | /// Find extra irreducible headers. |
686 | /// |
687 | /// Find entry blocks and other blocks with backedges, which exist when \c G |
688 | /// contains irreducible sub-SCCs. |
689 | static void ( |
690 | const BlockFrequencyInfoImplBase &BFI, |
691 | const IrreducibleGraph &G, |
692 | const std::vector<const IrreducibleGraph::IrrNode *> &SCC, |
693 | LoopData::NodeList &, LoopData::NodeList &Others) { |
694 | // Map from nodes in the SCC to whether it's an entry block. |
695 | SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; |
696 | |
697 | // InSCC also acts the set of nodes in the graph. Seed it. |
698 | for (const auto *I : SCC) |
699 | InSCC[I] = false; |
700 | |
701 | for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { |
702 | auto &Irr = *I->first; |
703 | for (const auto *P : make_range(x: Irr.pred_begin(), y: Irr.pred_end())) { |
704 | if (InSCC.count(Val: P)) |
705 | continue; |
706 | |
707 | // This is an entry block. |
708 | I->second = true; |
709 | Headers.push_back(Elt: Irr.Node); |
710 | LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) |
711 | << "\n" ); |
712 | break; |
713 | } |
714 | } |
715 | assert(Headers.size() >= 2 && |
716 | "Expected irreducible CFG; -loop-info is likely invalid" ); |
717 | if (Headers.size() == InSCC.size()) { |
718 | // Every block is a header. |
719 | llvm::sort(C&: Headers); |
720 | return; |
721 | } |
722 | |
723 | // Look for extra headers from irreducible sub-SCCs. |
724 | for (const auto &I : InSCC) { |
725 | // Entry blocks are already headers. |
726 | if (I.second) |
727 | continue; |
728 | |
729 | auto &Irr = *I.first; |
730 | for (const auto *P : make_range(x: Irr.pred_begin(), y: Irr.pred_end())) { |
731 | // Skip forward edges. |
732 | if (P->Node < Irr.Node) |
733 | continue; |
734 | |
735 | // Skip predecessors from entry blocks. These can have inverted |
736 | // ordering. |
737 | if (InSCC.lookup(Val: P)) |
738 | continue; |
739 | |
740 | // Store the extra header. |
741 | Headers.push_back(Elt: Irr.Node); |
742 | LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) |
743 | << "\n" ); |
744 | break; |
745 | } |
746 | if (Headers.back() == Irr.Node) |
747 | // Added this as a header. |
748 | continue; |
749 | |
750 | // This is not a header. |
751 | Others.push_back(Elt: Irr.Node); |
752 | LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n" ); |
753 | } |
754 | llvm::sort(C&: Headers); |
755 | llvm::sort(C&: Others); |
756 | } |
757 | |
758 | static void createIrreducibleLoop( |
759 | BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, |
760 | LoopData *OuterLoop, std::list<LoopData>::iterator Insert, |
761 | const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { |
762 | // Translate the SCC into RPO. |
763 | LLVM_DEBUG(dbgs() << " - found-scc\n" ); |
764 | |
765 | LoopData::NodeList ; |
766 | LoopData::NodeList Others; |
767 | findIrreducibleHeaders(BFI, G, SCC, Headers, Others); |
768 | |
769 | auto Loop = BFI.Loops.emplace(position: Insert, args&: OuterLoop, args: Headers.begin(), |
770 | args: Headers.end(), args: Others.begin(), args: Others.end()); |
771 | |
772 | // Update loop hierarchy. |
773 | for (const auto &N : Loop->Nodes) |
774 | if (BFI.Working[N.Index].isLoopHeader()) |
775 | BFI.Working[N.Index].Loop->Parent = &*Loop; |
776 | else |
777 | BFI.Working[N.Index].Loop = &*Loop; |
778 | } |
779 | |
780 | iterator_range<std::list<LoopData>::iterator> |
781 | BlockFrequencyInfoImplBase::analyzeIrreducible( |
782 | const IrreducibleGraph &G, LoopData *OuterLoop, |
783 | std::list<LoopData>::iterator Insert) { |
784 | assert((OuterLoop == nullptr) == (Insert == Loops.begin())); |
785 | auto Prev = OuterLoop ? std::prev(x: Insert) : Loops.end(); |
786 | |
787 | for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { |
788 | if (I->size() < 2) |
789 | continue; |
790 | |
791 | // Translate the SCC into RPO. |
792 | createIrreducibleLoop(BFI&: *this, G, OuterLoop, Insert, SCC: *I); |
793 | } |
794 | |
795 | if (OuterLoop) |
796 | return make_range(x: std::next(x: Prev), y: Insert); |
797 | return make_range(x: Loops.begin(), y: Insert); |
798 | } |
799 | |
800 | void |
801 | BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { |
802 | OuterLoop.Exits.clear(); |
803 | for (auto &Mass : OuterLoop.BackedgeMass) |
804 | Mass = BlockMass::getEmpty(); |
805 | auto O = OuterLoop.Nodes.begin() + 1; |
806 | for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) |
807 | if (!Working[I->Index].isPackaged()) |
808 | *O++ = *I; |
809 | OuterLoop.Nodes.erase(CS: O, CE: OuterLoop.Nodes.end()); |
810 | } |
811 | |
812 | void BlockFrequencyInfoImplBase::(LoopData &Loop) { |
813 | assert(Loop.isIrreducible() && "this only makes sense on irreducible loops" ); |
814 | |
815 | // Since the loop has more than one header block, the mass flowing back into |
816 | // each header will be different. Adjust the mass in each header loop to |
817 | // reflect the masses flowing through back edges. |
818 | // |
819 | // To do this, we distribute the initial mass using the backedge masses |
820 | // as weights for the distribution. |
821 | BlockMass LoopMass = BlockMass::getFull(); |
822 | Distribution Dist; |
823 | |
824 | LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n" ); |
825 | for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { |
826 | auto & = Loop.Nodes[H]; |
827 | auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(B: HeaderNode)]; |
828 | LLVM_DEBUG(dbgs() << " - Add back edge mass for node " |
829 | << getBlockName(HeaderNode) << ": " << BackedgeMass |
830 | << "\n" ); |
831 | if (BackedgeMass.getMass() > 0) |
832 | Dist.addLocal(Node: HeaderNode, Amount: BackedgeMass.getMass()); |
833 | else |
834 | LLVM_DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n" ); |
835 | } |
836 | |
837 | DitheringDistributer D(Dist, LoopMass); |
838 | |
839 | LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass |
840 | << " to headers using above weights\n" ); |
841 | for (const Weight &W : Dist.Weights) { |
842 | BlockMass Taken = D.takeMass(Weight: W.Amount); |
843 | assert(W.Type == Weight::Local && "all weights should be local" ); |
844 | Working[W.TargetNode.Index].getMass() = Taken; |
845 | LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); |
846 | } |
847 | } |
848 | |
849 | void BlockFrequencyInfoImplBase::(Distribution &Dist) { |
850 | BlockMass LoopMass = BlockMass::getFull(); |
851 | DitheringDistributer D(Dist, LoopMass); |
852 | for (const Weight &W : Dist.Weights) { |
853 | BlockMass Taken = D.takeMass(Weight: W.Amount); |
854 | assert(W.Type == Weight::Local && "all weights should be local" ); |
855 | Working[W.TargetNode.Index].getMass() = Taken; |
856 | LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); |
857 | } |
858 | } |
859 | |