1//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8#include "ProfileGenerator.h"
9#include "ErrorHandling.h"
10#include "MissingFrameInferrer.h"
11#include "PerfReader.h"
12#include "ProfiledBinary.h"
13#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14#include "llvm/ProfileData/ProfileCommon.h"
15#include <algorithm>
16#include <float.h>
17#include <unordered_set>
18#include <utility>
19
20cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
21 cl::Required,
22 cl::desc("Output profile file"));
23static cl::alias OutputA("o", cl::desc("Alias for --output"),
24 cl::aliasopt(OutputFilename));
25
26static cl::opt<SampleProfileFormat> OutputFormat(
27 "format", cl::desc("Format of output profile"), cl::init(Val: SPF_Ext_Binary),
28 cl::values(
29 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
30 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"),
31 clEnumValN(SPF_Text, "text", "Text encoding"),
32 clEnumValN(SPF_GCC, "gcc",
33 "GCC encoding (only meaningful for -sample)")));
34
35static cl::opt<bool> UseMD5(
36 "use-md5", cl::Hidden,
37 cl::desc("Use md5 to represent function names in the output profile (only "
38 "meaningful for -extbinary)"));
39
40static cl::opt<bool> PopulateProfileSymbolList(
41 "populate-profile-symbol-list", cl::init(Val: false), cl::Hidden,
42 cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
43
44static cl::opt<bool> FillZeroForAllFuncs(
45 "fill-zero-for-all-funcs", cl::init(Val: false), cl::Hidden,
46 cl::desc("Attribute all functions' range with zero count "
47 "even it's not hit by any samples."));
48
49static cl::opt<int32_t, true> RecursionCompression(
50 "compress-recursion",
51 cl::desc("Compressing recursion by deduplicating adjacent frame "
52 "sequences up to the specified size. -1 means no size limit."),
53 cl::Hidden,
54 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
55
56static cl::opt<bool>
57 TrimColdProfile("trim-cold-profile",
58 cl::desc("If the total count of the profile is smaller "
59 "than threshold, it will be trimmed."));
60
61static cl::opt<bool> CSProfMergeColdContext(
62 "csprof-merge-cold-context", cl::init(Val: true),
63 cl::desc("If the total count of context profile is smaller than "
64 "the threshold, it will be merged into context-less base "
65 "profile."));
66
67static cl::opt<uint32_t> CSProfMaxColdContextDepth(
68 "csprof-max-cold-context-depth", cl::init(Val: 1),
69 cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
70 "context-less base profile"));
71
72static cl::opt<int, true> CSProfMaxContextDepth(
73 "csprof-max-context-depth",
74 cl::desc("Keep the last K contexts while merging profile. -1 means no "
75 "depth limit."),
76 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
77
78static cl::opt<double> ProfileDensityThreshold(
79 "profile-density-threshold", llvm::cl::init(Val: 50),
80 llvm::cl::desc("If the profile density is below the given threshold, it "
81 "will be suggested to increase the sampling rate."),
82 llvm::cl::Optional);
83static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(Val: false),
84 llvm::cl::desc("show profile density details"),
85 llvm::cl::Optional);
86static cl::opt<int> ProfileDensityCutOffHot(
87 "profile-density-cutoff-hot", llvm::cl::init(Val: 990000),
88 llvm::cl::desc("Total samples cutoff for functions used to calculate "
89 "profile density."));
90
91static cl::opt<bool> UpdateTotalSamples(
92 "update-total-samples", llvm::cl::init(Val: false),
93 llvm::cl::desc(
94 "Update total samples by accumulating all its body samples."),
95 llvm::cl::Optional);
96
97static cl::opt<bool> GenCSNestedProfile(
98 "gen-cs-nested-profile", cl::Hidden, cl::init(Val: true),
99 cl::desc("Generate nested function profiles for CSSPGO"));
100
101cl::opt<bool> InferMissingFrames(
102 "infer-missing-frames", llvm::cl::init(Val: true),
103 llvm::cl::desc(
104 "Infer missing call frames due to compiler tail call elimination."),
105 llvm::cl::Optional);
106
107extern cl::opt<bool> LeadingIPOnly;
108
109using namespace llvm;
110using namespace sampleprof;
111
112namespace llvm {
113extern cl::opt<int> ProfileSummaryCutoffHot;
114extern cl::opt<bool> UseContextLessSummary;
115
116namespace sampleprof {
117
118// Initialize the MaxCompressionSize to -1 which means no size limit
119int32_t CSProfileGenerator::MaxCompressionSize = -1;
120
121int CSProfileGenerator::MaxContextDepth = -1;
122
123bool ProfileGeneratorBase::UseFSDiscriminator = false;
124
125std::unique_ptr<ProfileGeneratorBase>
126ProfileGeneratorBase::create(ProfiledBinary *Binary,
127 const ContextSampleCounterMap *SampleCounters,
128 bool ProfileIsCS) {
129 std::unique_ptr<ProfileGeneratorBase> Generator;
130 if (ProfileIsCS) {
131 Generator.reset(p: new CSProfileGenerator(Binary, SampleCounters));
132 } else {
133 Generator.reset(p: new ProfileGenerator(Binary, SampleCounters));
134 }
135 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
136 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
137
138 return Generator;
139}
140
141std::unique_ptr<ProfileGeneratorBase>
142ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
143 bool ProfileIsCS) {
144 std::unique_ptr<ProfileGeneratorBase> Generator;
145 if (ProfileIsCS) {
146 Generator.reset(p: new CSProfileGenerator(Binary, Profiles));
147 } else {
148 Generator.reset(p: new ProfileGenerator(Binary, std::move(Profiles)));
149 }
150 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
151 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
152
153 return Generator;
154}
155
156void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
157 SampleProfileMap &ProfileMap) {
158 // Populate profile symbol list if extended binary format is used.
159 ProfileSymbolList SymbolList;
160
161 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
162 Binary->populateSymbolListFromDWARF(SymbolList);
163 Writer->setProfileSymbolList(&SymbolList);
164 }
165
166 if (std::error_code EC = Writer->write(ProfileMap))
167 exitWithError(EC: std::move(EC));
168}
169
170void ProfileGeneratorBase::write() {
171 auto WriterOrErr = SampleProfileWriter::create(Filename: OutputFilename, Format: OutputFormat);
172 if (std::error_code EC = WriterOrErr.getError())
173 exitWithError(EC, Whence: OutputFilename);
174
175 if (UseMD5) {
176 if (OutputFormat != SPF_Ext_Binary)
177 WithColor::warning() << "-use-md5 is ignored. Specify "
178 "--format=extbinary to enable it\n";
179 else
180 WriterOrErr.get()->setUseMD5();
181 }
182
183 write(Writer: std::move(WriterOrErr.get()), ProfileMap);
184}
185
186void ProfileGeneratorBase::showDensitySuggestion(double Density) {
187 if (Density == 0.0)
188 WithColor::warning() << "The output profile is empty or the "
189 "--profile-density-cutoff-hot option is "
190 "set too low. Please check your command.\n";
191 else if (Density < ProfileDensityThreshold)
192 WithColor::warning()
193 << "Sample PGO is estimated to optimize better with "
194 << format(Fmt: "%.1f", Vals: ProfileDensityThreshold / Density)
195 << "x more samples. Please consider increasing sampling rate or "
196 "profiling for longer duration to get more samples.\n";
197
198 if (ShowDensity)
199 outs() << "Functions with density >= " << format(Fmt: "%.1f", Vals: Density)
200 << " account for "
201 << format(Fmt: "%.2f",
202 Vals: static_cast<double>(ProfileDensityCutOffHot) / 10000)
203 << "% total sample counts.\n";
204}
205
206bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
207 for (const auto &Prefix : FuncPrefixsToFilter) {
208 if (FS.getFuncName().starts_with(Prefix))
209 return true;
210 }
211
212 // Filter the function profiles for the inlinees. It's useful for fuzzy
213 // profile matching which flattens the profile and inlinees' samples are
214 // merged into top-level function.
215 for (auto &Callees :
216 const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
217 auto &CalleesMap = Callees.second;
218 for (auto I = CalleesMap.begin(); I != CalleesMap.end();) {
219 auto FS = I++;
220 if (filterAmbiguousProfile(FS&: FS->second))
221 CalleesMap.erase(position: FS);
222 }
223 }
224 return false;
225}
226
227// For built-in local initialization function such as __cxx_global_var_init,
228// __tls_init prefix function, there could be multiple versions of the functions
229// in the final binary. However, in the profile generation, we call
230// getCanonicalFnName to canonicalize the names which strips the suffixes.
231// Therefore, samples from different functions queries the same profile and the
232// samples are merged. As the functions are essentially different, entries of
233// the merged profile are ambiguous. In sample loader, the IR from one version
234// would be attributed towards a merged entries, which is inaccurate. Especially
235// for fuzzy profile matching, it gets multiple callsites(from different
236// function) but used to match one callsite, which misleads the matching and
237// causes a lot of false positives report. Hence, we want to filter them out
238// from the profile map during the profile generation time. The profiles are all
239// cold functions, it won't have perf impact.
240void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
241 for (auto I = ProfileMap.begin(); I != ProfileMap.end();) {
242 auto FS = I++;
243 if (filterAmbiguousProfile(FS&: FS->second))
244 ProfileMap.erase(It: FS);
245 }
246}
247
248void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
249 const RangeSample &Ranges) {
250
251 /*
252 Regions may overlap with each other. Using the boundary info, find all
253 disjoint ranges and their sample count. BoundaryPoint contains the count
254 multiple samples begin/end at this points.
255
256 |<--100-->| Sample1
257 |<------200------>| Sample2
258 A B C
259
260 In the example above,
261 Sample1 begins at A, ends at B, its value is 100.
262 Sample2 beings at A, ends at C, its value is 200.
263 For A, BeginCount is the sum of sample begins at A, which is 300 and no
264 samples ends at A, so EndCount is 0.
265 Then boundary points A, B, and C with begin/end counts are:
266 A: (300, 0)
267 B: (0, 100)
268 C: (0, 200)
269 */
270 struct BoundaryPoint {
271 // Sum of sample counts beginning at this point
272 uint64_t BeginCount = UINT64_MAX;
273 // Sum of sample counts ending at this point
274 uint64_t EndCount = UINT64_MAX;
275 // Is the begin point of a zero range.
276 bool IsZeroRangeBegin = false;
277 // Is the end point of a zero range.
278 bool IsZeroRangeEnd = false;
279
280 void addBeginCount(uint64_t Count) {
281 if (BeginCount == UINT64_MAX)
282 BeginCount = 0;
283 BeginCount += Count;
284 }
285
286 void addEndCount(uint64_t Count) {
287 if (EndCount == UINT64_MAX)
288 EndCount = 0;
289 EndCount += Count;
290 }
291 };
292
293 /*
294 For the above example. With boundary points, follwing logic finds two
295 disjoint region of
296
297 [A,B]: 300
298 [B+1,C]: 200
299
300 If there is a boundary point that both begin and end, the point itself
301 becomes a separate disjoint region. For example, if we have original
302 ranges of
303
304 |<--- 100 --->|
305 |<--- 200 --->|
306 A B C
307
308 there are three boundary points with their begin/end counts of
309
310 A: (100, 0)
311 B: (200, 100)
312 C: (0, 200)
313
314 the disjoint ranges would be
315
316 [A, B-1]: 100
317 [B, B]: 300
318 [B+1, C]: 200.
319
320 Example for zero value range:
321
322 |<--- 100 --->|
323 |<--- 200 --->|
324 |<--------------- 0 ----------------->|
325 A B C D E F
326
327 [A, B-1] : 0
328 [B, C] : 100
329 [C+1, D-1]: 0
330 [D, E] : 200
331 [E+1, F] : 0
332 */
333 std::map<uint64_t, BoundaryPoint> Boundaries;
334
335 for (const auto &Item : Ranges) {
336 assert(Item.first.first <= Item.first.second &&
337 "Invalid instruction range");
338 auto &BeginPoint = Boundaries[Item.first.first];
339 auto &EndPoint = Boundaries[Item.first.second];
340 uint64_t Count = Item.second;
341
342 BeginPoint.addBeginCount(Count);
343 EndPoint.addEndCount(Count);
344 if (Count == 0) {
345 BeginPoint.IsZeroRangeBegin = true;
346 EndPoint.IsZeroRangeEnd = true;
347 }
348 }
349
350 // Use UINT64_MAX to indicate there is no existing range between BeginAddress
351 // and the next valid address
352 uint64_t BeginAddress = UINT64_MAX;
353 int ZeroRangeDepth = 0;
354 uint64_t Count = 0;
355 for (const auto &Item : Boundaries) {
356 uint64_t Address = Item.first;
357 const BoundaryPoint &Point = Item.second;
358 if (Point.BeginCount != UINT64_MAX) {
359 if (BeginAddress != UINT64_MAX)
360 DisjointRanges[{BeginAddress, Address - 1}] = Count;
361 Count += Point.BeginCount;
362 BeginAddress = Address;
363 ZeroRangeDepth += Point.IsZeroRangeBegin;
364 }
365 if (Point.EndCount != UINT64_MAX) {
366 assert((BeginAddress != UINT64_MAX) &&
367 "First boundary point cannot be 'end' point");
368 DisjointRanges[{BeginAddress, Address}] = Count;
369 assert(Count >= Point.EndCount && "Mismatched live ranges");
370 Count -= Point.EndCount;
371 BeginAddress = Address + 1;
372 ZeroRangeDepth -= Point.IsZeroRangeEnd;
373 // If the remaining count is zero and it's no longer in a zero range, this
374 // means we consume all the ranges before, thus mark BeginAddress as
375 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
376 // [<---- 10 ---->]
377 // [<---- 20 ---->]
378 // A B C D
379 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
380 // have the [B+1, C-1] zero range.
381 if (Count == 0 && ZeroRangeDepth == 0)
382 BeginAddress = UINT64_MAX;
383 }
384 }
385}
386
387void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
388 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
389 uint64_t Count) {
390 // Use the maximum count of samples with same line location
391 uint32_t Discriminator = getBaseDiscriminator(Discriminator: LeafLoc.Location.Discriminator);
392
393 if (LeadingIPOnly) {
394 // When computing an IP-based profile we take the SUM of counts at the
395 // location instead of applying duplication factors and taking the MAX.
396 FunctionProfile.addBodySamples(LineOffset: LeafLoc.Location.LineOffset, Discriminator,
397 Num: Count);
398 } else {
399 // Otherwise, use duplication factor to compensate for loop
400 // unroll/vectorization. Note that this is only needed when we're taking
401 // MAX of the counts at the location instead of SUM.
402 Count *= getDuplicationFactor(Discriminator: LeafLoc.Location.Discriminator);
403
404 ErrorOr<uint64_t> R = FunctionProfile.findSamplesAt(
405 LineOffset: LeafLoc.Location.LineOffset, Discriminator);
406
407 uint64_t PreviousCount = R ? R.get() : 0;
408 if (PreviousCount <= Count) {
409 FunctionProfile.addBodySamples(LineOffset: LeafLoc.Location.LineOffset, Discriminator,
410 Num: Count - PreviousCount);
411 }
412 }
413}
414
415void ProfileGeneratorBase::updateTotalSamples() {
416 for (auto &Item : ProfileMap) {
417 FunctionSamples &FunctionProfile = Item.second;
418 FunctionProfile.updateTotalSamples();
419 }
420}
421
422void ProfileGeneratorBase::updateCallsiteSamples() {
423 for (auto &Item : ProfileMap) {
424 FunctionSamples &FunctionProfile = Item.second;
425 FunctionProfile.updateCallsiteSamples();
426 }
427}
428
429void ProfileGeneratorBase::updateFunctionSamples() {
430 updateCallsiteSamples();
431
432 if (UpdateTotalSamples)
433 updateTotalSamples();
434}
435
436void ProfileGeneratorBase::collectProfiledFunctions() {
437 std::unordered_set<const BinaryFunction *> ProfiledFunctions;
438 if (collectFunctionsFromRawProfile(ProfiledFunctions))
439 Binary->setProfiledFunctions(ProfiledFunctions);
440 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
441 Binary->setProfiledFunctions(ProfiledFunctions);
442 else
443 llvm_unreachable("Unsupported input profile");
444}
445
446bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
447 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
448 if (!SampleCounters)
449 return false;
450 // Go through all the stacks, ranges and branches in sample counters, use
451 // the start of the range to look up the function it belongs and record the
452 // function.
453 for (const auto &CI : *SampleCounters) {
454 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr())) {
455 for (auto StackAddr : CtxKey->Context) {
456 if (FuncRange *FRange = Binary->findFuncRange(Address: StackAddr))
457 ProfiledFunctions.insert(x: FRange->Func);
458 }
459 }
460
461 for (auto Item : CI.second.RangeCounter) {
462 uint64_t StartAddress = Item.first.first;
463 if (FuncRange *FRange = Binary->findFuncRange(Address: StartAddress))
464 ProfiledFunctions.insert(x: FRange->Func);
465 }
466
467 for (auto Item : CI.second.BranchCounter) {
468 uint64_t SourceAddress = Item.first.first;
469 uint64_t TargetAddress = Item.first.second;
470 if (FuncRange *FRange = Binary->findFuncRange(Address: SourceAddress))
471 ProfiledFunctions.insert(x: FRange->Func);
472 if (FuncRange *FRange = Binary->findFuncRange(Address: TargetAddress))
473 ProfiledFunctions.insert(x: FRange->Func);
474 }
475 }
476 return true;
477}
478
479bool ProfileGenerator::collectFunctionsFromLLVMProfile(
480 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
481 for (const auto &FS : ProfileMap) {
482 if (auto *Func = Binary->getBinaryFunction(FName: FS.second.getFunction()))
483 ProfiledFunctions.insert(x: Func);
484 }
485 return true;
486}
487
488bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
489 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
490 for (auto *Node : ContextTracker) {
491 if (!Node->getFuncName().empty())
492 if (auto *Func = Binary->getBinaryFunction(FName: Node->getFuncName()))
493 ProfiledFunctions.insert(x: Func);
494 }
495 return true;
496}
497
498FunctionSamples &
499ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) {
500 SampleContext Context(FuncName);
501 return ProfileMap.create(Ctx: Context);
502}
503
504void ProfileGenerator::generateProfile() {
505 collectProfiledFunctions();
506
507 if (Binary->usePseudoProbes())
508 Binary->decodePseudoProbe();
509
510 if (SampleCounters) {
511 if (Binary->usePseudoProbes()) {
512 generateProbeBasedProfile();
513 } else {
514 generateLineNumBasedProfile();
515 }
516 }
517
518 postProcessProfiles();
519}
520
521void ProfileGenerator::postProcessProfiles() {
522 computeSummaryAndThreshold(ProfileMap);
523 trimColdProfiles(Profiles: ProfileMap, ColdCntThreshold: ColdCountThreshold);
524 filterAmbiguousProfile(Profiles&: ProfileMap);
525 calculateAndShowDensity(Profiles: ProfileMap);
526}
527
528void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
529 uint64_t ColdCntThreshold) {
530 if (!TrimColdProfile)
531 return;
532
533 // Move cold profiles into a tmp container.
534 std::vector<hash_code> ColdProfileHashes;
535 for (const auto &I : ProfileMap) {
536 if (I.second.getTotalSamples() < ColdCntThreshold)
537 ColdProfileHashes.emplace_back(args: I.first);
538 }
539
540 // Remove the cold profile from ProfileMap.
541 for (const auto &I : ColdProfileHashes)
542 ProfileMap.erase(Key: I);
543}
544
545void ProfileGenerator::generateLineNumBasedProfile() {
546 assert(SampleCounters->size() == 1 &&
547 "Must have one entry for profile generation.");
548 const SampleCounter &SC = SampleCounters->begin()->second;
549 // Fill in function body samples
550 populateBodySamplesForAllFunctions(RangeCounter: SC.RangeCounter);
551 // Fill in boundary sample counts as well as call site samples for calls
552 populateBoundarySamplesForAllFunctions(BranchCounters: SC.BranchCounter);
553
554 updateFunctionSamples();
555}
556
557void ProfileGenerator::generateProbeBasedProfile() {
558 assert(SampleCounters->size() == 1 &&
559 "Must have one entry for profile generation.");
560 // Enable pseudo probe functionalities in SampleProf
561 FunctionSamples::ProfileIsProbeBased = true;
562 const SampleCounter &SC = SampleCounters->begin()->second;
563 // Fill in function body samples
564 populateBodySamplesWithProbesForAllFunctions(RangeCounter: SC.RangeCounter);
565 // Fill in boundary sample counts as well as call site samples for calls
566 populateBoundarySamplesWithProbesForAllFunctions(BranchCounters: SC.BranchCounter);
567
568 updateFunctionSamples();
569}
570
571void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
572 const RangeSample &RangeCounter) {
573 ProbeCounterMap ProbeCounter;
574 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
575 // inside extractProbesFromRange.
576 extractProbesFromRange(RangeCounter: preprocessRangeCounter(RangeCounter), ProbeCounter,
577 FindDisjointRanges: false);
578
579 for (const auto &PI : ProbeCounter) {
580 const MCDecodedPseudoProbe *Probe = PI.first;
581 uint64_t Count = PI.second;
582 SampleContextFrameVector FrameVec;
583 Binary->getInlineContextForProbe(Probe, InlineContextStack&: FrameVec, IncludeLeaf: true);
584 FunctionSamples &FunctionProfile =
585 getLeafProfileAndAddTotalSamples(FrameVec, Count);
586 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
587 Num: Count);
588 if (Probe->isEntry())
589 FunctionProfile.addHeadSamples(Num: Count);
590 }
591}
592
593void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
594 const BranchSample &BranchCounters) {
595 for (const auto &Entry : BranchCounters) {
596 uint64_t SourceAddress = Entry.first.first;
597 uint64_t TargetAddress = Entry.first.second;
598 uint64_t Count = Entry.second;
599 assert(Count != 0 && "Unexpected zero weight branch");
600
601 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
602 if (CalleeName.size() == 0)
603 continue;
604
605 const MCDecodedPseudoProbe *CallProbe =
606 Binary->getCallProbeForAddr(Address: SourceAddress);
607 if (CallProbe == nullptr)
608 continue;
609
610 // Record called target sample and its count.
611 SampleContextFrameVector FrameVec;
612 Binary->getInlineContextForProbe(Probe: CallProbe, InlineContextStack&: FrameVec, IncludeLeaf: true);
613
614 if (!FrameVec.empty()) {
615 FunctionSamples &FunctionProfile =
616 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
617 FunctionProfile.addCalledTargetSamples(
618 LineOffset: FrameVec.back().Location.LineOffset,
619 Discriminator: FrameVec.back().Location.Discriminator,
620 Func: FunctionId(CalleeName), Num: Count);
621 }
622 }
623}
624
625FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
626 const SampleContextFrameVector &FrameVec, uint64_t Count) {
627 // Get top level profile
628 FunctionSamples *FunctionProfile =
629 &getTopLevelFunctionProfile(FuncName: FrameVec[0].Func);
630 FunctionProfile->addTotalSamples(Num: Count);
631 if (Binary->usePseudoProbes()) {
632 const auto *FuncDesc = Binary->getFuncDescForGUID(
633 GUID: FunctionProfile->getFunction().getHashCode());
634 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
635 }
636
637 for (size_t I = 1; I < FrameVec.size(); I++) {
638 LineLocation Callsite(
639 FrameVec[I - 1].Location.LineOffset,
640 getBaseDiscriminator(Discriminator: FrameVec[I - 1].Location.Discriminator));
641 FunctionSamplesMap &SamplesMap =
642 FunctionProfile->functionSamplesAt(Loc: Callsite);
643 auto Ret =
644 SamplesMap.emplace(args: FrameVec[I].Func, args: FunctionSamples());
645 if (Ret.second) {
646 SampleContext Context(FrameVec[I].Func);
647 Ret.first->second.setContext(Context);
648 }
649 FunctionProfile = &Ret.first->second;
650 FunctionProfile->addTotalSamples(Num: Count);
651 if (Binary->usePseudoProbes()) {
652 const auto *FuncDesc = Binary->getFuncDescForGUID(
653 GUID: FunctionProfile->getFunction().getHashCode());
654 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
655 }
656 }
657
658 return *FunctionProfile;
659}
660
661RangeSample
662ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
663 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
664 if (FillZeroForAllFuncs) {
665 for (auto &FuncI : Binary->getAllBinaryFunctions()) {
666 for (auto &R : FuncI.second.Ranges) {
667 Ranges[{R.first, R.second - 1}] += 0;
668 }
669 }
670 } else {
671 // For each range, we search for all ranges of the function it belongs to
672 // and initialize it with zero count, so it remains zero if doesn't hit any
673 // samples. This is to be consistent with compiler that interpret zero count
674 // as unexecuted(cold).
675 for (const auto &I : RangeCounter) {
676 uint64_t StartAddress = I.first.first;
677 for (const auto &Range : Binary->getRanges(Address: StartAddress))
678 Ranges[{Range.first, Range.second - 1}] += 0;
679 }
680 }
681 RangeSample DisjointRanges;
682 findDisjointRanges(DisjointRanges, Ranges);
683 return DisjointRanges;
684}
685
686void ProfileGenerator::populateBodySamplesForAllFunctions(
687 const RangeSample &RangeCounter) {
688 for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
689 uint64_t RangeBegin = Range.first.first;
690 uint64_t RangeEnd = Range.first.second;
691 uint64_t Count = Range.second;
692
693 InstructionPointer IP(Binary, RangeBegin, true);
694 // Disjoint ranges may have range in the middle of two instr,
695 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
696 // can be Addr1+1 to Addr2-1. We should ignore such range.
697 if (IP.Address > RangeEnd)
698 continue;
699
700 do {
701 const SampleContextFrameVector FrameVec =
702 Binary->getFrameLocationStack(Address: IP.Address);
703 if (!FrameVec.empty()) {
704 // FIXME: As accumulating total count per instruction caused some
705 // regression, we changed to accumulate total count per byte as a
706 // workaround. Tuning hotness threshold on the compiler side might be
707 // necessary in the future.
708 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
709 FrameVec, Count: Count * Binary->getInstSize(Address: IP.Address));
710 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: FrameVec.back(),
711 Count);
712 }
713 } while (IP.advance() && IP.Address <= RangeEnd);
714 }
715}
716
717StringRef
718ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
719 // Get the function range by branch target if it's a call branch.
720 auto *FRange = Binary->findFuncRangeForStartAddr(Address: TargetAddress);
721
722 // We won't accumulate sample count for a range whose start is not the real
723 // function entry such as outlined function or inner labels.
724 if (!FRange || !FRange->IsFuncEntry)
725 return StringRef();
726
727 return FunctionSamples::getCanonicalFnName(FnName: FRange->getFuncName());
728}
729
730void ProfileGenerator::populateBoundarySamplesForAllFunctions(
731 const BranchSample &BranchCounters) {
732 for (const auto &Entry : BranchCounters) {
733 uint64_t SourceAddress = Entry.first.first;
734 uint64_t TargetAddress = Entry.first.second;
735 uint64_t Count = Entry.second;
736 assert(Count != 0 && "Unexpected zero weight branch");
737
738 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
739 if (CalleeName.size() == 0)
740 continue;
741 // Record called target sample and its count.
742 const SampleContextFrameVector &FrameVec =
743 Binary->getCachedFrameLocationStack(Address: SourceAddress);
744 if (!FrameVec.empty()) {
745 FunctionSamples &FunctionProfile =
746 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
747 FunctionProfile.addCalledTargetSamples(
748 LineOffset: FrameVec.back().Location.LineOffset,
749 Discriminator: getBaseDiscriminator(Discriminator: FrameVec.back().Location.Discriminator),
750 Func: FunctionId(CalleeName), Num: Count);
751 }
752 // Add head samples for callee.
753 FunctionSamples &CalleeProfile =
754 getTopLevelFunctionProfile(FuncName: FunctionId(CalleeName));
755 CalleeProfile.addHeadSamples(Num: Count);
756 }
757}
758
759void ProfileGeneratorBase::calculateBodySamplesAndSize(
760 const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
761 uint64_t &FuncBodySize) {
762 // Note that ideally the size should be the number of function instruction.
763 // However, for probe-based profile, we don't have the accurate instruction
764 // count for each probe, instead, the probe sample is the samples count for
765 // the block, which is equivelant to
766 // total_instruction_samples/num_of_instruction in one block. Hence, we use
767 // the number of probe as a proxy for the function's size.
768 FuncBodySize += FSamples.getBodySamples().size();
769
770 // The accumulated body samples re-calculated here could be different from the
771 // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
772 // based profile. The reason is that TotalSamples is the sum of all the
773 // samples of the machine instruction in one source-code line, however, the
774 // entry of Bodysamples is the only max number of them, so the TotalSamples is
775 // usually much bigger than the accumulated body samples as one souce-code
776 // line can emit many machine instructions. We observed a regression when we
777 // switched to use the accumulated body samples(by using
778 // -update-total-samples). Hence, it's safer to re-calculate here to avoid
779 // such discrepancy. There is no problem for probe-based profile, as the
780 // TotalSamples is exactly the same as the accumulated body samples.
781 for (const auto &I : FSamples.getBodySamples())
782 TotalBodySamples += I.second.getSamples();
783
784 for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
785 for (const auto &Callee : CallsiteSamples.second) {
786 // For binary-level density, the inlinees' samples and size should be
787 // included in the calculation.
788 calculateBodySamplesAndSize(FSamples: Callee.second, TotalBodySamples,
789 FuncBodySize);
790 }
791}
792
793// Calculate Profile-density:
794// Calculate the density for each function and sort them in descending order,
795// keep accumulating their total samples unitl it exceeds the
796// percentage_threshold(cut-off) of total profile samples, the profile-density
797// is the last(minimum) function-density of the processed functions, which means
798// all the functions hot to perf are on good density if the profile-density is
799// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
800// depending on how much regression the system want to tolerate.
801double
802ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
803 double ProfileDensity = 0.0;
804
805 uint64_t TotalProfileSamples = 0;
806 // A list of the function profile density and its total samples.
807 std::vector<std::pair<double, uint64_t>> FuncDensityList;
808 for (const auto &I : Profiles) {
809 uint64_t TotalBodySamples = 0;
810 uint64_t FuncBodySize = 0;
811 calculateBodySamplesAndSize(FSamples: I.second, TotalBodySamples, FuncBodySize);
812
813 if (FuncBodySize == 0)
814 continue;
815
816 double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
817 TotalProfileSamples += TotalBodySamples;
818 FuncDensityList.emplace_back(args&: FuncDensity, args&: TotalBodySamples);
819 }
820
821 // Sorted by the density in descending order.
822 llvm::stable_sort(Range&: FuncDensityList, C: [&](const std::pair<double, uint64_t> &A,
823 const std::pair<double, uint64_t> &B) {
824 if (A.first != B.first)
825 return A.first > B.first;
826 return A.second < B.second;
827 });
828
829 uint64_t AccumulatedSamples = 0;
830 uint32_t I = 0;
831 assert(ProfileDensityCutOffHot <= 1000000 &&
832 "The cutoff value is greater than 1000000(100%)");
833 while (AccumulatedSamples < TotalProfileSamples *
834 static_cast<float>(ProfileDensityCutOffHot) /
835 1000000 &&
836 I < FuncDensityList.size()) {
837 AccumulatedSamples += FuncDensityList[I].second;
838 ProfileDensity = FuncDensityList[I].first;
839 I++;
840 }
841
842 return ProfileDensity;
843}
844
845void ProfileGeneratorBase::calculateAndShowDensity(
846 const SampleProfileMap &Profiles) {
847 double Density = calculateDensity(Profiles);
848 showDensitySuggestion(Density);
849}
850
851FunctionSamples *
852CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
853 bool WasLeafInlined) {
854 FunctionSamples *FProfile = ContextNode->getFunctionSamples();
855 if (!FProfile) {
856 FSamplesList.emplace_back();
857 FProfile = &FSamplesList.back();
858 FProfile->setFunction(ContextNode->getFuncName());
859 ContextNode->setFunctionSamples(FProfile);
860 }
861 // Update ContextWasInlined attribute for existing contexts.
862 // The current function can be called in two ways:
863 // - when processing a probe of the current frame
864 // - when processing the entry probe of an inlinee's frame, which
865 // is then used to update the callsite count of the current frame.
866 // The two can happen in any order, hence here we are making sure
867 // `ContextWasInlined` is always set as expected.
868 // TODO: Note that the former does not always happen if no probes of the
869 // current frame has samples, and if the latter happens, we could lose the
870 // attribute. This should be fixed.
871 if (WasLeafInlined)
872 FProfile->getContext().setAttribute(ContextWasInlined);
873 return FProfile;
874}
875
876ContextTrieNode *
877CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context,
878 bool WasLeafInlined) {
879 ContextTrieNode *ContextNode =
880 ContextTracker.getOrCreateContextPath(Context, AllowCreate: true);
881 getOrCreateFunctionSamples(ContextNode, WasLeafInlined);
882 return ContextNode;
883}
884
885void CSProfileGenerator::generateProfile() {
886 FunctionSamples::ProfileIsCS = true;
887
888 collectProfiledFunctions();
889
890 if (Binary->usePseudoProbes()) {
891 Binary->decodePseudoProbe();
892 if (InferMissingFrames)
893 initializeMissingFrameInferrer();
894 }
895
896 if (SampleCounters) {
897 if (Binary->usePseudoProbes()) {
898 generateProbeBasedProfile();
899 } else {
900 generateLineNumBasedProfile();
901 }
902 }
903
904 if (Binary->getTrackFuncContextSize())
905 computeSizeForProfiledFunctions();
906
907 postProcessProfiles();
908}
909
910void CSProfileGenerator::initializeMissingFrameInferrer() {
911 Binary->getMissingContextInferrer()->initialize(SampleCounters);
912}
913
914void CSProfileGenerator::inferMissingFrames(
915 const SmallVectorImpl<uint64_t> &Context,
916 SmallVectorImpl<uint64_t> &NewContext) {
917 Binary->inferMissingFrames(Context, NewContext);
918}
919
920void CSProfileGenerator::computeSizeForProfiledFunctions() {
921 for (auto *Func : Binary->getProfiledFunctions())
922 Binary->computeInlinedContextSizeForFunc(Func);
923
924 // Flush the symbolizer to save memory.
925 Binary->flushSymbolizer();
926}
927
928void CSProfileGenerator::updateFunctionSamples() {
929 for (auto *Node : ContextTracker) {
930 FunctionSamples *FSamples = Node->getFunctionSamples();
931 if (FSamples) {
932 if (UpdateTotalSamples)
933 FSamples->updateTotalSamples();
934 FSamples->updateCallsiteSamples();
935 }
936 }
937}
938
939void CSProfileGenerator::generateLineNumBasedProfile() {
940 for (const auto &CI : *SampleCounters) {
941 const auto *CtxKey = cast<StringBasedCtxKey>(Val: CI.first.getPtr());
942
943 ContextTrieNode *ContextNode = &getRootContext();
944 // Sample context will be empty if the jump is an external-to-internal call
945 // pattern, the head samples should be added for the internal function.
946 if (!CtxKey->Context.empty()) {
947 // Get or create function profile for the range
948 ContextNode =
949 getOrCreateContextNode(Context: CtxKey->Context, WasLeafInlined: CtxKey->WasLeafInlined);
950 // Fill in function body samples
951 populateBodySamplesForFunction(FunctionProfile&: *ContextNode->getFunctionSamples(),
952 RangeCounters: CI.second.RangeCounter);
953 }
954 // Fill in boundary sample counts as well as call site samples for calls
955 populateBoundarySamplesForFunction(CallerNode: ContextNode, BranchCounters: CI.second.BranchCounter);
956 }
957 // Fill in call site value sample for inlined calls and also use context to
958 // infer missing samples. Since we don't have call count for inlined
959 // functions, we estimate it from inlinee's profile using the entry of the
960 // body sample.
961 populateInferredFunctionSamples(Node&: getRootContext());
962
963 updateFunctionSamples();
964}
965
966void CSProfileGenerator::populateBodySamplesForFunction(
967 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
968 // Compute disjoint ranges first, so we can use MAX
969 // for calculating count for each location.
970 RangeSample Ranges;
971 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
972 for (const auto &Range : Ranges) {
973 uint64_t RangeBegin = Range.first.first;
974 uint64_t RangeEnd = Range.first.second;
975 uint64_t Count = Range.second;
976 // Disjoint ranges have introduce zero-filled gap that
977 // doesn't belong to current context, filter them out.
978 if (Count == 0)
979 continue;
980
981 InstructionPointer IP(Binary, RangeBegin, true);
982 // Disjoint ranges may have range in the middle of two instr,
983 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
984 // can be Addr1+1 to Addr2-1. We should ignore such range.
985 if (IP.Address > RangeEnd)
986 continue;
987
988 do {
989 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: IP.Address);
990 if (LeafLoc) {
991 // Recording body sample for this specific context
992 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: *LeafLoc, Count);
993 FunctionProfile.addTotalSamples(Num: Count);
994 }
995 } while (IP.advance() && IP.Address <= RangeEnd);
996 }
997}
998
999void CSProfileGenerator::populateBoundarySamplesForFunction(
1000 ContextTrieNode *Node, const BranchSample &BranchCounters) {
1001
1002 for (const auto &Entry : BranchCounters) {
1003 uint64_t SourceAddress = Entry.first.first;
1004 uint64_t TargetAddress = Entry.first.second;
1005 uint64_t Count = Entry.second;
1006 assert(Count != 0 && "Unexpected zero weight branch");
1007
1008 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1009 if (CalleeName.size() == 0)
1010 continue;
1011
1012 ContextTrieNode *CallerNode = Node;
1013 LineLocation CalleeCallSite(0, 0);
1014 if (CallerNode != &getRootContext()) {
1015 // Record called target sample and its count
1016 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: SourceAddress);
1017 if (LeafLoc) {
1018 CallerNode->getFunctionSamples()->addCalledTargetSamples(
1019 LineOffset: LeafLoc->Location.LineOffset,
1020 Discriminator: getBaseDiscriminator(Discriminator: LeafLoc->Location.Discriminator),
1021 Func: FunctionId(CalleeName),
1022 Num: Count);
1023 // Record head sample for called target(callee)
1024 CalleeCallSite = LeafLoc->Location;
1025 }
1026 }
1027
1028 ContextTrieNode *CalleeNode =
1029 CallerNode->getOrCreateChildContext(CallSite: CalleeCallSite,
1030 ChildName: FunctionId(CalleeName));
1031 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(ContextNode: CalleeNode);
1032 CalleeProfile->addHeadSamples(Num: Count);
1033 }
1034}
1035
1036void CSProfileGenerator::populateInferredFunctionSamples(
1037 ContextTrieNode &Node) {
1038 // There is no call jmp sample between the inliner and inlinee, we need to use
1039 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s
1040 // sample depends on child(inlinee)'s sample, so traverse the tree in
1041 // post-order.
1042 for (auto &It : Node.getAllChildContext())
1043 populateInferredFunctionSamples(Node&: It.second);
1044
1045 FunctionSamples *CalleeProfile = Node.getFunctionSamples();
1046 if (!CalleeProfile)
1047 return;
1048 // If we already have head sample counts, we must have value profile
1049 // for call sites added already. Skip to avoid double counting.
1050 if (CalleeProfile->getHeadSamples())
1051 return;
1052 ContextTrieNode *CallerNode = Node.getParentContext();
1053 // If we don't have context, nothing to do for caller's call site.
1054 // This could happen for entry point function.
1055 if (CallerNode == &getRootContext())
1056 return;
1057
1058 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc();
1059 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(ContextNode: CallerNode);
1060 // Since we don't have call count for inlined functions, we
1061 // estimate it from inlinee's profile using entry body sample.
1062 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate();
1063 // If we don't have samples with location, use 1 to indicate live.
1064 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size())
1065 EstimatedCallCount = 1;
1066 CallerProfile.addCalledTargetSamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1067 Discriminator: CallerLeafFrameLoc.Discriminator,
1068 Func: Node.getFuncName(), Num: EstimatedCallCount);
1069 CallerProfile.addBodySamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1070 Discriminator: CallerLeafFrameLoc.Discriminator,
1071 Num: EstimatedCallCount);
1072 CallerProfile.addTotalSamples(Num: EstimatedCallCount);
1073}
1074
1075void CSProfileGenerator::convertToProfileMap(
1076 ContextTrieNode &Node, SampleContextFrameVector &Context) {
1077 FunctionSamples *FProfile = Node.getFunctionSamples();
1078 if (FProfile) {
1079 Context.emplace_back(Args: Node.getFuncName(), Args: LineLocation(0, 0));
1080 // Save the new context for future references.
1081 SampleContextFrames NewContext = *Contexts.insert(x: Context).first;
1082 auto Ret = ProfileMap.emplace(Args&: NewContext, Args: std::move(*FProfile));
1083 FunctionSamples &NewProfile = Ret.first->second;
1084 NewProfile.getContext().setContext(Context: NewContext);
1085 Context.pop_back();
1086 }
1087
1088 for (auto &It : Node.getAllChildContext()) {
1089 ContextTrieNode &ChildNode = It.second;
1090 Context.emplace_back(Args: Node.getFuncName(), Args: ChildNode.getCallSiteLoc());
1091 convertToProfileMap(Node&: ChildNode, Context);
1092 Context.pop_back();
1093 }
1094}
1095
1096void CSProfileGenerator::convertToProfileMap() {
1097 assert(ProfileMap.empty() &&
1098 "ProfileMap should be empty before converting from the trie");
1099 assert(IsProfileValidOnTrie &&
1100 "Do not convert the trie twice, it's already destroyed");
1101
1102 SampleContextFrameVector Context;
1103 for (auto &It : getRootContext().getAllChildContext())
1104 convertToProfileMap(Node&: It.second, Context);
1105
1106 IsProfileValidOnTrie = false;
1107}
1108
1109void CSProfileGenerator::postProcessProfiles() {
1110 // Compute hot/cold threshold based on profile. This will be used for cold
1111 // context profile merging/trimming.
1112 computeSummaryAndThreshold();
1113
1114 // Run global pre-inliner to adjust/merge context profile based on estimated
1115 // inline decisions.
1116 if (EnableCSPreInliner) {
1117 ContextTracker.populateFuncToCtxtMap();
1118 CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
1119 // Turn off the profile merger by default unless it is explicitly enabled.
1120 if (!CSProfMergeColdContext.getNumOccurrences())
1121 CSProfMergeColdContext = false;
1122 }
1123
1124 convertToProfileMap();
1125
1126 // Trim and merge cold context profile using cold threshold above.
1127 if (TrimColdProfile || CSProfMergeColdContext) {
1128 SampleContextTrimmer(ProfileMap)
1129 .trimAndMergeColdContextProfiles(
1130 ColdCountThreshold: HotCountThreshold, TrimColdContext: TrimColdProfile, MergeColdContext: CSProfMergeColdContext,
1131 ColdContextFrameLength: CSProfMaxColdContextDepth, TrimBaseProfileOnly: EnableCSPreInliner);
1132 }
1133
1134 if (GenCSNestedProfile) {
1135 ProfileConverter CSConverter(ProfileMap);
1136 CSConverter.convertCSProfiles();
1137 FunctionSamples::ProfileIsCS = false;
1138 }
1139 filterAmbiguousProfile(Profiles&: ProfileMap);
1140 ProfileGeneratorBase::calculateAndShowDensity(Profiles: ProfileMap);
1141}
1142
1143void ProfileGeneratorBase::computeSummaryAndThreshold(
1144 SampleProfileMap &Profiles) {
1145 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1146 Summary = Builder.computeSummaryForProfiles(Profiles);
1147 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
1148 DS: (Summary->getDetailedSummary()));
1149 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1150 DS: (Summary->getDetailedSummary()));
1151}
1152
1153void CSProfileGenerator::computeSummaryAndThreshold() {
1154 // Always merge and use context-less profile map to compute summary.
1155 SampleProfileMap ContextLessProfiles;
1156 ContextTracker.createContextLessProfileMap(ContextLessProfiles);
1157
1158 // Set the flag below to avoid merging the profile again in
1159 // computeSummaryAndThreshold
1160 FunctionSamples::ProfileIsCS = false;
1161 assert(
1162 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
1163 "Don't set --profile-summary-contextless to false for profile "
1164 "generation");
1165 ProfileGeneratorBase::computeSummaryAndThreshold(Profiles&: ContextLessProfiles);
1166 // Recover the old value.
1167 FunctionSamples::ProfileIsCS = true;
1168}
1169
1170void ProfileGeneratorBase::extractProbesFromRange(
1171 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
1172 bool FindDisjointRanges) {
1173 const RangeSample *PRanges = &RangeCounter;
1174 RangeSample Ranges;
1175 if (FindDisjointRanges) {
1176 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
1177 PRanges = &Ranges;
1178 }
1179
1180 for (const auto &Range : *PRanges) {
1181 uint64_t RangeBegin = Range.first.first;
1182 uint64_t RangeEnd = Range.first.second;
1183 uint64_t Count = Range.second;
1184
1185 InstructionPointer IP(Binary, RangeBegin, true);
1186 // Disjoint ranges may have range in the middle of two instr,
1187 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1188 // can be Addr1+1 to Addr2-1. We should ignore such range.
1189 if (IP.Address > RangeEnd)
1190 continue;
1191
1192 do {
1193 const AddressProbesMap &Address2ProbesMap =
1194 Binary->getAddress2ProbesMap();
1195 auto It = Address2ProbesMap.find(x: IP.Address);
1196 if (It != Address2ProbesMap.end()) {
1197 for (const auto &Probe : It->second) {
1198 ProbeCounter[&Probe] += Count;
1199 }
1200 }
1201 } while (IP.advance() && IP.Address <= RangeEnd);
1202 }
1203}
1204
1205static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1206 const SmallVectorImpl<uint64_t> &AddrVec,
1207 ProfiledBinary *Binary) {
1208 SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
1209 for (auto Address : reverse(C: AddrVec)) {
1210 const MCDecodedPseudoProbe *CallProbe =
1211 Binary->getCallProbeForAddr(Address);
1212 // These could be the cases when a probe is not found at a calliste. Cutting
1213 // off the context from here since the inliner will not know how to consume
1214 // a context with unknown callsites.
1215 // 1. for functions that are not sampled when
1216 // --decode-probe-for-profiled-functions-only is on.
1217 // 2. for a merged callsite. Callsite merging may cause the loss of original
1218 // probe IDs.
1219 // 3. for an external callsite.
1220 if (!CallProbe)
1221 break;
1222 Probes.push_back(Elt: CallProbe);
1223 }
1224
1225 std::reverse(first: Probes.begin(), last: Probes.end());
1226
1227 // Extract context stack for reusing, leaf context stack will be added
1228 // compressed while looking up function profile.
1229 for (const auto *P : Probes) {
1230 Binary->getInlineContextForProbe(Probe: P, InlineContextStack&: ContextStack, IncludeLeaf: true);
1231 }
1232}
1233
1234void CSProfileGenerator::generateProbeBasedProfile() {
1235 // Enable pseudo probe functionalities in SampleProf
1236 FunctionSamples::ProfileIsProbeBased = true;
1237 for (const auto &CI : *SampleCounters) {
1238 const AddrBasedCtxKey *CtxKey =
1239 dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr());
1240 // Fill in function body samples from probes, also infer caller's samples
1241 // from callee's probe
1242 populateBodySamplesWithProbes(RangeCounter: CI.second.RangeCounter, CtxKey);
1243 // Fill in boundary samples for a call probe
1244 populateBoundarySamplesWithProbes(BranchCounter: CI.second.BranchCounter, CtxKey);
1245 }
1246}
1247
1248void CSProfileGenerator::populateBodySamplesWithProbes(
1249 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
1250 ProbeCounterMap ProbeCounter;
1251 // Extract the top frame probes by looking up each address among the range in
1252 // the Address2ProbeMap
1253 extractProbesFromRange(RangeCounter, ProbeCounter);
1254 std::unordered_map<MCDecodedPseudoProbeInlineTree *,
1255 std::unordered_set<FunctionSamples *>>
1256 FrameSamples;
1257 for (const auto &PI : ProbeCounter) {
1258 const MCDecodedPseudoProbe *Probe = PI.first;
1259 uint64_t Count = PI.second;
1260 // Disjoint ranges have introduce zero-filled gap that
1261 // doesn't belong to current context, filter them out.
1262 if (!Probe->isBlock() || Count == 0)
1263 continue;
1264
1265 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, LeafProbe: Probe);
1266 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1267 // Record the current frame and FunctionProfile whenever samples are
1268 // collected for non-danglie probes. This is for reporting all of the
1269 // zero count probes of the frame later.
1270 FrameSamples[Probe->getInlineTreeNode()].insert(x: &FunctionProfile);
1271 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
1272 Num: Count);
1273 FunctionProfile.addTotalSamples(Num: Count);
1274 if (Probe->isEntry()) {
1275 FunctionProfile.addHeadSamples(Num: Count);
1276 // Look up for the caller's function profile
1277 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1278 ContextTrieNode *CallerNode = ContextNode->getParentContext();
1279 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
1280 // Since the context id will be compressed, we have to use callee's
1281 // context id to infer caller's context id to ensure they share the
1282 // same context prefix.
1283 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
1284 uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator;
1285 assert(CallerIndex &&
1286 "Inferred caller's location index shouldn't be zero!");
1287 assert(!CallerDiscriminator &&
1288 "Callsite probe should not have a discriminator!");
1289 FunctionSamples &CallerProfile =
1290 *getOrCreateFunctionSamples(ContextNode: CallerNode);
1291 CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1292 CallerProfile.addBodySamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator, Num: Count);
1293 CallerProfile.addTotalSamples(Num: Count);
1294 CallerProfile.addCalledTargetSamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator,
1295 Func: ContextNode->getFuncName(), Num: Count);
1296 }
1297 }
1298 }
1299
1300 // Assign zero count for remaining probes without sample hits to
1301 // differentiate from probes optimized away, of which the counts are unknown
1302 // and will be inferred by the compiler.
1303 for (auto &I : FrameSamples) {
1304 for (auto *FunctionProfile : I.second) {
1305 for (auto *Probe : I.first->getProbes()) {
1306 FunctionProfile->addBodySamples(LineOffset: Probe->getIndex(),
1307 Discriminator: Probe->getDiscriminator(), Num: 0);
1308 }
1309 }
1310 }
1311}
1312
1313void CSProfileGenerator::populateBoundarySamplesWithProbes(
1314 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) {
1315 for (const auto &BI : BranchCounter) {
1316 uint64_t SourceAddress = BI.first.first;
1317 uint64_t TargetAddress = BI.first.second;
1318 uint64_t Count = BI.second;
1319 const MCDecodedPseudoProbe *CallProbe =
1320 Binary->getCallProbeForAddr(Address: SourceAddress);
1321 if (CallProbe == nullptr)
1322 continue;
1323 FunctionSamples &FunctionProfile =
1324 getFunctionProfileForLeafProbe(CtxKey, LeafProbe: CallProbe);
1325 FunctionProfile.addBodySamples(LineOffset: CallProbe->getIndex(), Discriminator: 0, Num: Count);
1326 FunctionProfile.addTotalSamples(Num: Count);
1327 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1328 if (CalleeName.size() == 0)
1329 continue;
1330 FunctionProfile.addCalledTargetSamples(LineOffset: CallProbe->getIndex(),
1331 Discriminator: CallProbe->getDiscriminator(),
1332 Func: FunctionId(CalleeName), Num: Count);
1333 }
1334}
1335
1336ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe(
1337 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1338
1339 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context;
1340 SmallVector<uint64_t, 16> NewContext;
1341
1342 if (InferMissingFrames) {
1343 SmallVector<uint64_t, 16> Context = CtxKey->Context;
1344 // Append leaf frame for a complete inference.
1345 Context.push_back(Elt: LeafProbe->getAddress());
1346 inferMissingFrames(Context, NewContext);
1347 // Pop out the leaf probe that was pushed in above.
1348 NewContext.pop_back();
1349 PContext = &NewContext;
1350 }
1351
1352 SampleContextFrameVector ContextStack;
1353 extractPrefixContextStack(ContextStack, AddrVec: *PContext, Binary);
1354
1355 // Explicitly copy the context for appending the leaf context
1356 SampleContextFrameVector NewContextStack(ContextStack.begin(),
1357 ContextStack.end());
1358 Binary->getInlineContextForProbe(Probe: LeafProbe, InlineContextStack&: NewContextStack, IncludeLeaf: true);
1359 // For leaf inlined context with the top frame, we should strip off the top
1360 // frame's probe id, like:
1361 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1362 auto LeafFrame = NewContextStack.back();
1363 LeafFrame.Location = LineLocation(0, 0);
1364 NewContextStack.pop_back();
1365 // Compress the context string except for the leaf frame
1366 CSProfileGenerator::compressRecursionContext(Context&: NewContextStack);
1367 CSProfileGenerator::trimContext(S&: NewContextStack);
1368 NewContextStack.push_back(Elt: LeafFrame);
1369
1370 const auto *FuncDesc = Binary->getFuncDescForGUID(GUID: LeafProbe->getGuid());
1371 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1372 ContextTrieNode *ContextNode =
1373 getOrCreateContextNode(Context: NewContextStack, WasLeafInlined);
1374 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash);
1375 return ContextNode;
1376}
1377
1378FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1379 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1380 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples();
1381}
1382
1383} // end namespace sampleprof
1384} // end namespace llvm
1385