1//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8#include "ProfileGenerator.h"
9#include "ErrorHandling.h"
10#include "MissingFrameInferrer.h"
11#include "Options.h"
12#include "PerfReader.h"
13#include "ProfiledBinary.h"
14#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
15#include "llvm/ProfileData/ProfileCommon.h"
16#include <algorithm>
17#include <float.h>
18#include <unordered_set>
19#include <utility>
20
21namespace llvm {
22
23cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
24 cl::Required,
25 cl::desc("Output profile file"),
26 cl::cat(ProfGenCategory));
27static cl::alias OutputA("o", cl::desc("Alias for --output"),
28 cl::aliasopt(OutputFilename));
29
30static cl::opt<SampleProfileFormat> OutputFormat(
31 "format", cl::desc("Format of output profile"), cl::init(Val: SPF_Ext_Binary),
32 cl::values(clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
33 clEnumValN(SPF_Ext_Binary, "extbinary",
34 "Extensible binary encoding"),
35 clEnumValN(SPF_Text, "text", "Text encoding"),
36 clEnumValN(SPF_GCC, "gcc",
37 "GCC encoding (only meaningful for -sample)")),
38 cl::cat(ProfGenCategory));
39
40static cl::opt<bool> UseMD5(
41 "use-md5", cl::Hidden,
42 cl::desc("Use md5 to represent function names in the output profile (only "
43 "meaningful for -extbinary)"));
44
45static cl::opt<bool> PopulateProfileSymbolList(
46 "populate-profile-symbol-list", cl::init(Val: false), cl::Hidden,
47 cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
48
49static cl::opt<bool> FillZeroForAllFuncs(
50 "fill-zero-for-all-funcs", cl::init(Val: false), cl::Hidden,
51 cl::desc("Attribute all functions' range with zero count "
52 "even it's not hit by any samples."));
53
54static cl::opt<int32_t, true> RecursionCompression(
55 "compress-recursion",
56 cl::desc("Compressing recursion by deduplicating adjacent frame "
57 "sequences up to the specified size. -1 means no size limit."),
58 cl::Hidden,
59 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
60
61static cl::opt<bool>
62 TrimColdProfile("trim-cold-profile",
63 cl::desc("If the total count of the profile is smaller "
64 "than threshold, it will be trimmed."),
65 cl::cat(ProfGenCategory));
66
67static cl::opt<bool> MarkAllContextPreinlined(
68 "mark-all-context-preinlined",
69 cl::desc("Mark all function samples as preinlined(set "
70 "ContextShouldBeInlined attribute)."),
71 cl::init(Val: false));
72
73static cl::opt<bool> CSProfMergeColdContext(
74 "csprof-merge-cold-context", cl::init(Val: true),
75 cl::desc("If the total count of context profile is smaller than "
76 "the threshold, it will be merged into context-less base "
77 "profile."),
78 cl::cat(ProfGenCategory));
79
80static cl::opt<uint32_t> CSProfMaxColdContextDepth(
81 "csprof-max-cold-context-depth", cl::init(Val: 1),
82 cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
83 "context-less base profile"),
84 cl::cat(ProfGenCategory));
85
86static cl::opt<int, true> CSProfMaxContextDepth(
87 "csprof-max-context-depth",
88 cl::desc("Keep the last K contexts while merging profile. -1 means no "
89 "depth limit."),
90 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxContextDepth),
91 cl::cat(ProfGenCategory));
92
93static cl::opt<double> ProfileDensityThreshold(
94 "profile-density-threshold", cl::init(Val: 50),
95 cl::desc("If the profile density is below the given threshold, it "
96 "will be suggested to increase the sampling rate."),
97 cl::Optional, cl::cat(ProfGenCategory));
98static cl::opt<bool> ShowDensity("show-density", cl::init(Val: false),
99 cl::desc("show profile density details"),
100 cl::Optional, cl::cat(ProfGenCategory));
101static cl::opt<int> ProfileDensityCutOffHot(
102 "profile-density-cutoff-hot", cl::init(Val: 990000),
103 cl::desc("Total samples cutoff for functions used to calculate "
104 "profile density."),
105 cl::cat(ProfGenCategory));
106
107static cl::opt<bool> UpdateTotalSamples(
108 "update-total-samples", cl::init(Val: false),
109 cl::desc("Update total samples by accumulating all its body samples."),
110 cl::Optional, cl::cat(ProfGenCategory));
111
112static cl::opt<bool> GenCSNestedProfile(
113 "gen-cs-nested-profile", cl::Hidden, cl::init(Val: true),
114 cl::desc("Generate nested function profiles for CSSPGO"));
115
116cl::opt<bool> InferMissingFrames(
117 "infer-missing-frames", cl::init(Val: true),
118 cl::desc(
119 "Infer missing call frames due to compiler tail call elimination."),
120 cl::Optional, cl::cat(ProfGenCategory));
121
122namespace sampleprof {
123
124// Initialize the MaxCompressionSize to -1 which means no size limit
125int32_t CSProfileGenerator::MaxCompressionSize = -1;
126
127int CSProfileGenerator::MaxContextDepth = -1;
128
129bool ProfileGeneratorBase::UseFSDiscriminator = false;
130
131std::unique_ptr<ProfileGeneratorBase>
132ProfileGeneratorBase::create(ProfiledBinary *Binary,
133 const ContextSampleCounterMap *SampleCounters,
134 bool ProfileIsCS) {
135 std::unique_ptr<ProfileGeneratorBase> Generator;
136 if (ProfileIsCS) {
137 Generator.reset(p: new CSProfileGenerator(Binary, SampleCounters));
138 } else {
139 Generator.reset(p: new ProfileGenerator(Binary, SampleCounters));
140 }
141 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
142 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
143
144 return Generator;
145}
146
147std::unique_ptr<ProfileGeneratorBase>
148ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
149 bool ProfileIsCS) {
150 std::unique_ptr<ProfileGeneratorBase> Generator;
151 if (ProfileIsCS) {
152 Generator.reset(p: new CSProfileGenerator(Binary, Profiles));
153 } else {
154 Generator.reset(p: new ProfileGenerator(Binary, std::move(Profiles)));
155 }
156 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
157 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
158
159 return Generator;
160}
161
162void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
163 SampleProfileMap &ProfileMap) {
164 // Populate profile symbol list if extended binary format is used.
165 ProfileSymbolList SymbolList;
166
167 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
168 Binary->populateSymbolListFromDWARF(SymbolList);
169 Writer->setProfileSymbolList(&SymbolList);
170 }
171
172 if (std::error_code EC = Writer->write(ProfileMap))
173 exitWithError(EC: std::move(EC));
174}
175
176void ProfileGeneratorBase::write() {
177 auto WriterOrErr = SampleProfileWriter::create(Filename: OutputFilename, Format: OutputFormat);
178 if (std::error_code EC = WriterOrErr.getError())
179 exitWithError(EC, Whence: OutputFilename);
180
181 if (UseMD5) {
182 if (OutputFormat != SPF_Ext_Binary)
183 WithColor::warning() << "-use-md5 is ignored. Specify "
184 "--format=extbinary to enable it\n";
185 else
186 WriterOrErr.get()->setUseMD5();
187 }
188
189 write(Writer: std::move(WriterOrErr.get()), ProfileMap);
190}
191
192void ProfileGeneratorBase::showDensitySuggestion(double Density) {
193 if (Density == 0.0)
194 WithColor::warning() << "The output profile is empty or the "
195 "--profile-density-cutoff-hot option is "
196 "set too low. Please check your command.\n";
197 else if (Density < ProfileDensityThreshold)
198 WithColor::warning()
199 << "Sample PGO is estimated to optimize better with "
200 << format(Fmt: "%.1f", Vals: ProfileDensityThreshold / Density)
201 << "x more samples. Please consider increasing sampling rate or "
202 "profiling for longer duration to get more samples.\n";
203
204 if (ShowDensity)
205 outs() << "Functions with density >= " << format(Fmt: "%.1f", Vals: Density)
206 << " account for "
207 << format(Fmt: "%.2f",
208 Vals: static_cast<double>(ProfileDensityCutOffHot) / 10000)
209 << "% total sample counts.\n";
210}
211
212bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
213 for (const auto &Prefix : FuncPrefixsToFilter) {
214 if (FS.getFuncName().starts_with(Prefix))
215 return true;
216 }
217
218 // Filter the function profiles for the inlinees. It's useful for fuzzy
219 // profile matching which flattens the profile and inlinees' samples are
220 // merged into top-level function.
221 for (auto &Callees :
222 const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
223 auto &CalleesMap = Callees.second;
224 for (auto I = CalleesMap.begin(); I != CalleesMap.end();) {
225 auto FS = I++;
226 if (filterAmbiguousProfile(FS&: FS->second))
227 CalleesMap.erase(position: FS);
228 }
229 }
230 return false;
231}
232
233// For built-in local initialization function such as __cxx_global_var_init,
234// __tls_init prefix function, there could be multiple versions of the functions
235// in the final binary. However, in the profile generation, we call
236// getCanonicalFnName to canonicalize the names which strips the suffixes.
237// Therefore, samples from different functions queries the same profile and the
238// samples are merged. As the functions are essentially different, entries of
239// the merged profile are ambiguous. In sample loader, the IR from one version
240// would be attributed towards a merged entries, which is inaccurate. Especially
241// for fuzzy profile matching, it gets multiple callsites(from different
242// function) but used to match one callsite, which misleads the matching and
243// causes a lot of false positives report. Hence, we want to filter them out
244// from the profile map during the profile generation time. The profiles are all
245// cold functions, it won't have perf impact.
246void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
247 for (auto I = ProfileMap.begin(); I != ProfileMap.end();) {
248 auto FS = I++;
249 if (filterAmbiguousProfile(FS&: FS->second))
250 ProfileMap.erase(It: FS);
251 }
252}
253
254void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
255 const RangeSample &Ranges) {
256
257 /*
258 Regions may overlap with each other. Using the boundary info, find all
259 disjoint ranges and their sample count. BoundaryPoint contains the count
260 multiple samples begin/end at this points.
261
262 |<--100-->| Sample1
263 |<------200------>| Sample2
264 A B C
265
266 In the example above,
267 Sample1 begins at A, ends at B, its value is 100.
268 Sample2 beings at A, ends at C, its value is 200.
269 For A, BeginCount is the sum of sample begins at A, which is 300 and no
270 samples ends at A, so EndCount is 0.
271 Then boundary points A, B, and C with begin/end counts are:
272 A: (300, 0)
273 B: (0, 100)
274 C: (0, 200)
275 */
276 struct BoundaryPoint {
277 // Sum of sample counts beginning at this point
278 uint64_t BeginCount = UINT64_MAX;
279 // Sum of sample counts ending at this point
280 uint64_t EndCount = UINT64_MAX;
281 // Is the begin point of a zero range.
282 bool IsZeroRangeBegin = false;
283 // Is the end point of a zero range.
284 bool IsZeroRangeEnd = false;
285
286 void addBeginCount(uint64_t Count) {
287 if (BeginCount == UINT64_MAX)
288 BeginCount = 0;
289 BeginCount += Count;
290 }
291
292 void addEndCount(uint64_t Count) {
293 if (EndCount == UINT64_MAX)
294 EndCount = 0;
295 EndCount += Count;
296 }
297 };
298
299 /*
300 For the above example. With boundary points, follwing logic finds two
301 disjoint region of
302
303 [A,B]: 300
304 [B+1,C]: 200
305
306 If there is a boundary point that both begin and end, the point itself
307 becomes a separate disjoint region. For example, if we have original
308 ranges of
309
310 |<--- 100 --->|
311 |<--- 200 --->|
312 A B C
313
314 there are three boundary points with their begin/end counts of
315
316 A: (100, 0)
317 B: (200, 100)
318 C: (0, 200)
319
320 the disjoint ranges would be
321
322 [A, B-1]: 100
323 [B, B]: 300
324 [B+1, C]: 200.
325
326 Example for zero value range:
327
328 |<--- 100 --->|
329 |<--- 200 --->|
330 |<--------------- 0 ----------------->|
331 A B C D E F
332
333 [A, B-1] : 0
334 [B, C] : 100
335 [C+1, D-1]: 0
336 [D, E] : 200
337 [E+1, F] : 0
338 */
339 std::map<uint64_t, BoundaryPoint> Boundaries;
340
341 for (const auto &Item : Ranges) {
342 assert(Item.first.first <= Item.first.second &&
343 "Invalid instruction range");
344 auto &BeginPoint = Boundaries[Item.first.first];
345 auto &EndPoint = Boundaries[Item.first.second];
346 uint64_t Count = Item.second;
347
348 BeginPoint.addBeginCount(Count);
349 EndPoint.addEndCount(Count);
350 if (Count == 0) {
351 BeginPoint.IsZeroRangeBegin = true;
352 EndPoint.IsZeroRangeEnd = true;
353 }
354 }
355
356 // Use UINT64_MAX to indicate there is no existing range between BeginAddress
357 // and the next valid address
358 uint64_t BeginAddress = UINT64_MAX;
359 int ZeroRangeDepth = 0;
360 uint64_t Count = 0;
361 for (const auto &Item : Boundaries) {
362 uint64_t Address = Item.first;
363 const BoundaryPoint &Point = Item.second;
364 if (Point.BeginCount != UINT64_MAX) {
365 if (BeginAddress != UINT64_MAX)
366 DisjointRanges[{BeginAddress, Address - 1}] = Count;
367 Count += Point.BeginCount;
368 BeginAddress = Address;
369 ZeroRangeDepth += Point.IsZeroRangeBegin;
370 }
371 if (Point.EndCount != UINT64_MAX) {
372 assert((BeginAddress != UINT64_MAX) &&
373 "First boundary point cannot be 'end' point");
374 DisjointRanges[{BeginAddress, Address}] = Count;
375 assert(Count >= Point.EndCount && "Mismatched live ranges");
376 Count -= Point.EndCount;
377 BeginAddress = Address + 1;
378 ZeroRangeDepth -= Point.IsZeroRangeEnd;
379 // If the remaining count is zero and it's no longer in a zero range, this
380 // means we consume all the ranges before, thus mark BeginAddress as
381 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
382 // [<---- 10 ---->]
383 // [<---- 20 ---->]
384 // A B C D
385 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
386 // have the [B+1, C-1] zero range.
387 if (Count == 0 && ZeroRangeDepth == 0)
388 BeginAddress = UINT64_MAX;
389 }
390 }
391}
392
393void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
394 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
395 uint64_t Count) {
396 // Use the maximum count of samples with same line location
397 uint32_t Discriminator = getBaseDiscriminator(Discriminator: LeafLoc.Location.Discriminator);
398
399 // Use duplication factor to compensated for loop unroll/vectorization.
400 // Note that this is only needed when we're taking MAX of the counts at
401 // the location instead of SUM.
402 Count *= getDuplicationFactor(Discriminator: LeafLoc.Location.Discriminator);
403
404 ErrorOr<uint64_t> R =
405 FunctionProfile.findSamplesAt(LineOffset: LeafLoc.Location.LineOffset, Discriminator);
406
407 uint64_t PreviousCount = R ? R.get() : 0;
408 if (PreviousCount <= Count) {
409 FunctionProfile.addBodySamples(LineOffset: LeafLoc.Location.LineOffset, Discriminator,
410 Num: Count - PreviousCount);
411 }
412}
413
414void ProfileGeneratorBase::updateTotalSamples() {
415 for (auto &Item : ProfileMap) {
416 FunctionSamples &FunctionProfile = Item.second;
417 FunctionProfile.updateTotalSamples();
418 }
419}
420
421void ProfileGeneratorBase::updateCallsiteSamples() {
422 for (auto &Item : ProfileMap) {
423 FunctionSamples &FunctionProfile = Item.second;
424 FunctionProfile.updateCallsiteSamples();
425 }
426}
427
428void ProfileGeneratorBase::updateFunctionSamples() {
429 updateCallsiteSamples();
430
431 if (UpdateTotalSamples)
432 updateTotalSamples();
433}
434
435void ProfileGeneratorBase::collectProfiledFunctions() {
436 std::unordered_set<const BinaryFunction *> ProfiledFunctions;
437 if (collectFunctionsFromRawProfile(ProfiledFunctions))
438 Binary->setProfiledFunctions(ProfiledFunctions);
439 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
440 Binary->setProfiledFunctions(ProfiledFunctions);
441 else
442 llvm_unreachable("Unsupported input profile");
443}
444
445bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
446 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
447 if (!SampleCounters)
448 return false;
449 // Go through all the stacks, ranges and branches in sample counters, use
450 // the start of the range to look up the function it belongs and record the
451 // function.
452 for (const auto &CI : *SampleCounters) {
453 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr())) {
454 for (auto StackAddr : CtxKey->Context) {
455 if (FuncRange *FRange = Binary->findFuncRange(Address: StackAddr))
456 ProfiledFunctions.insert(x: FRange->Func);
457 }
458 }
459
460 for (auto Item : CI.second.RangeCounter) {
461 uint64_t StartAddress = Item.first.first;
462 if (FuncRange *FRange = Binary->findFuncRange(Address: StartAddress))
463 ProfiledFunctions.insert(x: FRange->Func);
464 }
465
466 for (auto Item : CI.second.BranchCounter) {
467 uint64_t SourceAddress = Item.first.first;
468 uint64_t TargetAddress = Item.first.second;
469 if (FuncRange *FRange = Binary->findFuncRange(Address: SourceAddress))
470 ProfiledFunctions.insert(x: FRange->Func);
471 if (FuncRange *FRange = Binary->findFuncRange(Address: TargetAddress))
472 ProfiledFunctions.insert(x: FRange->Func);
473 }
474 }
475 return true;
476}
477
478bool ProfileGenerator::collectFunctionsFromLLVMProfile(
479 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
480 for (const auto &FS : ProfileMap) {
481 if (auto *Func = Binary->getBinaryFunction(FName: FS.second.getFunction()))
482 ProfiledFunctions.insert(x: Func);
483 }
484 return true;
485}
486
487bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
488 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
489 for (auto *Node : ContextTracker) {
490 if (!Node->getFuncName().empty())
491 if (auto *Func = Binary->getBinaryFunction(FName: Node->getFuncName()))
492 ProfiledFunctions.insert(x: Func);
493 }
494 return true;
495}
496
497FunctionSamples &
498ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) {
499 SampleContext Context(FuncName);
500 return ProfileMap.create(Ctx: Context);
501}
502
503void ProfileGenerator::generateProfile() {
504 collectProfiledFunctions();
505
506 if (Binary->usePseudoProbes()) {
507 Binary->decodePseudoProbe();
508 if (LoadFunctionFromSymbol)
509 Binary->loadSymbolsFromPseudoProbe();
510 }
511
512 if (SampleCounters) {
513 if (Binary->usePseudoProbes()) {
514 generateProbeBasedProfile();
515 } else {
516 generateLineNumBasedProfile();
517 }
518 }
519
520 postProcessProfiles();
521}
522
523void ProfileGeneratorBase::markAllContextPreinlined(
524 SampleProfileMap &ProfileMap) {
525 for (auto &I : ProfileMap)
526 I.second.setContextAttribute(ContextShouldBeInlined);
527 FunctionSamples::ProfileIsPreInlined = true;
528}
529
530void ProfileGenerator::postProcessProfiles() {
531 computeSummaryAndThreshold(ProfileMap);
532 trimColdProfiles(Profiles: ProfileMap, ColdCntThreshold: ColdCountThreshold);
533 filterAmbiguousProfile(Profiles&: ProfileMap);
534 if (MarkAllContextPreinlined)
535 markAllContextPreinlined(ProfileMap);
536 calculateAndShowDensity(Profiles: ProfileMap);
537}
538
539void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
540 uint64_t ColdCntThreshold) {
541 if (!TrimColdProfile)
542 return;
543
544 // Move cold profiles into a tmp container.
545 std::vector<hash_code> ColdProfileHashes;
546 for (const auto &I : ProfileMap) {
547 if (I.second.getTotalSamples() < ColdCntThreshold)
548 ColdProfileHashes.emplace_back(args: I.first);
549 }
550
551 // Remove the cold profile from ProfileMap.
552 for (const auto &I : ColdProfileHashes)
553 ProfileMap.erase(Key: I);
554}
555
556void ProfileGenerator::generateLineNumBasedProfile() {
557 assert(SampleCounters->size() == 1 &&
558 "Must have one entry for profile generation.");
559 const SampleCounter &SC = SampleCounters->begin()->second;
560 // Fill in function body samples
561 populateBodySamplesForAllFunctions(RangeCounter: SC.RangeCounter);
562 // Fill in boundary sample counts as well as call site samples for calls
563 populateBoundarySamplesForAllFunctions(BranchCounters: SC.BranchCounter);
564 populateTypeSamplesForAllFunctions(DataAccessSamples: SC.DataAccessCounter);
565
566 updateFunctionSamples();
567}
568
569void ProfileGenerator::generateProbeBasedProfile() {
570 assert(SampleCounters->size() == 1 &&
571 "Must have one entry for profile generation.");
572 // Enable pseudo probe functionalities in SampleProf
573 FunctionSamples::ProfileIsProbeBased = true;
574 const SampleCounter &SC = SampleCounters->begin()->second;
575 // Fill in function body samples
576 populateBodySamplesWithProbesForAllFunctions(RangeCounter: SC.RangeCounter);
577 // Fill in boundary sample counts as well as call site samples for calls
578 populateBoundarySamplesWithProbesForAllFunctions(BranchCounters: SC.BranchCounter);
579
580 updateFunctionSamples();
581}
582
583void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
584 const RangeSample &RangeCounter) {
585 ProbeCounterMap ProbeCounter;
586 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
587 // inside extractProbesFromRange.
588 extractProbesFromRange(RangeCounter: preprocessRangeCounter(RangeCounter), ProbeCounter,
589 FindDisjointRanges: false);
590
591 for (const auto &PI : ProbeCounter) {
592 const MCDecodedPseudoProbe *Probe = PI.first;
593 uint64_t Count = PI.second;
594 SampleContextFrameVector FrameVec;
595 Binary->getInlineContextForProbe(Probe, InlineContextStack&: FrameVec, IncludeLeaf: true);
596 FunctionSamples &FunctionProfile =
597 getLeafProfileAndAddTotalSamples(FrameVec, Count);
598 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
599 Num: Count);
600 if (Probe->isEntry())
601 FunctionProfile.addHeadSamples(Num: Count);
602 }
603}
604
605void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
606 const BranchSample &BranchCounters) {
607 for (const auto &Entry : BranchCounters) {
608 uint64_t SourceAddress = Entry.first.first;
609 uint64_t TargetAddress = Entry.first.second;
610 uint64_t Count = Entry.second;
611 assert(Count != 0 && "Unexpected zero weight branch");
612
613 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
614 if (CalleeName.size() == 0)
615 continue;
616
617 const MCDecodedPseudoProbe *CallProbe =
618 Binary->getCallProbeForAddr(Address: SourceAddress);
619 if (CallProbe == nullptr)
620 continue;
621
622 // Record called target sample and its count.
623 SampleContextFrameVector FrameVec;
624 Binary->getInlineContextForProbe(Probe: CallProbe, InlineContextStack&: FrameVec, IncludeLeaf: true);
625
626 if (!FrameVec.empty()) {
627 FunctionSamples &FunctionProfile =
628 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
629 FunctionProfile.addCalledTargetSamples(
630 LineOffset: FrameVec.back().Location.LineOffset,
631 Discriminator: FrameVec.back().Location.Discriminator, Func: FunctionId(CalleeName),
632 Num: Count);
633 }
634 }
635}
636
637FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
638 const SampleContextFrameVector &FrameVec, uint64_t Count) {
639 // Get top level profile
640 FunctionSamples *FunctionProfile =
641 &getTopLevelFunctionProfile(FuncName: FrameVec[0].Func);
642 FunctionProfile->addTotalSamples(Num: Count);
643 if (Binary->usePseudoProbes()) {
644 const auto *FuncDesc = Binary->getFuncDescForGUID(
645 GUID: FunctionProfile->getFunction().getHashCode());
646 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
647 }
648
649 for (size_t I = 1; I < FrameVec.size(); I++) {
650 LineLocation Callsite(
651 FrameVec[I - 1].Location.LineOffset,
652 getBaseDiscriminator(Discriminator: FrameVec[I - 1].Location.Discriminator));
653 FunctionSamplesMap &SamplesMap =
654 FunctionProfile->functionSamplesAt(Loc: Callsite);
655 auto Ret = SamplesMap.emplace(args: FrameVec[I].Func, args: FunctionSamples());
656 if (Ret.second) {
657 SampleContext Context(FrameVec[I].Func);
658 Ret.first->second.setContext(Context);
659 }
660 FunctionProfile = &Ret.first->second;
661 FunctionProfile->addTotalSamples(Num: Count);
662 if (Binary->usePseudoProbes()) {
663 const auto *FuncDesc = Binary->getFuncDescForGUID(
664 GUID: FunctionProfile->getFunction().getHashCode());
665 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
666 }
667 }
668
669 return *FunctionProfile;
670}
671
672RangeSample
673ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
674 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
675 if (FillZeroForAllFuncs) {
676 for (auto &FuncI : Binary->getAllBinaryFunctions()) {
677 for (auto &R : FuncI.second.Ranges) {
678 Ranges[{R.first, R.second - 1}] += 0;
679 }
680 }
681 } else {
682 // For each range, we search for all ranges of the function it belongs to
683 // and initialize it with zero count, so it remains zero if doesn't hit any
684 // samples. This is to be consistent with compiler that interpret zero count
685 // as unexecuted(cold).
686 for (const auto &I : RangeCounter) {
687 uint64_t StartAddress = I.first.first;
688 for (const auto &Range : Binary->getRanges(Address: StartAddress))
689 Ranges[{Range.first, Range.second - 1}] += 0;
690 }
691 }
692 RangeSample DisjointRanges;
693 findDisjointRanges(DisjointRanges, Ranges);
694 return DisjointRanges;
695}
696
697void ProfileGenerator::populateBodySamplesForAllFunctions(
698 const RangeSample &RangeCounter) {
699 for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
700 uint64_t RangeBegin = Range.first.first;
701 uint64_t RangeEnd = Range.first.second;
702 uint64_t Count = Range.second;
703
704 InstructionPointer IP(Binary, RangeBegin, true);
705 // Disjoint ranges may have range in the middle of two instr,
706 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
707 // can be Addr1+1 to Addr2-1. We should ignore such range.
708 if (IP.Address > RangeEnd)
709 continue;
710
711 do {
712 const SampleContextFrameVector FrameVec =
713 Binary->getFrameLocationStack(Address: IP.Address);
714 if (!FrameVec.empty()) {
715 // FIXME: As accumulating total count per instruction caused some
716 // regression, we changed to accumulate total count per byte as a
717 // workaround. Tuning hotness threshold on the compiler side might be
718 // necessary in the future.
719 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
720 FrameVec, Count: Count * Binary->getInstSize(Address: IP.Address));
721 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: FrameVec.back(),
722 Count);
723 }
724 } while (IP.advance() && IP.Address <= RangeEnd);
725 }
726}
727
728StringRef
729ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
730 // Get the function range by branch target if it's a call branch.
731 auto *FRange = Binary->findFuncRangeForStartAddr(Address: TargetAddress);
732
733 // We won't accumulate sample count for a range whose start is not the real
734 // function entry such as outlined function or inner labels.
735 if (!FRange || !FRange->IsFuncEntry)
736 return StringRef();
737
738 // DWARF and symbol table may have mismatching function names. Instead, we'll
739 // try to use its pseudo probe name first.
740 if (Binary->usePseudoProbes()) {
741 auto FuncName = Binary->findPseudoProbeName(Func: FRange->Func);
742 if (FuncName.size())
743 return FunctionSamples::getCanonicalFnName(FnName: FuncName);
744 }
745
746 return FunctionSamples::getCanonicalFnName(FnName: FRange->getFuncName());
747}
748
749void ProfileGenerator::populateBoundarySamplesForAllFunctions(
750 const BranchSample &BranchCounters) {
751 for (const auto &Entry : BranchCounters) {
752 uint64_t SourceAddress = Entry.first.first;
753 uint64_t TargetAddress = Entry.first.second;
754 uint64_t Count = Entry.second;
755 assert(Count != 0 && "Unexpected zero weight branch");
756
757 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
758 if (CalleeName.size() == 0)
759 continue;
760 // Record called target sample and its count.
761 const SampleContextFrameVector &FrameVec =
762 Binary->getCachedFrameLocationStack(Address: SourceAddress);
763 if (!FrameVec.empty()) {
764 FunctionSamples &FunctionProfile =
765 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
766 FunctionProfile.addCalledTargetSamples(
767 LineOffset: FrameVec.back().Location.LineOffset,
768 Discriminator: getBaseDiscriminator(Discriminator: FrameVec.back().Location.Discriminator),
769 Func: FunctionId(CalleeName), Num: Count);
770 }
771 // Add head samples for callee.
772 FunctionSamples &CalleeProfile =
773 getTopLevelFunctionProfile(FuncName: FunctionId(CalleeName));
774 CalleeProfile.addHeadSamples(Num: Count);
775 }
776}
777
778void ProfileGenerator::populateTypeSamplesForAllFunctions(
779 const DataAccessSample &DataAccessSamples) {
780 // For each instruction with vtable accesses, get its symbolized inline
781 // stack, and add the vtable counters to the function samples.
782 for (const auto &[IpData, Count] : DataAccessSamples) {
783 uint64_t InstAddr = IpData.first;
784 const SampleContextFrameVector &FrameVec =
785 Binary->getCachedFrameLocationStack(Address: InstAddr,
786 /* UseProbeDiscriminator= */ false);
787 if (!FrameVec.empty()) {
788 FunctionSamples &FunctionProfile =
789 getLeafProfileAndAddTotalSamples(FrameVec, /* Count= */ 0);
790 LineLocation Loc(
791 FrameVec.back().Location.LineOffset,
792 getBaseDiscriminator(Discriminator: FrameVec.back().Location.Discriminator));
793 FunctionProfile.addTypeSamplesAt(Loc, Type: FunctionId(IpData.second), Count);
794 }
795 }
796}
797
798void ProfileGeneratorBase::calculateBodySamplesAndSize(
799 const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
800 uint64_t &FuncBodySize) {
801 // Note that ideally the size should be the number of function instruction.
802 // However, for probe-based profile, we don't have the accurate instruction
803 // count for each probe, instead, the probe sample is the samples count for
804 // the block, which is equivelant to
805 // total_instruction_samples/num_of_instruction in one block. Hence, we use
806 // the number of probe as a proxy for the function's size.
807 FuncBodySize += FSamples.getBodySamples().size();
808
809 // The accumulated body samples re-calculated here could be different from the
810 // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
811 // based profile. The reason is that TotalSamples is the sum of all the
812 // samples of the machine instruction in one source-code line, however, the
813 // entry of Bodysamples is the only max number of them, so the TotalSamples is
814 // usually much bigger than the accumulated body samples as one souce-code
815 // line can emit many machine instructions. We observed a regression when we
816 // switched to use the accumulated body samples(by using
817 // -update-total-samples). Hence, it's safer to re-calculate here to avoid
818 // such discrepancy. There is no problem for probe-based profile, as the
819 // TotalSamples is exactly the same as the accumulated body samples.
820 for (const auto &I : FSamples.getBodySamples())
821 TotalBodySamples += I.second.getSamples();
822
823 for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
824 for (const auto &Callee : CallsiteSamples.second) {
825 // For binary-level density, the inlinees' samples and size should be
826 // included in the calculation.
827 calculateBodySamplesAndSize(FSamples: Callee.second, TotalBodySamples,
828 FuncBodySize);
829 }
830}
831
832// Calculate Profile-density:
833// Calculate the density for each function and sort them in descending order,
834// keep accumulating their total samples unitl it exceeds the
835// percentage_threshold(cut-off) of total profile samples, the profile-density
836// is the last(minimum) function-density of the processed functions, which means
837// all the functions hot to perf are on good density if the profile-density is
838// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
839// depending on how much regression the system want to tolerate.
840double
841ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
842 double ProfileDensity = 0.0;
843
844 uint64_t TotalProfileSamples = 0;
845 // A list of the function profile density and its total samples.
846 std::vector<std::pair<double, uint64_t>> FuncDensityList;
847 for (const auto &I : Profiles) {
848 uint64_t TotalBodySamples = 0;
849 uint64_t FuncBodySize = 0;
850 calculateBodySamplesAndSize(FSamples: I.second, TotalBodySamples, FuncBodySize);
851
852 if (FuncBodySize == 0)
853 continue;
854
855 double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
856 TotalProfileSamples += TotalBodySamples;
857 FuncDensityList.emplace_back(args&: FuncDensity, args&: TotalBodySamples);
858 }
859
860 // Sorted by the density in descending order.
861 llvm::stable_sort(Range&: FuncDensityList, C: [&](const std::pair<double, uint64_t> &A,
862 const std::pair<double, uint64_t> &B) {
863 if (A.first != B.first)
864 return A.first > B.first;
865 return A.second < B.second;
866 });
867
868 uint64_t AccumulatedSamples = 0;
869 uint32_t I = 0;
870 assert(ProfileDensityCutOffHot <= 1000000 &&
871 "The cutoff value is greater than 1000000(100%)");
872 while (AccumulatedSamples < TotalProfileSamples *
873 static_cast<float>(ProfileDensityCutOffHot) /
874 1000000 &&
875 I < FuncDensityList.size()) {
876 AccumulatedSamples += FuncDensityList[I].second;
877 ProfileDensity = FuncDensityList[I].first;
878 I++;
879 }
880
881 return ProfileDensity;
882}
883
884void ProfileGeneratorBase::calculateAndShowDensity(
885 const SampleProfileMap &Profiles) {
886 double Density = calculateDensity(Profiles);
887 showDensitySuggestion(Density);
888}
889
890FunctionSamples *
891CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
892 bool WasLeafInlined) {
893 FunctionSamples *FProfile = ContextNode->getFunctionSamples();
894 if (!FProfile) {
895 FSamplesList.emplace_back();
896 FProfile = &FSamplesList.back();
897 FProfile->setFunction(ContextNode->getFuncName());
898 ContextNode->setFunctionSamples(FProfile);
899 }
900 // Update ContextWasInlined attribute for existing contexts.
901 // The current function can be called in two ways:
902 // - when processing a probe of the current frame
903 // - when processing the entry probe of an inlinee's frame, which
904 // is then used to update the callsite count of the current frame.
905 // The two can happen in any order, hence here we are making sure
906 // `ContextWasInlined` is always set as expected.
907 // TODO: Note that the former does not always happen if no probes of the
908 // current frame has samples, and if the latter happens, we could lose the
909 // attribute. This should be fixed.
910 if (WasLeafInlined)
911 FProfile->getContext().setAttribute(ContextWasInlined);
912 return FProfile;
913}
914
915ContextTrieNode *
916CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context,
917 bool WasLeafInlined) {
918 ContextTrieNode *ContextNode =
919 ContextTracker.getOrCreateContextPath(Context, AllowCreate: true);
920 getOrCreateFunctionSamples(ContextNode, WasLeafInlined);
921 return ContextNode;
922}
923
924void CSProfileGenerator::generateProfile() {
925 FunctionSamples::ProfileIsCS = true;
926
927 collectProfiledFunctions();
928
929 if (Binary->usePseudoProbes()) {
930 Binary->decodePseudoProbe();
931 if (InferMissingFrames)
932 initializeMissingFrameInferrer();
933 if (LoadFunctionFromSymbol)
934 Binary->loadSymbolsFromPseudoProbe();
935 }
936
937 if (SampleCounters) {
938 if (Binary->usePseudoProbes()) {
939 generateProbeBasedProfile();
940 } else {
941 generateLineNumBasedProfile();
942 }
943 }
944
945 if (Binary->getTrackFuncContextSize())
946 computeSizeForProfiledFunctions();
947
948 postProcessProfiles();
949}
950
951void CSProfileGenerator::initializeMissingFrameInferrer() {
952 Binary->getMissingContextInferrer()->initialize(SampleCounters);
953}
954
955void CSProfileGenerator::inferMissingFrames(
956 const SmallVectorImpl<uint64_t> &Context,
957 SmallVectorImpl<uint64_t> &NewContext) {
958 Binary->inferMissingFrames(Context, NewContext);
959}
960
961void CSProfileGenerator::computeSizeForProfiledFunctions() {
962 for (auto *Func : Binary->getProfiledFunctions())
963 Binary->computeInlinedContextSizeForFunc(Func);
964
965 // Flush the symbolizer to save memory.
966 Binary->flushSymbolizer();
967}
968
969void CSProfileGenerator::updateFunctionSamples() {
970 for (auto *Node : ContextTracker) {
971 FunctionSamples *FSamples = Node->getFunctionSamples();
972 if (FSamples) {
973 if (UpdateTotalSamples)
974 FSamples->updateTotalSamples();
975 FSamples->updateCallsiteSamples();
976 }
977 }
978}
979
980void CSProfileGenerator::generateLineNumBasedProfile() {
981 for (const auto &CI : *SampleCounters) {
982 const auto *CtxKey = cast<StringBasedCtxKey>(Val: CI.first.getPtr());
983
984 ContextTrieNode *ContextNode = &getRootContext();
985 // Sample context will be empty if the jump is an external-to-internal call
986 // pattern, the head samples should be added for the internal function.
987 if (!CtxKey->Context.empty()) {
988 // Get or create function profile for the range
989 ContextNode =
990 getOrCreateContextNode(Context: CtxKey->Context, WasLeafInlined: CtxKey->WasLeafInlined);
991 // Fill in function body samples
992 populateBodySamplesForFunction(FunctionProfile&: *ContextNode->getFunctionSamples(),
993 RangeCounters: CI.second.RangeCounter);
994 }
995 // Fill in boundary sample counts as well as call site samples for calls
996 populateBoundarySamplesForFunction(CallerNode: ContextNode, BranchCounters: CI.second.BranchCounter);
997 }
998 // Fill in call site value sample for inlined calls and also use context to
999 // infer missing samples. Since we don't have call count for inlined
1000 // functions, we estimate it from inlinee's profile using the entry of the
1001 // body sample.
1002 populateInferredFunctionSamples(Node&: getRootContext());
1003
1004 updateFunctionSamples();
1005}
1006
1007void CSProfileGenerator::populateBodySamplesForFunction(
1008 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
1009 // Compute disjoint ranges first, so we can use MAX
1010 // for calculating count for each location.
1011 RangeSample Ranges;
1012 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
1013 for (const auto &Range : Ranges) {
1014 uint64_t RangeBegin = Range.first.first;
1015 uint64_t RangeEnd = Range.first.second;
1016 uint64_t Count = Range.second;
1017 // Disjoint ranges have introduce zero-filled gap that
1018 // doesn't belong to current context, filter them out.
1019 if (Count == 0)
1020 continue;
1021
1022 InstructionPointer IP(Binary, RangeBegin, true);
1023 // Disjoint ranges may have range in the middle of two instr,
1024 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1025 // can be Addr1+1 to Addr2-1. We should ignore such range.
1026 if (IP.Address > RangeEnd)
1027 continue;
1028
1029 do {
1030 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: IP.Address);
1031 if (LeafLoc) {
1032 // Recording body sample for this specific context
1033 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: *LeafLoc, Count);
1034 FunctionProfile.addTotalSamples(Num: Count);
1035 }
1036 } while (IP.advance() && IP.Address <= RangeEnd);
1037 }
1038}
1039
1040void CSProfileGenerator::populateBoundarySamplesForFunction(
1041 ContextTrieNode *Node, const BranchSample &BranchCounters) {
1042
1043 for (const auto &Entry : BranchCounters) {
1044 uint64_t SourceAddress = Entry.first.first;
1045 uint64_t TargetAddress = Entry.first.second;
1046 uint64_t Count = Entry.second;
1047 assert(Count != 0 && "Unexpected zero weight branch");
1048
1049 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1050 if (CalleeName.size() == 0)
1051 continue;
1052
1053 ContextTrieNode *CallerNode = Node;
1054 LineLocation CalleeCallSite(0, 0);
1055 if (CallerNode != &getRootContext()) {
1056 // Record called target sample and its count
1057 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: SourceAddress);
1058 if (LeafLoc) {
1059 CallerNode->getFunctionSamples()->addCalledTargetSamples(
1060 LineOffset: LeafLoc->Location.LineOffset,
1061 Discriminator: getBaseDiscriminator(Discriminator: LeafLoc->Location.Discriminator),
1062 Func: FunctionId(CalleeName),
1063 Num: Count);
1064 // Record head sample for called target(callee)
1065 CalleeCallSite = LeafLoc->Location;
1066 }
1067 }
1068
1069 ContextTrieNode *CalleeNode =
1070 CallerNode->getOrCreateChildContext(CallSite: CalleeCallSite,
1071 ChildName: FunctionId(CalleeName));
1072 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(ContextNode: CalleeNode);
1073 CalleeProfile->addHeadSamples(Num: Count);
1074 }
1075}
1076
1077void CSProfileGenerator::populateInferredFunctionSamples(
1078 ContextTrieNode &Node) {
1079 // There is no call jmp sample between the inliner and inlinee, we need to use
1080 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s
1081 // sample depends on child(inlinee)'s sample, so traverse the tree in
1082 // post-order.
1083 for (auto &It : Node.getAllChildContext())
1084 populateInferredFunctionSamples(Node&: It.second);
1085
1086 FunctionSamples *CalleeProfile = Node.getFunctionSamples();
1087 if (!CalleeProfile)
1088 return;
1089 // If we already have head sample counts, we must have value profile
1090 // for call sites added already. Skip to avoid double counting.
1091 if (CalleeProfile->getHeadSamples())
1092 return;
1093 ContextTrieNode *CallerNode = Node.getParentContext();
1094 // If we don't have context, nothing to do for caller's call site.
1095 // This could happen for entry point function.
1096 if (CallerNode == &getRootContext())
1097 return;
1098
1099 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc();
1100 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(ContextNode: CallerNode);
1101 // Since we don't have call count for inlined functions, we
1102 // estimate it from inlinee's profile using entry body sample.
1103 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate();
1104 // If we don't have samples with location, use 1 to indicate live.
1105 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size())
1106 EstimatedCallCount = 1;
1107 CallerProfile.addCalledTargetSamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1108 Discriminator: CallerLeafFrameLoc.Discriminator,
1109 Func: Node.getFuncName(), Num: EstimatedCallCount);
1110 CallerProfile.addBodySamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1111 Discriminator: CallerLeafFrameLoc.Discriminator,
1112 Num: EstimatedCallCount);
1113 CallerProfile.addTotalSamples(Num: EstimatedCallCount);
1114}
1115
1116void CSProfileGenerator::convertToProfileMap(
1117 ContextTrieNode &Node, SampleContextFrameVector &Context) {
1118 FunctionSamples *FProfile = Node.getFunctionSamples();
1119 if (FProfile) {
1120 Context.emplace_back(Args: Node.getFuncName(), Args: LineLocation(0, 0));
1121 // Save the new context for future references.
1122 SampleContextFrames NewContext = *Contexts.insert(x: Context).first;
1123 auto Ret = ProfileMap.emplace(Args&: NewContext, Args: std::move(*FProfile));
1124 FunctionSamples &NewProfile = Ret.first->second;
1125 NewProfile.getContext().setContext(Context: NewContext);
1126 Context.pop_back();
1127 }
1128
1129 for (auto &It : Node.getAllChildContext()) {
1130 ContextTrieNode &ChildNode = It.second;
1131 Context.emplace_back(Args: Node.getFuncName(), Args: ChildNode.getCallSiteLoc());
1132 convertToProfileMap(Node&: ChildNode, Context);
1133 Context.pop_back();
1134 }
1135}
1136
1137void CSProfileGenerator::convertToProfileMap() {
1138 assert(ProfileMap.empty() &&
1139 "ProfileMap should be empty before converting from the trie");
1140 assert(IsProfileValidOnTrie &&
1141 "Do not convert the trie twice, it's already destroyed");
1142
1143 SampleContextFrameVector Context;
1144 for (auto &It : getRootContext().getAllChildContext())
1145 convertToProfileMap(Node&: It.second, Context);
1146
1147 IsProfileValidOnTrie = false;
1148}
1149
1150void CSProfileGenerator::postProcessProfiles() {
1151 // Compute hot/cold threshold based on profile. This will be used for cold
1152 // context profile merging/trimming.
1153 computeSummaryAndThreshold();
1154
1155 // Run global pre-inliner to adjust/merge context profile based on estimated
1156 // inline decisions.
1157 if (EnableCSPreInliner) {
1158 ContextTracker.populateFuncToCtxtMap();
1159 CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
1160 // Turn off the profile merger by default unless it is explicitly enabled.
1161 if (!CSProfMergeColdContext.getNumOccurrences())
1162 CSProfMergeColdContext = false;
1163 }
1164
1165 convertToProfileMap();
1166
1167 // Trim and merge cold context profile using cold threshold above.
1168 if (TrimColdProfile || CSProfMergeColdContext) {
1169 SampleContextTrimmer(ProfileMap)
1170 .trimAndMergeColdContextProfiles(
1171 ColdCountThreshold: HotCountThreshold, TrimColdContext: TrimColdProfile, MergeColdContext: CSProfMergeColdContext,
1172 ColdContextFrameLength: CSProfMaxColdContextDepth, TrimBaseProfileOnly: EnableCSPreInliner);
1173 }
1174
1175 if (GenCSNestedProfile) {
1176 ProfileConverter CSConverter(ProfileMap);
1177 CSConverter.convertCSProfiles();
1178 FunctionSamples::ProfileIsCS = false;
1179 }
1180 filterAmbiguousProfile(Profiles&: ProfileMap);
1181 if (MarkAllContextPreinlined)
1182 markAllContextPreinlined(ProfileMap);
1183 ProfileGeneratorBase::calculateAndShowDensity(Profiles: ProfileMap);
1184}
1185
1186void ProfileGeneratorBase::computeSummaryAndThreshold(
1187 SampleProfileMap &Profiles) {
1188 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1189 Summary = Builder.computeSummaryForProfiles(Profiles);
1190 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
1191 DS: (Summary->getDetailedSummary()));
1192 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1193 DS: (Summary->getDetailedSummary()));
1194}
1195
1196void CSProfileGenerator::computeSummaryAndThreshold() {
1197 // Always merge and use context-less profile map to compute summary.
1198 SampleProfileMap ContextLessProfiles;
1199 ContextTracker.createContextLessProfileMap(ContextLessProfiles);
1200
1201 // Set the flag below to avoid merging the profile again in
1202 // computeSummaryAndThreshold
1203 FunctionSamples::ProfileIsCS = false;
1204 assert(
1205 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
1206 "Don't set --profile-summary-contextless to false for profile "
1207 "generation");
1208 ProfileGeneratorBase::computeSummaryAndThreshold(Profiles&: ContextLessProfiles);
1209 // Recover the old value.
1210 FunctionSamples::ProfileIsCS = true;
1211}
1212
1213void ProfileGeneratorBase::extractProbesFromRange(
1214 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
1215 bool FindDisjointRanges) {
1216 const RangeSample *PRanges = &RangeCounter;
1217 RangeSample Ranges;
1218 if (FindDisjointRanges) {
1219 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
1220 PRanges = &Ranges;
1221 }
1222
1223 for (const auto &Range : *PRanges) {
1224 uint64_t RangeBegin = Range.first.first;
1225 uint64_t RangeEnd = Range.first.second;
1226 uint64_t Count = Range.second;
1227
1228 InstructionPointer IP(Binary, RangeBegin, true);
1229 // Disjoint ranges may have range in the middle of two instr,
1230 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1231 // can be Addr1+1 to Addr2-1. We should ignore such range.
1232 if (IP.Address > RangeEnd)
1233 continue;
1234
1235 do {
1236 const AddressProbesMap &Address2ProbesMap =
1237 Binary->getAddress2ProbesMap();
1238 for (const MCDecodedPseudoProbe &Probe :
1239 Address2ProbesMap.find(Address: IP.Address)) {
1240 ProbeCounter[&Probe] += Count;
1241 }
1242 } while (IP.advance() && IP.Address <= RangeEnd);
1243 }
1244}
1245
1246static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1247 const SmallVectorImpl<uint64_t> &AddrVec,
1248 ProfiledBinary *Binary) {
1249 SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
1250 for (auto Address : reverse(C: AddrVec)) {
1251 const MCDecodedPseudoProbe *CallProbe =
1252 Binary->getCallProbeForAddr(Address);
1253 // These could be the cases when a probe is not found at a calliste. Cutting
1254 // off the context from here since the inliner will not know how to consume
1255 // a context with unknown callsites.
1256 // 1. for functions that are not sampled when
1257 // --decode-probe-for-profiled-functions-only is on.
1258 // 2. for a merged callsite. Callsite merging may cause the loss of original
1259 // probe IDs.
1260 // 3. for an external callsite.
1261 if (!CallProbe)
1262 break;
1263 Probes.push_back(Elt: CallProbe);
1264 }
1265
1266 std::reverse(first: Probes.begin(), last: Probes.end());
1267
1268 // Extract context stack for reusing, leaf context stack will be added
1269 // compressed while looking up function profile.
1270 for (const auto *P : Probes) {
1271 Binary->getInlineContextForProbe(Probe: P, InlineContextStack&: ContextStack, IncludeLeaf: true);
1272 }
1273}
1274
1275void CSProfileGenerator::generateProbeBasedProfile() {
1276 // Enable pseudo probe functionalities in SampleProf
1277 FunctionSamples::ProfileIsProbeBased = true;
1278 for (const auto &CI : *SampleCounters) {
1279 const AddrBasedCtxKey *CtxKey =
1280 dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr());
1281 // Fill in function body samples from probes, also infer caller's samples
1282 // from callee's probe
1283 populateBodySamplesWithProbes(RangeCounter: CI.second.RangeCounter, CtxKey);
1284 // Fill in boundary samples for a call probe
1285 populateBoundarySamplesWithProbes(BranchCounter: CI.second.BranchCounter, CtxKey);
1286 }
1287}
1288
1289void CSProfileGenerator::populateBodySamplesWithProbes(
1290 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
1291 ProbeCounterMap ProbeCounter;
1292 // Extract the top frame probes by looking up each address among the range in
1293 // the Address2ProbeMap
1294 extractProbesFromRange(RangeCounter, ProbeCounter);
1295 std::unordered_map<MCDecodedPseudoProbeInlineTree *,
1296 std::unordered_set<FunctionSamples *>>
1297 FrameSamples;
1298 for (const auto &PI : ProbeCounter) {
1299 const MCDecodedPseudoProbe *Probe = PI.first;
1300 uint64_t Count = PI.second;
1301 // Disjoint ranges have introduce zero-filled gap that
1302 // doesn't belong to current context, filter them out.
1303 if (!Probe->isBlock() || Count == 0)
1304 continue;
1305
1306 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, LeafProbe: Probe);
1307 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1308 // Record the current frame and FunctionProfile whenever samples are
1309 // collected for non-danglie probes. This is for reporting all of the
1310 // zero count probes of the frame later.
1311 FrameSamples[Probe->getInlineTreeNode()].insert(x: &FunctionProfile);
1312 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
1313 Num: Count);
1314 FunctionProfile.addTotalSamples(Num: Count);
1315 if (Probe->isEntry()) {
1316 FunctionProfile.addHeadSamples(Num: Count);
1317 // Look up for the caller's function profile
1318 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1319 ContextTrieNode *CallerNode = ContextNode->getParentContext();
1320 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
1321 // Since the context id will be compressed, we have to use callee's
1322 // context id to infer caller's context id to ensure they share the
1323 // same context prefix.
1324 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
1325 uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator;
1326 assert(CallerIndex &&
1327 "Inferred caller's location index shouldn't be zero!");
1328 assert(!CallerDiscriminator &&
1329 "Callsite probe should not have a discriminator!");
1330 FunctionSamples &CallerProfile =
1331 *getOrCreateFunctionSamples(ContextNode: CallerNode);
1332 CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1333 CallerProfile.addBodySamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator, Num: Count);
1334 CallerProfile.addTotalSamples(Num: Count);
1335 CallerProfile.addCalledTargetSamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator,
1336 Func: ContextNode->getFuncName(), Num: Count);
1337 }
1338 }
1339 }
1340
1341 // Assign zero count for remaining probes without sample hits to
1342 // differentiate from probes optimized away, of which the counts are unknown
1343 // and will be inferred by the compiler.
1344 for (auto &I : FrameSamples) {
1345 for (auto *FunctionProfile : I.second) {
1346 for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) {
1347 FunctionProfile->addBodySamples(LineOffset: Probe.getIndex(),
1348 Discriminator: Probe.getDiscriminator(), Num: 0);
1349 }
1350 }
1351 }
1352}
1353
1354void CSProfileGenerator::populateBoundarySamplesWithProbes(
1355 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) {
1356 for (const auto &BI : BranchCounter) {
1357 uint64_t SourceAddress = BI.first.first;
1358 uint64_t TargetAddress = BI.first.second;
1359 uint64_t Count = BI.second;
1360 const MCDecodedPseudoProbe *CallProbe =
1361 Binary->getCallProbeForAddr(Address: SourceAddress);
1362 if (CallProbe == nullptr)
1363 continue;
1364 FunctionSamples &FunctionProfile =
1365 getFunctionProfileForLeafProbe(CtxKey, LeafProbe: CallProbe);
1366 FunctionProfile.addBodySamples(LineOffset: CallProbe->getIndex(), Discriminator: 0, Num: Count);
1367 FunctionProfile.addTotalSamples(Num: Count);
1368 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1369 if (CalleeName.size() == 0)
1370 continue;
1371 FunctionProfile.addCalledTargetSamples(LineOffset: CallProbe->getIndex(),
1372 Discriminator: CallProbe->getDiscriminator(),
1373 Func: FunctionId(CalleeName), Num: Count);
1374 }
1375}
1376
1377ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe(
1378 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1379
1380 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context;
1381 SmallVector<uint64_t, 16> NewContext;
1382
1383 if (InferMissingFrames) {
1384 SmallVector<uint64_t, 16> Context = CtxKey->Context;
1385 // Append leaf frame for a complete inference.
1386 Context.push_back(Elt: LeafProbe->getAddress());
1387 inferMissingFrames(Context, NewContext);
1388 // Pop out the leaf probe that was pushed in above.
1389 NewContext.pop_back();
1390 PContext = &NewContext;
1391 }
1392
1393 SampleContextFrameVector ContextStack;
1394 extractPrefixContextStack(ContextStack, AddrVec: *PContext, Binary);
1395
1396 // Explicitly copy the context for appending the leaf context
1397 SampleContextFrameVector NewContextStack(ContextStack.begin(),
1398 ContextStack.end());
1399 Binary->getInlineContextForProbe(Probe: LeafProbe, InlineContextStack&: NewContextStack, IncludeLeaf: true);
1400 // For leaf inlined context with the top frame, we should strip off the top
1401 // frame's probe id, like:
1402 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1403 auto LeafFrame = NewContextStack.back();
1404 LeafFrame.Location = LineLocation(0, 0);
1405 NewContextStack.pop_back();
1406 // Compress the context string except for the leaf frame
1407 CSProfileGenerator::compressRecursionContext(Context&: NewContextStack);
1408 CSProfileGenerator::trimContext(S&: NewContextStack);
1409 NewContextStack.push_back(Elt: LeafFrame);
1410
1411 const auto *FuncDesc = Binary->getFuncDescForGUID(GUID: LeafProbe->getGuid());
1412 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1413 ContextTrieNode *ContextNode =
1414 getOrCreateContextNode(Context: NewContextStack, WasLeafInlined);
1415 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash);
1416 return ContextNode;
1417}
1418
1419FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1420 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1421 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples();
1422}
1423
1424} // end namespace sampleprof
1425} // end namespace llvm
1426