1//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8#include "ProfileGenerator.h"
9#include "ErrorHandling.h"
10#include "MissingFrameInferrer.h"
11#include "Options.h"
12#include "PerfReader.h"
13#include "ProfiledBinary.h"
14#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
15#include "llvm/ProfileData/ProfileCommon.h"
16#include "llvm/Support/Timer.h"
17#include <algorithm>
18#include <float.h>
19#include <unordered_set>
20#include <utility>
21
22namespace llvm {
23
24cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
25 cl::Required,
26 cl::desc("Output profile file"),
27 cl::cat(ProfGenCategory));
28static cl::alias OutputA("o", cl::desc("Alias for --output"),
29 cl::aliasopt(OutputFilename));
30
31static cl::opt<SampleProfileFormat> OutputFormat(
32 "format", cl::desc("Format of output profile"), cl::init(Val: SPF_Ext_Binary),
33 cl::values(clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
34 clEnumValN(SPF_Ext_Binary, "extbinary",
35 "Extensible binary encoding"),
36 clEnumValN(SPF_Text, "text", "Text encoding"),
37 clEnumValN(SPF_GCC, "gcc",
38 "GCC encoding (only meaningful for -sample)")),
39 cl::cat(ProfGenCategory));
40
41static cl::opt<bool> UseMD5(
42 "use-md5", cl::Hidden,
43 cl::desc("Use md5 to represent function names in the output profile (only "
44 "meaningful for -extbinary)"));
45
46static cl::opt<bool> PopulateProfileSymbolList(
47 "populate-profile-symbol-list", cl::init(Val: false), cl::Hidden,
48 cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
49
50static cl::opt<bool> FillZeroForAllFuncs(
51 "fill-zero-for-all-funcs", cl::init(Val: false), cl::Hidden,
52 cl::desc("Attribute all functions' range with zero count "
53 "even it's not hit by any samples."));
54
55static cl::opt<int32_t, true> RecursionCompression(
56 "compress-recursion",
57 cl::desc("Compressing recursion by deduplicating adjacent frame "
58 "sequences up to the specified size. -1 means no size limit."),
59 cl::Hidden,
60 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
61
62static cl::opt<bool>
63 TrimColdProfile("trim-cold-profile",
64 cl::desc("If the total count of the profile is smaller "
65 "than threshold, it will be trimmed."),
66 cl::cat(ProfGenCategory));
67
68static cl::opt<bool> MarkAllContextPreinlined(
69 "mark-all-context-preinlined",
70 cl::desc("Mark all function samples as preinlined(set "
71 "ContextShouldBeInlined attribute)."),
72 cl::init(Val: false));
73
74static cl::opt<bool> CSProfMergeColdContext(
75 "csprof-merge-cold-context", cl::init(Val: true),
76 cl::desc("If the total count of context profile is smaller than "
77 "the threshold, it will be merged into context-less base "
78 "profile."),
79 cl::cat(ProfGenCategory));
80
81static cl::opt<uint32_t> CSProfMaxColdContextDepth(
82 "csprof-max-cold-context-depth", cl::init(Val: 1),
83 cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
84 "context-less base profile"),
85 cl::cat(ProfGenCategory));
86
87static cl::opt<int, true> CSProfMaxContextDepth(
88 "csprof-max-context-depth",
89 cl::desc("Keep the last K contexts while merging profile. -1 means no "
90 "depth limit."),
91 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxContextDepth),
92 cl::cat(ProfGenCategory));
93
94static cl::opt<double> ProfileDensityThreshold(
95 "profile-density-threshold", cl::init(Val: 50),
96 cl::desc("If the profile density is below the given threshold, it "
97 "will be suggested to increase the sampling rate."),
98 cl::Optional, cl::cat(ProfGenCategory));
99static cl::opt<bool> ShowDensity("show-density", cl::init(Val: false),
100 cl::desc("show profile density details"),
101 cl::Optional, cl::cat(ProfGenCategory));
102static cl::opt<int> ProfileDensityCutOffHot(
103 "profile-density-cutoff-hot", cl::init(Val: 990000),
104 cl::desc("Total samples cutoff for functions used to calculate "
105 "profile density."),
106 cl::cat(ProfGenCategory));
107
108static cl::opt<bool> UpdateTotalSamples(
109 "update-total-samples", cl::init(Val: false),
110 cl::desc("Update total samples by accumulating all its body samples."),
111 cl::Optional, cl::cat(ProfGenCategory));
112
113static cl::opt<bool> GenCSNestedProfile(
114 "gen-cs-nested-profile", cl::Hidden, cl::init(Val: true),
115 cl::desc("Generate nested function profiles for CSSPGO"));
116
117cl::opt<bool> InferMissingFrames(
118 "infer-missing-frames", cl::init(Val: true),
119 cl::desc(
120 "Infer missing call frames due to compiler tail call elimination."),
121 cl::Optional, cl::cat(ProfGenCategory));
122
123namespace sampleprof {
124
125// Initialize the MaxCompressionSize to -1 which means no size limit
126int32_t CSProfileGenerator::MaxCompressionSize = -1;
127
128int CSProfileGenerator::MaxContextDepth = -1;
129
130bool ProfileGeneratorBase::UseFSDiscriminator = false;
131
132std::unique_ptr<ProfileGeneratorBase>
133ProfileGeneratorBase::create(ProfiledBinary *Binary,
134 const ContextSampleCounterMap *SampleCounters,
135 bool ProfileIsCS) {
136 std::unique_ptr<ProfileGeneratorBase> Generator;
137 if (ProfileIsCS) {
138 Generator.reset(p: new CSProfileGenerator(Binary, SampleCounters));
139 } else {
140 Generator.reset(p: new ProfileGenerator(Binary, SampleCounters));
141 }
142 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
143 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
144
145 return Generator;
146}
147
148std::unique_ptr<ProfileGeneratorBase>
149ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
150 bool ProfileIsCS) {
151 std::unique_ptr<ProfileGeneratorBase> Generator;
152 if (ProfileIsCS) {
153 Generator.reset(p: new CSProfileGenerator(Binary, Profiles));
154 } else {
155 Generator.reset(p: new ProfileGenerator(Binary, std::move(Profiles)));
156 }
157 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
158 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
159
160 return Generator;
161}
162
163void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
164 SampleProfileMap &ProfileMap) {
165 // Populate profile symbol list if extended binary format is used.
166 ProfileSymbolList SymbolList;
167
168 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
169 Binary->populateSymbolListFromDWARF(SymbolList);
170 Writer->setProfileSymbolList(&SymbolList);
171 }
172
173 if (std::error_code EC = Writer->write(ProfileMap))
174 exitWithError(EC: std::move(EC));
175}
176
177void ProfileGeneratorBase::write() {
178 auto WriterOrErr = SampleProfileWriter::create(Filename: OutputFilename, Format: OutputFormat);
179 if (std::error_code EC = WriterOrErr.getError())
180 exitWithError(EC, Whence: OutputFilename);
181
182 if (UseMD5) {
183 if (OutputFormat != SPF_Ext_Binary)
184 WithColor::warning() << "-use-md5 is ignored. Specify "
185 "--format=extbinary to enable it\n";
186 else
187 WriterOrErr.get()->setUseMD5();
188 }
189
190 write(Writer: std::move(WriterOrErr.get()), ProfileMap);
191}
192
193void ProfileGeneratorBase::showDensitySuggestion(double Density) {
194 if (Density == 0.0)
195 WithColor::warning() << "The output profile is empty or the "
196 "--profile-density-cutoff-hot option is "
197 "set too low. Please check your command.\n";
198 else if (Density < ProfileDensityThreshold)
199 WithColor::warning()
200 << "Sample PGO is estimated to optimize better with "
201 << format(Fmt: "%.1f", Vals: ProfileDensityThreshold / Density)
202 << "x more samples. Please consider increasing sampling rate or "
203 "profiling for longer duration to get more samples.\n";
204
205 if (ShowDensity)
206 outs() << "Functions with density >= " << format(Fmt: "%.1f", Vals: Density)
207 << " account for "
208 << format(Fmt: "%.2f",
209 Vals: static_cast<double>(ProfileDensityCutOffHot) / 10000)
210 << "% total sample counts.\n";
211}
212
213bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
214 for (const auto &Prefix : FuncPrefixsToFilter) {
215 if (FS.getFuncName().starts_with(Prefix))
216 return true;
217 }
218
219 // Filter the function profiles for the inlinees. It's useful for fuzzy
220 // profile matching which flattens the profile and inlinees' samples are
221 // merged into top-level function.
222 for (auto &Callees :
223 const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
224 auto &CalleesMap = Callees.second;
225 for (auto I = CalleesMap.begin(); I != CalleesMap.end();) {
226 auto FS = I++;
227 if (filterAmbiguousProfile(FS&: FS->second))
228 CalleesMap.erase(position: FS);
229 }
230 }
231 return false;
232}
233
234// For built-in local initialization function such as __cxx_global_var_init,
235// __tls_init prefix function, there could be multiple versions of the functions
236// in the final binary. However, in the profile generation, we call
237// getCanonicalFnName to canonicalize the names which strips the suffixes.
238// Therefore, samples from different functions queries the same profile and the
239// samples are merged. As the functions are essentially different, entries of
240// the merged profile are ambiguous. In sample loader, the IR from one version
241// would be attributed towards a merged entries, which is inaccurate. Especially
242// for fuzzy profile matching, it gets multiple callsites(from different
243// function) but used to match one callsite, which misleads the matching and
244// causes a lot of false positives report. Hence, we want to filter them out
245// from the profile map during the profile generation time. The profiles are all
246// cold functions, it won't have perf impact.
247void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
248 for (auto I = Profiles.begin(); I != Profiles.end();) {
249 auto FS = I++;
250 if (filterAmbiguousProfile(FS&: FS->second))
251 Profiles.erase(It: FS);
252 }
253}
254
255void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
256 const RangeSample &Ranges) {
257
258 /*
259 Regions may overlap with each other. Using the boundary info, find all
260 disjoint ranges and their sample count. BoundaryPoint contains the count
261 multiple samples begin/end at this points.
262
263 |<--100-->| Sample1
264 |<------200------>| Sample2
265 A B C
266
267 In the example above,
268 Sample1 begins at A, ends at B, its value is 100.
269 Sample2 beings at A, ends at C, its value is 200.
270 For A, BeginCount is the sum of sample begins at A, which is 300 and no
271 samples ends at A, so EndCount is 0.
272 Then boundary points A, B, and C with begin/end counts are:
273 A: (300, 0)
274 B: (0, 100)
275 C: (0, 200)
276 */
277 struct BoundaryPoint {
278 // Sum of sample counts beginning at this point
279 uint64_t BeginCount = UINT64_MAX;
280 // Sum of sample counts ending at this point
281 uint64_t EndCount = UINT64_MAX;
282 // Is the begin point of a zero range.
283 bool IsZeroRangeBegin = false;
284 // Is the end point of a zero range.
285 bool IsZeroRangeEnd = false;
286
287 void addBeginCount(uint64_t Count) {
288 if (BeginCount == UINT64_MAX)
289 BeginCount = 0;
290 BeginCount += Count;
291 }
292
293 void addEndCount(uint64_t Count) {
294 if (EndCount == UINT64_MAX)
295 EndCount = 0;
296 EndCount += Count;
297 }
298 };
299
300 /*
301 For the above example. With boundary points, follwing logic finds two
302 disjoint region of
303
304 [A,B]: 300
305 [B+1,C]: 200
306
307 If there is a boundary point that both begin and end, the point itself
308 becomes a separate disjoint region. For example, if we have original
309 ranges of
310
311 |<--- 100 --->|
312 |<--- 200 --->|
313 A B C
314
315 there are three boundary points with their begin/end counts of
316
317 A: (100, 0)
318 B: (200, 100)
319 C: (0, 200)
320
321 the disjoint ranges would be
322
323 [A, B-1]: 100
324 [B, B]: 300
325 [B+1, C]: 200.
326
327 Example for zero value range:
328
329 |<--- 100 --->|
330 |<--- 200 --->|
331 |<--------------- 0 ----------------->|
332 A B C D E F
333
334 [A, B-1] : 0
335 [B, C] : 100
336 [C+1, D-1]: 0
337 [D, E] : 200
338 [E+1, F] : 0
339 */
340 std::map<uint64_t, BoundaryPoint> Boundaries;
341
342 for (const auto &Item : Ranges) {
343 assert(Item.first.first <= Item.first.second &&
344 "Invalid instruction range");
345 auto &BeginPoint = Boundaries[Item.first.first];
346 auto &EndPoint = Boundaries[Item.first.second];
347 uint64_t Count = Item.second;
348
349 BeginPoint.addBeginCount(Count);
350 EndPoint.addEndCount(Count);
351 if (Count == 0) {
352 BeginPoint.IsZeroRangeBegin = true;
353 EndPoint.IsZeroRangeEnd = true;
354 }
355 }
356
357 // Use UINT64_MAX to indicate there is no existing range between BeginAddress
358 // and the next valid address
359 uint64_t BeginAddress = UINT64_MAX;
360 int ZeroRangeDepth = 0;
361 uint64_t Count = 0;
362 for (const auto &Item : Boundaries) {
363 uint64_t Address = Item.first;
364 const BoundaryPoint &Point = Item.second;
365 if (Point.BeginCount != UINT64_MAX) {
366 if (BeginAddress != UINT64_MAX)
367 DisjointRanges[{BeginAddress, Address - 1}] = Count;
368 Count += Point.BeginCount;
369 BeginAddress = Address;
370 ZeroRangeDepth += Point.IsZeroRangeBegin;
371 }
372 if (Point.EndCount != UINT64_MAX) {
373 assert((BeginAddress != UINT64_MAX) &&
374 "First boundary point cannot be 'end' point");
375 DisjointRanges[{BeginAddress, Address}] = Count;
376 assert(Count >= Point.EndCount && "Mismatched live ranges");
377 Count -= Point.EndCount;
378 BeginAddress = Address + 1;
379 ZeroRangeDepth -= Point.IsZeroRangeEnd;
380 // If the remaining count is zero and it's no longer in a zero range, this
381 // means we consume all the ranges before, thus mark BeginAddress as
382 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
383 // [<---- 10 ---->]
384 // [<---- 20 ---->]
385 // A B C D
386 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
387 // have the [B+1, C-1] zero range.
388 if (Count == 0 && ZeroRangeDepth == 0)
389 BeginAddress = UINT64_MAX;
390 }
391 }
392}
393
394void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
395 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
396 uint64_t Count) {
397 // Use the maximum count of samples with same line location
398 uint32_t Discriminator = getBaseDiscriminator(Discriminator: LeafLoc.Location.Discriminator);
399
400 // Use duplication factor to compensated for loop unroll/vectorization.
401 // Note that this is only needed when we're taking MAX of the counts at
402 // the location instead of SUM.
403 Count *= getDuplicationFactor(Discriminator: LeafLoc.Location.Discriminator);
404
405 ErrorOr<uint64_t> R =
406 FunctionProfile.findSamplesAt(LineOffset: LeafLoc.Location.LineOffset, Discriminator);
407
408 uint64_t PreviousCount = R ? R.get() : 0;
409 if (PreviousCount <= Count) {
410 FunctionProfile.addBodySamples(LineOffset: LeafLoc.Location.LineOffset, Discriminator,
411 Num: Count - PreviousCount);
412 }
413}
414
415void ProfileGeneratorBase::updateTotalSamples() {
416 for (auto &Item : ProfileMap) {
417 FunctionSamples &FunctionProfile = Item.second;
418 FunctionProfile.updateTotalSamples();
419 }
420}
421
422void ProfileGeneratorBase::updateCallsiteSamples() {
423 for (auto &Item : ProfileMap) {
424 FunctionSamples &FunctionProfile = Item.second;
425 FunctionProfile.updateCallsiteSamples();
426 }
427}
428
429void ProfileGeneratorBase::updateFunctionSamples() {
430 updateCallsiteSamples();
431
432 if (UpdateTotalSamples)
433 updateTotalSamples();
434}
435
436void ProfileGeneratorBase::collectProfiledFunctions() {
437 SmallPtrSet<const BinaryFunction *, 0> ProfiledFunctions;
438 if (collectFunctionsFromRawProfile(ProfiledFunctions))
439 Binary->setProfiledFunctions(ProfiledFunctions);
440 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
441 Binary->setProfiledFunctions(ProfiledFunctions);
442 else
443 llvm_unreachable("Unsupported input profile");
444}
445
446bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
447 SmallPtrSetImpl<const BinaryFunction *> &ProfiledFunctions) {
448 if (!SampleCounters)
449 return false;
450 // Go through all the stacks, ranges and branches in sample counters, use
451 // the start of the range to look up the function it belongs and record the
452 // function.
453 for (const auto &CI : *SampleCounters) {
454 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr())) {
455 for (auto StackAddr : CtxKey->Context) {
456 if (FuncRange *FRange = Binary->findFuncRange(Address: StackAddr))
457 ProfiledFunctions.insert(Ptr: FRange->Func);
458 }
459 }
460
461 for (auto Item : CI.second.RangeCounter) {
462 uint64_t StartAddress = Item.first.first;
463 if (FuncRange *FRange = Binary->findFuncRange(Address: StartAddress))
464 ProfiledFunctions.insert(Ptr: FRange->Func);
465 }
466
467 for (auto Item : CI.second.BranchCounter) {
468 uint64_t SourceAddress = Item.first.first;
469 uint64_t TargetAddress = Item.first.second;
470 if (FuncRange *FRange = Binary->findFuncRange(Address: SourceAddress))
471 ProfiledFunctions.insert(Ptr: FRange->Func);
472 if (FuncRange *FRange = Binary->findFuncRange(Address: TargetAddress))
473 ProfiledFunctions.insert(Ptr: FRange->Func);
474 }
475 }
476 return true;
477}
478
479bool ProfileGenerator::collectFunctionsFromLLVMProfile(
480 SmallPtrSetImpl<const BinaryFunction *> &ProfiledFunctions) {
481 for (const auto &FS : ProfileMap) {
482 if (auto *Func = Binary->getBinaryFunction(FName: FS.second.getFunction()))
483 ProfiledFunctions.insert(Ptr: Func);
484 }
485 return true;
486}
487
488bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
489 SmallPtrSetImpl<const BinaryFunction *> &ProfiledFunctions) {
490 for (auto *Node : ContextTracker) {
491 if (!Node->getFuncName().empty())
492 if (auto *Func = Binary->getBinaryFunction(FName: Node->getFuncName()))
493 ProfiledFunctions.insert(Ptr: Func);
494 }
495 return true;
496}
497
498FunctionSamples &
499ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) {
500 SampleContext Context(FuncName);
501 return ProfileMap.create(Ctx: Context);
502}
503
504void ProfileGenerator::generateProfile() {
505 NamedRegionTimer T("generate", "Generate profile", "profgen", "llvm-profgen",
506 TimeProfGen);
507 collectProfiledFunctions();
508
509 if (Binary->usePseudoProbes()) {
510 Binary->decodePseudoProbe();
511 if (LoadFunctionFromSymbol)
512 Binary->loadSymbolsFromPseudoProbe();
513 }
514
515 if (SampleCounters) {
516 if (Binary->usePseudoProbes()) {
517 generateProbeBasedProfile();
518 } else {
519 generateLineNumBasedProfile();
520 }
521 }
522
523 postProcessProfiles();
524}
525
526void ProfileGeneratorBase::markAllContextPreinlined(
527 SampleProfileMap &ProfileMap) {
528 for (auto &I : ProfileMap)
529 I.second.setContextAttribute(ContextShouldBeInlined);
530 FunctionSamples::ProfileIsPreInlined = true;
531}
532
533void ProfileGenerator::postProcessProfiles() {
534 computeSummaryAndThreshold(ProfileMap);
535 trimColdProfiles(ColdCntThreshold: ColdCountThreshold);
536 filterAmbiguousProfile(Profiles&: ProfileMap);
537 if (MarkAllContextPreinlined)
538 markAllContextPreinlined(ProfileMap);
539 calculateAndShowDensity(Profiles: ProfileMap);
540}
541
542void ProfileGenerator::trimColdProfiles(uint64_t ColdCntThreshold) {
543 if (!TrimColdProfile)
544 return;
545
546 // Move cold profiles into a tmp container.
547 std::vector<hash_code> ColdProfileHashes;
548 for (const auto &I : ProfileMap) {
549 if (I.second.getTotalSamples() < ColdCntThreshold)
550 ColdProfileHashes.emplace_back(args: I.first);
551 }
552
553 // Remove the cold profile from ProfileMap.
554 for (const auto &I : ColdProfileHashes)
555 ProfileMap.erase(Key: I);
556}
557
558void ProfileGenerator::generateLineNumBasedProfile() {
559 assert(SampleCounters->size() == 1 &&
560 "Must have one entry for profile generation.");
561 const SampleCounter &SC = SampleCounters->begin()->second;
562 // Fill in function body samples
563 populateBodySamplesForAllFunctions(RangeCounter: SC.RangeCounter);
564 // Fill in boundary sample counts as well as call site samples for calls
565 populateBoundarySamplesForAllFunctions(BranchCounters: SC.BranchCounter);
566 populateTypeSamplesForAllFunctions(DataAccessSamples: SC.DataAccessCounter);
567
568 updateFunctionSamples();
569}
570
571void ProfileGenerator::generateProbeBasedProfile() {
572 assert(SampleCounters->size() == 1 &&
573 "Must have one entry for profile generation.");
574 // Enable pseudo probe functionalities in SampleProf
575 FunctionSamples::ProfileIsProbeBased = true;
576 const SampleCounter &SC = SampleCounters->begin()->second;
577 // Fill in function body samples
578 populateBodySamplesWithProbesForAllFunctions(RangeCounter: SC.RangeCounter);
579 // Fill in boundary sample counts as well as call site samples for calls
580 populateBoundarySamplesWithProbesForAllFunctions(BranchCounters: SC.BranchCounter);
581
582 updateFunctionSamples();
583}
584
585void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
586 const RangeSample &RangeCounter) {
587 ProbeCounterMap ProbeCounter;
588 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
589 // inside extractProbesFromRange.
590 extractProbesFromRange(RangeCounter: preprocessRangeCounter(RangeCounter), ProbeCounter,
591 FindDisjointRanges: false);
592
593 for (const auto &PI : ProbeCounter) {
594 const MCDecodedPseudoProbe *Probe = PI.first;
595 uint64_t Count = PI.second;
596 SampleContextFrameVector FrameVec;
597 Binary->getInlineContextForProbe(Probe, InlineContextStack&: FrameVec, IncludeLeaf: true);
598 FunctionSamples &FunctionProfile =
599 getLeafProfileAndAddTotalSamples(FrameVec, Count);
600 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
601 Num: Count);
602 if (Probe->isEntry())
603 FunctionProfile.addHeadSamples(Num: Count);
604 }
605}
606
607void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
608 const BranchSample &BranchCounters) {
609 for (const auto &Entry : BranchCounters) {
610 uint64_t SourceAddress = Entry.first.first;
611 uint64_t TargetAddress = Entry.first.second;
612 uint64_t Count = Entry.second;
613 assert(Count != 0 && "Unexpected zero weight branch");
614
615 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
616 if (CalleeName.size() == 0)
617 continue;
618
619 const MCDecodedPseudoProbe *CallProbe =
620 Binary->getCallProbeForAddr(Address: SourceAddress);
621 if (CallProbe == nullptr)
622 continue;
623
624 // Record called target sample and its count.
625 SampleContextFrameVector FrameVec;
626 Binary->getInlineContextForProbe(Probe: CallProbe, InlineContextStack&: FrameVec, IncludeLeaf: true);
627
628 if (!FrameVec.empty()) {
629 FunctionSamples &FunctionProfile =
630 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
631 FunctionProfile.addCalledTargetSamples(
632 LineOffset: FrameVec.back().Location.LineOffset,
633 Discriminator: FrameVec.back().Location.Discriminator, Func: FunctionId(CalleeName),
634 Num: Count);
635 }
636 }
637}
638
639FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
640 const SampleContextFrameVector &FrameVec, uint64_t Count) {
641 // Get top level profile
642 FunctionSamples *FunctionProfile =
643 &getTopLevelFunctionProfile(FuncName: FrameVec[0].Func);
644 FunctionProfile->addTotalSamples(Num: Count);
645 if (Binary->usePseudoProbes()) {
646 const auto *FuncDesc = Binary->getFuncDescForGUID(
647 GUID: FunctionProfile->getFunction().getHashCode());
648 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
649 }
650
651 for (size_t I = 1; I < FrameVec.size(); I++) {
652 LineLocation Callsite(
653 FrameVec[I - 1].Location.LineOffset,
654 getBaseDiscriminator(Discriminator: FrameVec[I - 1].Location.Discriminator));
655 FunctionSamplesMap &SamplesMap =
656 FunctionProfile->functionSamplesAt(Loc: Callsite);
657 auto Ret = SamplesMap.emplace(args: FrameVec[I].Func, args: FunctionSamples());
658 if (Ret.second) {
659 SampleContext Context(FrameVec[I].Func);
660 Ret.first->second.setContext(Context);
661 }
662 FunctionProfile = &Ret.first->second;
663 FunctionProfile->addTotalSamples(Num: Count);
664 if (Binary->usePseudoProbes()) {
665 const auto *FuncDesc = Binary->getFuncDescForGUID(
666 GUID: FunctionProfile->getFunction().getHashCode());
667 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
668 }
669 }
670
671 return *FunctionProfile;
672}
673
674RangeSample
675ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
676 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
677 if (FillZeroForAllFuncs) {
678 for (auto &FuncI : Binary->getAllBinaryFunctions()) {
679 for (auto &R : FuncI.second.Ranges) {
680 Ranges[{R.first, R.second - 1}] += 0;
681 }
682 }
683 } else {
684 // For each range, we search for all ranges of the function it belongs to
685 // and initialize it with zero count, so it remains zero if doesn't hit any
686 // samples. This is to be consistent with compiler that interpret zero count
687 // as unexecuted(cold).
688 for (const auto &I : RangeCounter) {
689 uint64_t StartAddress = I.first.first;
690 for (const auto &Range : Binary->getRanges(Address: StartAddress))
691 Ranges[{Range.first, Range.second - 1}] += 0;
692 }
693 }
694 RangeSample DisjointRanges;
695 findDisjointRanges(DisjointRanges, Ranges);
696 return DisjointRanges;
697}
698
699void ProfileGenerator::populateBodySamplesForAllFunctions(
700 const RangeSample &RangeCounter) {
701 for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
702 uint64_t RangeBegin = Range.first.first;
703 uint64_t RangeEnd = Range.first.second;
704 uint64_t Count = Range.second;
705
706 InstructionPointer IP(Binary, RangeBegin, true);
707 // Disjoint ranges may have range in the middle of two instr,
708 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
709 // can be Addr1+1 to Addr2-1. We should ignore such range.
710 if (IP.Address > RangeEnd)
711 continue;
712
713 do {
714 const SampleContextFrameVector FrameVec =
715 Binary->getFrameLocationStack(Address: IP.Address);
716 if (!FrameVec.empty()) {
717 // FIXME: As accumulating total count per instruction caused some
718 // regression, we changed to accumulate total count per byte as a
719 // workaround. Tuning hotness threshold on the compiler side might be
720 // necessary in the future.
721 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
722 FrameVec, Count: Count * Binary->getInstSize(Address: IP.Address));
723 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: FrameVec.back(),
724 Count);
725 }
726 } while (IP.advance() && IP.Address <= RangeEnd);
727 }
728}
729
730StringRef
731ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
732 // Get the function range by branch target if it's a call branch.
733 auto *FRange = Binary->findFuncRangeForStartAddr(Address: TargetAddress);
734
735 // We won't accumulate sample count for a range whose start is not the real
736 // function entry such as outlined function or inner labels.
737 if (!FRange || !FRange->IsFuncEntry)
738 return StringRef();
739
740 // DWARF and symbol table may have mismatching function names. Instead, we'll
741 // try to use its pseudo probe name first.
742 if (Binary->usePseudoProbes()) {
743 auto FuncName = Binary->findPseudoProbeName(Func: FRange->Func);
744 if (FuncName.size())
745 return FunctionSamples::getCanonicalFnName(FnName: FuncName);
746 }
747
748 return FunctionSamples::getCanonicalFnName(FnName: FRange->getFuncName());
749}
750
751void ProfileGenerator::populateBoundarySamplesForAllFunctions(
752 const BranchSample &BranchCounters) {
753 for (const auto &Entry : BranchCounters) {
754 uint64_t SourceAddress = Entry.first.first;
755 uint64_t TargetAddress = Entry.first.second;
756 uint64_t Count = Entry.second;
757 assert(Count != 0 && "Unexpected zero weight branch");
758
759 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
760 if (CalleeName.size() == 0)
761 continue;
762 // Record called target sample and its count.
763 const SampleContextFrameVector &FrameVec =
764 Binary->getCachedFrameLocationStack(Address: SourceAddress);
765 if (!FrameVec.empty()) {
766 FunctionSamples &FunctionProfile =
767 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
768 FunctionProfile.addCalledTargetSamples(
769 LineOffset: FrameVec.back().Location.LineOffset,
770 Discriminator: getBaseDiscriminator(Discriminator: FrameVec.back().Location.Discriminator),
771 Func: FunctionId(CalleeName), Num: Count);
772 }
773 // Add head samples for callee.
774 FunctionSamples &CalleeProfile =
775 getTopLevelFunctionProfile(FuncName: FunctionId(CalleeName));
776 CalleeProfile.addHeadSamples(Num: Count);
777 }
778}
779
780void ProfileGenerator::populateTypeSamplesForAllFunctions(
781 const DataAccessSample &DataAccessSamples) {
782 // For each instruction with vtable accesses, get its symbolized inline
783 // stack, and add the vtable counters to the function samples.
784 for (const auto &[IpData, Count] : DataAccessSamples) {
785 uint64_t InstAddr = IpData.first;
786 const SampleContextFrameVector &FrameVec =
787 Binary->getCachedFrameLocationStack(Address: InstAddr,
788 /* UseProbeDiscriminator= */ false);
789 if (!FrameVec.empty()) {
790 FunctionSamples &FunctionProfile =
791 getLeafProfileAndAddTotalSamples(FrameVec, /* Count= */ 0);
792 LineLocation Loc(
793 FrameVec.back().Location.LineOffset,
794 getBaseDiscriminator(Discriminator: FrameVec.back().Location.Discriminator));
795 FunctionProfile.addTypeSamplesAt(Loc, Type: FunctionId(IpData.second), Count);
796 }
797 }
798}
799
800void ProfileGeneratorBase::calculateBodySamplesAndSize(
801 const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
802 uint64_t &FuncBodySize) {
803 // Note that ideally the size should be the number of function instruction.
804 // However, for probe-based profile, we don't have the accurate instruction
805 // count for each probe, instead, the probe sample is the samples count for
806 // the block, which is equivelant to
807 // total_instruction_samples/num_of_instruction in one block. Hence, we use
808 // the number of probe as a proxy for the function's size.
809 FuncBodySize += FSamples.getBodySamples().size();
810
811 // The accumulated body samples re-calculated here could be different from the
812 // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
813 // based profile. The reason is that TotalSamples is the sum of all the
814 // samples of the machine instruction in one source-code line, however, the
815 // entry of Bodysamples is the only max number of them, so the TotalSamples is
816 // usually much bigger than the accumulated body samples as one souce-code
817 // line can emit many machine instructions. We observed a regression when we
818 // switched to use the accumulated body samples(by using
819 // -update-total-samples). Hence, it's safer to re-calculate here to avoid
820 // such discrepancy. There is no problem for probe-based profile, as the
821 // TotalSamples is exactly the same as the accumulated body samples.
822 for (const auto &I : FSamples.getBodySamples())
823 TotalBodySamples += I.second.getSamples();
824
825 for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
826 for (const auto &Callee : CallsiteSamples.second) {
827 // For binary-level density, the inlinees' samples and size should be
828 // included in the calculation.
829 calculateBodySamplesAndSize(FSamples: Callee.second, TotalBodySamples,
830 FuncBodySize);
831 }
832}
833
834// Calculate Profile-density:
835// Calculate the density for each function and sort them in descending order,
836// keep accumulating their total samples unitl it exceeds the
837// percentage_threshold(cut-off) of total profile samples, the profile-density
838// is the last(minimum) function-density of the processed functions, which means
839// all the functions hot to perf are on good density if the profile-density is
840// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
841// depending on how much regression the system want to tolerate.
842double
843ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
844 double ProfileDensity = 0.0;
845
846 uint64_t TotalProfileSamples = 0;
847 // A list of the function profile density and its total samples.
848 std::vector<std::pair<double, uint64_t>> FuncDensityList;
849 for (const auto &I : Profiles) {
850 uint64_t TotalBodySamples = 0;
851 uint64_t FuncBodySize = 0;
852 calculateBodySamplesAndSize(FSamples: I.second, TotalBodySamples, FuncBodySize);
853
854 if (FuncBodySize == 0)
855 continue;
856
857 double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
858 TotalProfileSamples += TotalBodySamples;
859 FuncDensityList.emplace_back(args&: FuncDensity, args&: TotalBodySamples);
860 }
861
862 // Sorted by the density in descending order.
863 llvm::stable_sort(Range&: FuncDensityList, C: [&](const std::pair<double, uint64_t> &A,
864 const std::pair<double, uint64_t> &B) {
865 if (A.first != B.first)
866 return A.first > B.first;
867 return A.second < B.second;
868 });
869
870 uint64_t AccumulatedSamples = 0;
871 uint32_t I = 0;
872 assert(ProfileDensityCutOffHot <= 1000000 &&
873 "The cutoff value is greater than 1000000(100%)");
874 while (AccumulatedSamples < TotalProfileSamples *
875 static_cast<float>(ProfileDensityCutOffHot) /
876 1000000 &&
877 I < FuncDensityList.size()) {
878 AccumulatedSamples += FuncDensityList[I].second;
879 ProfileDensity = FuncDensityList[I].first;
880 I++;
881 }
882
883 return ProfileDensity;
884}
885
886void ProfileGeneratorBase::calculateAndShowDensity(
887 const SampleProfileMap &Profiles) {
888 double Density = calculateDensity(Profiles);
889 showDensitySuggestion(Density);
890}
891
892FunctionSamples *
893CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
894 bool WasLeafInlined) {
895 FunctionSamples *FProfile = ContextNode->getFunctionSamples();
896 if (!FProfile) {
897 FSamplesList.emplace_back();
898 FProfile = &FSamplesList.back();
899 FProfile->setFunction(ContextNode->getFuncName());
900 ContextNode->setFunctionSamples(FProfile);
901 }
902 // Update ContextWasInlined attribute for existing contexts.
903 // The current function can be called in two ways:
904 // - when processing a probe of the current frame
905 // - when processing the entry probe of an inlinee's frame, which
906 // is then used to update the callsite count of the current frame.
907 // The two can happen in any order, hence here we are making sure
908 // `ContextWasInlined` is always set as expected.
909 // TODO: Note that the former does not always happen if no probes of the
910 // current frame has samples, and if the latter happens, we could lose the
911 // attribute. This should be fixed.
912 if (WasLeafInlined)
913 FProfile->getContext().setAttribute(ContextWasInlined);
914 return FProfile;
915}
916
917ContextTrieNode *
918CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context,
919 bool WasLeafInlined) {
920 ContextTrieNode *ContextNode =
921 ContextTracker.getOrCreateContextPath(Context, AllowCreate: true);
922 getOrCreateFunctionSamples(ContextNode, WasLeafInlined);
923 return ContextNode;
924}
925
926void CSProfileGenerator::generateProfile() {
927 NamedRegionTimer T("generate", "Generate CS profile", "profgen",
928 "llvm-profgen", TimeProfGen);
929 FunctionSamples::ProfileIsCS = true;
930
931 collectProfiledFunctions();
932
933 if (Binary->usePseudoProbes()) {
934 Binary->decodePseudoProbe();
935 if (InferMissingFrames)
936 initializeMissingFrameInferrer();
937 if (LoadFunctionFromSymbol)
938 Binary->loadSymbolsFromPseudoProbe();
939 }
940
941 if (SampleCounters) {
942 if (Binary->usePseudoProbes()) {
943 generateProbeBasedProfile();
944 } else {
945 generateLineNumBasedProfile();
946 }
947 }
948
949 if (Binary->getTrackFuncContextSize())
950 computeSizeForProfiledFunctions();
951
952 postProcessProfiles();
953}
954
955void CSProfileGenerator::initializeMissingFrameInferrer() {
956 Binary->getMissingContextInferrer()->initialize(SampleCounters);
957}
958
959void CSProfileGenerator::inferMissingFrames(
960 const SmallVectorImpl<uint64_t> &Context,
961 SmallVectorImpl<uint64_t> &NewContext) {
962 Binary->inferMissingFrames(Context, NewContext);
963}
964
965void CSProfileGenerator::computeSizeForProfiledFunctions() {
966 for (auto *Func : Binary->getProfiledFunctions())
967 Binary->computeInlinedContextSizeForFunc(Func);
968
969 // Flush the symbolizer to save memory.
970 Binary->flushSymbolizer();
971}
972
973void CSProfileGenerator::updateFunctionSamples() {
974 for (auto *Node : ContextTracker) {
975 FunctionSamples *FSamples = Node->getFunctionSamples();
976 if (FSamples) {
977 if (UpdateTotalSamples)
978 FSamples->updateTotalSamples();
979 FSamples->updateCallsiteSamples();
980 }
981 }
982}
983
984void CSProfileGenerator::generateLineNumBasedProfile() {
985 for (const auto &CI : *SampleCounters) {
986 const auto *CtxKey = cast<StringBasedCtxKey>(Val: CI.first.getPtr());
987
988 ContextTrieNode *ContextNode = &getRootContext();
989 // Sample context will be empty if the jump is an external-to-internal call
990 // pattern, the head samples should be added for the internal function.
991 if (!CtxKey->Context.empty()) {
992 // Get or create function profile for the range
993 ContextNode =
994 getOrCreateContextNode(Context: CtxKey->Context, WasLeafInlined: CtxKey->WasLeafInlined);
995 // Fill in function body samples
996 populateBodySamplesForFunction(FunctionProfile&: *ContextNode->getFunctionSamples(),
997 RangeCounters: CI.second.RangeCounter);
998 }
999 // Fill in boundary sample counts as well as call site samples for calls
1000 populateBoundarySamplesForFunction(CallerNode: ContextNode, BranchCounters: CI.second.BranchCounter);
1001 }
1002 // Fill in call site value sample for inlined calls and also use context to
1003 // infer missing samples. Since we don't have call count for inlined
1004 // functions, we estimate it from inlinee's profile using the entry of the
1005 // body sample.
1006 populateInferredFunctionSamples(Node&: getRootContext());
1007
1008 updateFunctionSamples();
1009}
1010
1011void CSProfileGenerator::populateBodySamplesForFunction(
1012 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
1013 // Compute disjoint ranges first, so we can use MAX
1014 // for calculating count for each location.
1015 RangeSample Ranges;
1016 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
1017 for (const auto &Range : Ranges) {
1018 uint64_t RangeBegin = Range.first.first;
1019 uint64_t RangeEnd = Range.first.second;
1020 uint64_t Count = Range.second;
1021 // Disjoint ranges have introduce zero-filled gap that
1022 // doesn't belong to current context, filter them out.
1023 if (Count == 0)
1024 continue;
1025
1026 InstructionPointer IP(Binary, RangeBegin, true);
1027 // Disjoint ranges may have range in the middle of two instr,
1028 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1029 // can be Addr1+1 to Addr2-1. We should ignore such range.
1030 if (IP.Address > RangeEnd)
1031 continue;
1032
1033 do {
1034 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: IP.Address);
1035 if (LeafLoc) {
1036 // Recording body sample for this specific context
1037 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: *LeafLoc, Count);
1038 FunctionProfile.addTotalSamples(Num: Count);
1039 }
1040 } while (IP.advance() && IP.Address <= RangeEnd);
1041 }
1042}
1043
1044void CSProfileGenerator::populateBoundarySamplesForFunction(
1045 ContextTrieNode *Node, const BranchSample &BranchCounters) {
1046
1047 for (const auto &Entry : BranchCounters) {
1048 uint64_t SourceAddress = Entry.first.first;
1049 uint64_t TargetAddress = Entry.first.second;
1050 uint64_t Count = Entry.second;
1051 assert(Count != 0 && "Unexpected zero weight branch");
1052
1053 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1054 if (CalleeName.size() == 0)
1055 continue;
1056
1057 ContextTrieNode *CallerNode = Node;
1058 LineLocation CalleeCallSite(0, 0);
1059 if (CallerNode != &getRootContext()) {
1060 // Record called target sample and its count
1061 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: SourceAddress);
1062 if (LeafLoc) {
1063 CallerNode->getFunctionSamples()->addCalledTargetSamples(
1064 LineOffset: LeafLoc->Location.LineOffset,
1065 Discriminator: getBaseDiscriminator(Discriminator: LeafLoc->Location.Discriminator),
1066 Func: FunctionId(CalleeName),
1067 Num: Count);
1068 // Record head sample for called target(callee)
1069 CalleeCallSite = LeafLoc->Location;
1070 }
1071 }
1072
1073 ContextTrieNode *CalleeNode =
1074 CallerNode->getOrCreateChildContext(CallSite: CalleeCallSite,
1075 ChildName: FunctionId(CalleeName));
1076 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(ContextNode: CalleeNode);
1077 CalleeProfile->addHeadSamples(Num: Count);
1078 }
1079}
1080
1081void CSProfileGenerator::populateInferredFunctionSamples(
1082 ContextTrieNode &Node) {
1083 // There is no call jmp sample between the inliner and inlinee, we need to use
1084 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s
1085 // sample depends on child(inlinee)'s sample, so traverse the tree in
1086 // post-order.
1087 for (auto &It : Node.getAllChildContext())
1088 populateInferredFunctionSamples(Node&: It.second);
1089
1090 FunctionSamples *CalleeProfile = Node.getFunctionSamples();
1091 if (!CalleeProfile)
1092 return;
1093 // If we already have head sample counts, we must have value profile
1094 // for call sites added already. Skip to avoid double counting.
1095 if (CalleeProfile->getHeadSamples())
1096 return;
1097 ContextTrieNode *CallerNode = Node.getParentContext();
1098 // If we don't have context, nothing to do for caller's call site.
1099 // This could happen for entry point function.
1100 if (CallerNode == &getRootContext())
1101 return;
1102
1103 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc();
1104 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(ContextNode: CallerNode);
1105 // Since we don't have call count for inlined functions, we
1106 // estimate it from inlinee's profile using entry body sample.
1107 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate();
1108 // If we don't have samples with location, use 1 to indicate live.
1109 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size())
1110 EstimatedCallCount = 1;
1111 CallerProfile.addCalledTargetSamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1112 Discriminator: CallerLeafFrameLoc.Discriminator,
1113 Func: Node.getFuncName(), Num: EstimatedCallCount);
1114 CallerProfile.addBodySamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1115 Discriminator: CallerLeafFrameLoc.Discriminator,
1116 Num: EstimatedCallCount);
1117 CallerProfile.addTotalSamples(Num: EstimatedCallCount);
1118}
1119
1120void CSProfileGenerator::convertToProfileMap(
1121 ContextTrieNode &Node, SampleContextFrameVector &Context) {
1122 FunctionSamples *FProfile = Node.getFunctionSamples();
1123 if (FProfile) {
1124 Context.emplace_back(Args: Node.getFuncName(), Args: LineLocation(0, 0));
1125 // Save the new context for future references.
1126 SampleContextFrames NewContext = *Contexts.insert(x: Context).first;
1127 auto Ret = ProfileMap.emplace(Args&: NewContext, Args: std::move(*FProfile));
1128 FunctionSamples &NewProfile = Ret.first->second;
1129 NewProfile.getContext().setContext(Context: NewContext);
1130 Context.pop_back();
1131 }
1132
1133 for (auto &It : Node.getAllChildContext()) {
1134 ContextTrieNode &ChildNode = It.second;
1135 Context.emplace_back(Args: Node.getFuncName(), Args: ChildNode.getCallSiteLoc());
1136 convertToProfileMap(Node&: ChildNode, Context);
1137 Context.pop_back();
1138 }
1139}
1140
1141void CSProfileGenerator::convertToProfileMap() {
1142 assert(ProfileMap.empty() &&
1143 "ProfileMap should be empty before converting from the trie");
1144 assert(IsProfileValidOnTrie &&
1145 "Do not convert the trie twice, it's already destroyed");
1146
1147 SampleContextFrameVector Context;
1148 for (auto &It : getRootContext().getAllChildContext())
1149 convertToProfileMap(Node&: It.second, Context);
1150
1151 IsProfileValidOnTrie = false;
1152}
1153
1154void CSProfileGenerator::postProcessProfiles() {
1155 // Compute hot/cold threshold based on profile. This will be used for cold
1156 // context profile merging/trimming.
1157 computeSummaryAndThreshold();
1158
1159 // Run global pre-inliner to adjust/merge context profile based on estimated
1160 // inline decisions.
1161 if (EnableCSPreInliner) {
1162 ContextTracker.populateFuncToCtxtMap();
1163 CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
1164 // Turn off the profile merger by default unless it is explicitly enabled.
1165 if (!CSProfMergeColdContext.getNumOccurrences())
1166 CSProfMergeColdContext = false;
1167 }
1168
1169 convertToProfileMap();
1170
1171 // Trim and merge cold context profile using cold threshold above.
1172 if (TrimColdProfile || CSProfMergeColdContext) {
1173 SampleContextTrimmer(ProfileMap)
1174 .trimAndMergeColdContextProfiles(
1175 ColdCountThreshold: HotCountThreshold, TrimColdContext: TrimColdProfile, MergeColdContext: CSProfMergeColdContext,
1176 ColdContextFrameLength: CSProfMaxColdContextDepth, TrimBaseProfileOnly: EnableCSPreInliner);
1177 }
1178
1179 if (GenCSNestedProfile) {
1180 ProfileConverter CSConverter(ProfileMap);
1181 CSConverter.convertCSProfiles();
1182 FunctionSamples::ProfileIsCS = false;
1183 }
1184 filterAmbiguousProfile(Profiles&: ProfileMap);
1185 if (MarkAllContextPreinlined)
1186 markAllContextPreinlined(ProfileMap);
1187 ProfileGeneratorBase::calculateAndShowDensity(Profiles: ProfileMap);
1188}
1189
1190void ProfileGeneratorBase::computeSummaryAndThreshold(
1191 SampleProfileMap &Profiles) {
1192 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1193 Summary = Builder.computeSummaryForProfiles(Profiles);
1194 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
1195 DS: (Summary->getDetailedSummary()));
1196 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1197 DS: (Summary->getDetailedSummary()));
1198}
1199
1200void CSProfileGenerator::computeSummaryAndThreshold() {
1201 // Always merge and use context-less profile map to compute summary.
1202 SampleProfileMap ContextLessProfiles;
1203 ContextTracker.createContextLessProfileMap(ContextLessProfiles);
1204
1205 // Set the flag below to avoid merging the profile again in
1206 // computeSummaryAndThreshold
1207 FunctionSamples::ProfileIsCS = false;
1208 assert(
1209 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
1210 "Don't set --profile-summary-contextless to false for profile "
1211 "generation");
1212 ProfileGeneratorBase::computeSummaryAndThreshold(Profiles&: ContextLessProfiles);
1213 // Recover the old value.
1214 FunctionSamples::ProfileIsCS = true;
1215}
1216
1217void ProfileGeneratorBase::extractProbesFromRange(
1218 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
1219 bool FindDisjointRanges) {
1220 const RangeSample *PRanges = &RangeCounter;
1221 RangeSample Ranges;
1222 if (FindDisjointRanges) {
1223 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
1224 PRanges = &Ranges;
1225 }
1226
1227 for (const auto &Range : *PRanges) {
1228 uint64_t RangeBegin = Range.first.first;
1229 uint64_t RangeEnd = Range.first.second;
1230 uint64_t Count = Range.second;
1231
1232 InstructionPointer IP(Binary, RangeBegin, true);
1233 // Disjoint ranges may have range in the middle of two instr,
1234 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1235 // can be Addr1+1 to Addr2-1. We should ignore such range.
1236 if (IP.Address > RangeEnd)
1237 continue;
1238
1239 do {
1240 const AddressProbesMap &Address2ProbesMap =
1241 Binary->getAddress2ProbesMap();
1242 for (const MCDecodedPseudoProbe &Probe :
1243 Address2ProbesMap.find(Address: IP.Address)) {
1244 ProbeCounter[&Probe] += Count;
1245 }
1246 } while (IP.advance() && IP.Address <= RangeEnd);
1247 }
1248}
1249
1250static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1251 const SmallVectorImpl<uint64_t> &AddrVec,
1252 ProfiledBinary *Binary) {
1253 SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
1254 for (auto Address : reverse(C: AddrVec)) {
1255 const MCDecodedPseudoProbe *CallProbe =
1256 Binary->getCallProbeForAddr(Address);
1257 // These could be the cases when a probe is not found at a calliste. Cutting
1258 // off the context from here since the inliner will not know how to consume
1259 // a context with unknown callsites.
1260 // 1. for functions that are not sampled when
1261 // --decode-probe-for-profiled-functions-only is on.
1262 // 2. for a merged callsite. Callsite merging may cause the loss of original
1263 // probe IDs.
1264 // 3. for an external callsite.
1265 if (!CallProbe)
1266 break;
1267 Probes.push_back(Elt: CallProbe);
1268 }
1269
1270 std::reverse(first: Probes.begin(), last: Probes.end());
1271
1272 // Extract context stack for reusing, leaf context stack will be added
1273 // compressed while looking up function profile.
1274 for (const auto *P : Probes) {
1275 Binary->getInlineContextForProbe(Probe: P, InlineContextStack&: ContextStack, IncludeLeaf: true);
1276 }
1277}
1278
1279void CSProfileGenerator::generateProbeBasedProfile() {
1280 // Enable pseudo probe functionalities in SampleProf
1281 FunctionSamples::ProfileIsProbeBased = true;
1282 for (const auto &CI : *SampleCounters) {
1283 const AddrBasedCtxKey *CtxKey =
1284 dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr());
1285 // Fill in function body samples from probes, also infer caller's samples
1286 // from callee's probe
1287 populateBodySamplesWithProbes(RangeCounter: CI.second.RangeCounter, CtxKey);
1288 // Fill in boundary samples for a call probe
1289 populateBoundarySamplesWithProbes(BranchCounter: CI.second.BranchCounter, CtxKey);
1290 }
1291}
1292
1293void CSProfileGenerator::populateBodySamplesWithProbes(
1294 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
1295 ProbeCounterMap ProbeCounter;
1296 // Extract the top frame probes by looking up each address among the range in
1297 // the Address2ProbeMap
1298 extractProbesFromRange(RangeCounter, ProbeCounter);
1299 DenseMap<MCDecodedPseudoProbeInlineTree *, SmallPtrSet<FunctionSamples *, 0>>
1300 FrameSamples;
1301 for (const auto &PI : ProbeCounter) {
1302 const MCDecodedPseudoProbe *Probe = PI.first;
1303 uint64_t Count = PI.second;
1304 // Disjoint ranges have introduce zero-filled gap that
1305 // doesn't belong to current context, filter them out.
1306 if (!Probe->isBlock() || Count == 0)
1307 continue;
1308
1309 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, LeafProbe: Probe);
1310 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1311 // Record the current frame and FunctionProfile whenever samples are
1312 // collected for non-danglie probes. This is for reporting all of the
1313 // zero count probes of the frame later.
1314 FrameSamples[Probe->getInlineTreeNode()].insert(Ptr: &FunctionProfile);
1315 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
1316 Num: Count);
1317 FunctionProfile.addTotalSamples(Num: Count);
1318 if (Probe->isEntry()) {
1319 FunctionProfile.addHeadSamples(Num: Count);
1320 // Look up for the caller's function profile
1321 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1322 ContextTrieNode *CallerNode = ContextNode->getParentContext();
1323 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
1324 // Since the context id will be compressed, we have to use callee's
1325 // context id to infer caller's context id to ensure they share the
1326 // same context prefix.
1327 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
1328 uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator;
1329 assert(CallerIndex &&
1330 "Inferred caller's location index shouldn't be zero!");
1331 assert(!CallerDiscriminator &&
1332 "Callsite probe should not have a discriminator!");
1333 FunctionSamples &CallerProfile =
1334 *getOrCreateFunctionSamples(ContextNode: CallerNode);
1335 CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1336 CallerProfile.addBodySamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator, Num: Count);
1337 CallerProfile.addTotalSamples(Num: Count);
1338 CallerProfile.addCalledTargetSamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator,
1339 Func: ContextNode->getFuncName(), Num: Count);
1340 }
1341 }
1342 }
1343
1344 // Assign zero count for remaining probes without sample hits to
1345 // differentiate from probes optimized away, of which the counts are unknown
1346 // and will be inferred by the compiler.
1347 for (auto &I : FrameSamples) {
1348 for (auto *FunctionProfile : I.second) {
1349 for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) {
1350 FunctionProfile->addBodySamples(LineOffset: Probe.getIndex(),
1351 Discriminator: Probe.getDiscriminator(), Num: 0);
1352 }
1353 }
1354 }
1355}
1356
1357void CSProfileGenerator::populateBoundarySamplesWithProbes(
1358 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) {
1359 for (const auto &BI : BranchCounter) {
1360 uint64_t SourceAddress = BI.first.first;
1361 uint64_t TargetAddress = BI.first.second;
1362 uint64_t Count = BI.second;
1363 const MCDecodedPseudoProbe *CallProbe =
1364 Binary->getCallProbeForAddr(Address: SourceAddress);
1365 if (CallProbe == nullptr)
1366 continue;
1367 FunctionSamples &FunctionProfile =
1368 getFunctionProfileForLeafProbe(CtxKey, LeafProbe: CallProbe);
1369 FunctionProfile.addBodySamples(LineOffset: CallProbe->getIndex(), Discriminator: 0, Num: Count);
1370 FunctionProfile.addTotalSamples(Num: Count);
1371 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1372 if (CalleeName.size() == 0)
1373 continue;
1374 FunctionProfile.addCalledTargetSamples(LineOffset: CallProbe->getIndex(),
1375 Discriminator: CallProbe->getDiscriminator(),
1376 Func: FunctionId(CalleeName), Num: Count);
1377 }
1378}
1379
1380ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe(
1381 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1382
1383 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context;
1384 SmallVector<uint64_t, 16> NewContext;
1385
1386 if (InferMissingFrames) {
1387 SmallVector<uint64_t, 16> Context = CtxKey->Context;
1388 // Append leaf frame for a complete inference.
1389 Context.push_back(Elt: LeafProbe->getAddress());
1390 inferMissingFrames(Context, NewContext);
1391 // Pop out the leaf probe that was pushed in above.
1392 NewContext.pop_back();
1393 PContext = &NewContext;
1394 }
1395
1396 SampleContextFrameVector ContextStack;
1397 extractPrefixContextStack(ContextStack, AddrVec: *PContext, Binary);
1398
1399 // Explicitly copy the context for appending the leaf context
1400 SampleContextFrameVector NewContextStack(ContextStack.begin(),
1401 ContextStack.end());
1402 Binary->getInlineContextForProbe(Probe: LeafProbe, InlineContextStack&: NewContextStack, IncludeLeaf: true);
1403 // For leaf inlined context with the top frame, we should strip off the top
1404 // frame's probe id, like:
1405 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1406 auto LeafFrame = NewContextStack.back();
1407 LeafFrame.Location = LineLocation(0, 0);
1408 NewContextStack.pop_back();
1409 // Compress the context string except for the leaf frame
1410 CSProfileGenerator::compressRecursionContext(Context&: NewContextStack);
1411 CSProfileGenerator::trimContext(S&: NewContextStack);
1412 NewContextStack.push_back(Elt: LeafFrame);
1413
1414 const auto *FuncDesc = Binary->getFuncDescForGUID(GUID: LeafProbe->getGuid());
1415 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1416 ContextTrieNode *ContextNode =
1417 getOrCreateContextNode(Context: NewContextStack, WasLeafInlined);
1418 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash);
1419 return ContextNode;
1420}
1421
1422FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1423 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1424 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples();
1425}
1426
1427} // end namespace sampleprof
1428} // end namespace llvm
1429