1//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8#include "ProfileGenerator.h"
9#include "ErrorHandling.h"
10#include "MissingFrameInferrer.h"
11#include "PerfReader.h"
12#include "ProfiledBinary.h"
13#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14#include "llvm/ProfileData/ProfileCommon.h"
15#include <algorithm>
16#include <float.h>
17#include <unordered_set>
18#include <utility>
19
20cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
21 cl::Required,
22 cl::desc("Output profile file"));
23static cl::alias OutputA("o", cl::desc("Alias for --output"),
24 cl::aliasopt(OutputFilename));
25
26static cl::opt<SampleProfileFormat> OutputFormat(
27 "format", cl::desc("Format of output profile"), cl::init(Val: SPF_Ext_Binary),
28 cl::values(
29 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
30 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"),
31 clEnumValN(SPF_Text, "text", "Text encoding"),
32 clEnumValN(SPF_GCC, "gcc",
33 "GCC encoding (only meaningful for -sample)")));
34
35static cl::opt<bool> UseMD5(
36 "use-md5", cl::Hidden,
37 cl::desc("Use md5 to represent function names in the output profile (only "
38 "meaningful for -extbinary)"));
39
40static cl::opt<bool> PopulateProfileSymbolList(
41 "populate-profile-symbol-list", cl::init(Val: false), cl::Hidden,
42 cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
43
44static cl::opt<bool> FillZeroForAllFuncs(
45 "fill-zero-for-all-funcs", cl::init(Val: false), cl::Hidden,
46 cl::desc("Attribute all functions' range with zero count "
47 "even it's not hit by any samples."));
48
49static cl::opt<int32_t, true> RecursionCompression(
50 "compress-recursion",
51 cl::desc("Compressing recursion by deduplicating adjacent frame "
52 "sequences up to the specified size. -1 means no size limit."),
53 cl::Hidden,
54 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
55
56static cl::opt<bool>
57 TrimColdProfile("trim-cold-profile",
58 cl::desc("If the total count of the profile is smaller "
59 "than threshold, it will be trimmed."));
60
61static cl::opt<bool> CSProfMergeColdContext(
62 "csprof-merge-cold-context", cl::init(Val: true),
63 cl::desc("If the total count of context profile is smaller than "
64 "the threshold, it will be merged into context-less base "
65 "profile."));
66
67static cl::opt<uint32_t> CSProfMaxColdContextDepth(
68 "csprof-max-cold-context-depth", cl::init(Val: 1),
69 cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
70 "context-less base profile"));
71
72static cl::opt<int, true> CSProfMaxContextDepth(
73 "csprof-max-context-depth",
74 cl::desc("Keep the last K contexts while merging profile. -1 means no "
75 "depth limit."),
76 cl::location(L&: llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
77
78static cl::opt<double> ProfileDensityThreshold(
79 "profile-density-threshold", llvm::cl::init(Val: 50),
80 llvm::cl::desc("If the profile density is below the given threshold, it "
81 "will be suggested to increase the sampling rate."),
82 llvm::cl::Optional);
83static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(Val: false),
84 llvm::cl::desc("show profile density details"),
85 llvm::cl::Optional);
86static cl::opt<int> ProfileDensityCutOffHot(
87 "profile-density-cutoff-hot", llvm::cl::init(Val: 990000),
88 llvm::cl::desc("Total samples cutoff for functions used to calculate "
89 "profile density."));
90
91static cl::opt<bool> UpdateTotalSamples(
92 "update-total-samples", llvm::cl::init(Val: false),
93 llvm::cl::desc(
94 "Update total samples by accumulating all its body samples."),
95 llvm::cl::Optional);
96
97static cl::opt<bool> GenCSNestedProfile(
98 "gen-cs-nested-profile", cl::Hidden, cl::init(Val: true),
99 cl::desc("Generate nested function profiles for CSSPGO"));
100
101cl::opt<bool> InferMissingFrames(
102 "infer-missing-frames", llvm::cl::init(Val: true),
103 llvm::cl::desc(
104 "Infer missing call frames due to compiler tail call elimination."),
105 llvm::cl::Optional);
106
107using namespace llvm;
108using namespace sampleprof;
109
110namespace llvm {
111
112namespace sampleprof {
113
114// Initialize the MaxCompressionSize to -1 which means no size limit
115int32_t CSProfileGenerator::MaxCompressionSize = -1;
116
117int CSProfileGenerator::MaxContextDepth = -1;
118
119bool ProfileGeneratorBase::UseFSDiscriminator = false;
120
121std::unique_ptr<ProfileGeneratorBase>
122ProfileGeneratorBase::create(ProfiledBinary *Binary,
123 const ContextSampleCounterMap *SampleCounters,
124 bool ProfileIsCS) {
125 std::unique_ptr<ProfileGeneratorBase> Generator;
126 if (ProfileIsCS) {
127 Generator.reset(p: new CSProfileGenerator(Binary, SampleCounters));
128 } else {
129 Generator.reset(p: new ProfileGenerator(Binary, SampleCounters));
130 }
131 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
132 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
133
134 return Generator;
135}
136
137std::unique_ptr<ProfileGeneratorBase>
138ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
139 bool ProfileIsCS) {
140 std::unique_ptr<ProfileGeneratorBase> Generator;
141 if (ProfileIsCS) {
142 Generator.reset(p: new CSProfileGenerator(Binary, Profiles));
143 } else {
144 Generator.reset(p: new ProfileGenerator(Binary, std::move(Profiles)));
145 }
146 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
147 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
148
149 return Generator;
150}
151
152void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
153 SampleProfileMap &ProfileMap) {
154 // Populate profile symbol list if extended binary format is used.
155 ProfileSymbolList SymbolList;
156
157 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
158 Binary->populateSymbolListFromDWARF(SymbolList);
159 Writer->setProfileSymbolList(&SymbolList);
160 }
161
162 if (std::error_code EC = Writer->write(ProfileMap))
163 exitWithError(EC: std::move(EC));
164}
165
166void ProfileGeneratorBase::write() {
167 auto WriterOrErr = SampleProfileWriter::create(Filename: OutputFilename, Format: OutputFormat);
168 if (std::error_code EC = WriterOrErr.getError())
169 exitWithError(EC, Whence: OutputFilename);
170
171 if (UseMD5) {
172 if (OutputFormat != SPF_Ext_Binary)
173 WithColor::warning() << "-use-md5 is ignored. Specify "
174 "--format=extbinary to enable it\n";
175 else
176 WriterOrErr.get()->setUseMD5();
177 }
178
179 write(Writer: std::move(WriterOrErr.get()), ProfileMap);
180}
181
182void ProfileGeneratorBase::showDensitySuggestion(double Density) {
183 if (Density == 0.0)
184 WithColor::warning() << "The output profile is empty or the "
185 "--profile-density-cutoff-hot option is "
186 "set too low. Please check your command.\n";
187 else if (Density < ProfileDensityThreshold)
188 WithColor::warning()
189 << "Sample PGO is estimated to optimize better with "
190 << format(Fmt: "%.1f", Vals: ProfileDensityThreshold / Density)
191 << "x more samples. Please consider increasing sampling rate or "
192 "profiling for longer duration to get more samples.\n";
193
194 if (ShowDensity)
195 outs() << "Functions with density >= " << format(Fmt: "%.1f", Vals: Density)
196 << " account for "
197 << format(Fmt: "%.2f",
198 Vals: static_cast<double>(ProfileDensityCutOffHot) / 10000)
199 << "% total sample counts.\n";
200}
201
202bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
203 for (const auto &Prefix : FuncPrefixsToFilter) {
204 if (FS.getFuncName().starts_with(Prefix))
205 return true;
206 }
207
208 // Filter the function profiles for the inlinees. It's useful for fuzzy
209 // profile matching which flattens the profile and inlinees' samples are
210 // merged into top-level function.
211 for (auto &Callees :
212 const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
213 auto &CalleesMap = Callees.second;
214 for (auto I = CalleesMap.begin(); I != CalleesMap.end();) {
215 auto FS = I++;
216 if (filterAmbiguousProfile(FS&: FS->second))
217 CalleesMap.erase(position: FS);
218 }
219 }
220 return false;
221}
222
223// For built-in local initialization function such as __cxx_global_var_init,
224// __tls_init prefix function, there could be multiple versions of the functions
225// in the final binary. However, in the profile generation, we call
226// getCanonicalFnName to canonicalize the names which strips the suffixes.
227// Therefore, samples from different functions queries the same profile and the
228// samples are merged. As the functions are essentially different, entries of
229// the merged profile are ambiguous. In sample loader, the IR from one version
230// would be attributed towards a merged entries, which is inaccurate. Especially
231// for fuzzy profile matching, it gets multiple callsites(from different
232// function) but used to match one callsite, which misleads the matching and
233// causes a lot of false positives report. Hence, we want to filter them out
234// from the profile map during the profile generation time. The profiles are all
235// cold functions, it won't have perf impact.
236void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
237 for (auto I = ProfileMap.begin(); I != ProfileMap.end();) {
238 auto FS = I++;
239 if (filterAmbiguousProfile(FS&: FS->second))
240 ProfileMap.erase(It: FS);
241 }
242}
243
244void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
245 const RangeSample &Ranges) {
246
247 /*
248 Regions may overlap with each other. Using the boundary info, find all
249 disjoint ranges and their sample count. BoundaryPoint contains the count
250 multiple samples begin/end at this points.
251
252 |<--100-->| Sample1
253 |<------200------>| Sample2
254 A B C
255
256 In the example above,
257 Sample1 begins at A, ends at B, its value is 100.
258 Sample2 beings at A, ends at C, its value is 200.
259 For A, BeginCount is the sum of sample begins at A, which is 300 and no
260 samples ends at A, so EndCount is 0.
261 Then boundary points A, B, and C with begin/end counts are:
262 A: (300, 0)
263 B: (0, 100)
264 C: (0, 200)
265 */
266 struct BoundaryPoint {
267 // Sum of sample counts beginning at this point
268 uint64_t BeginCount = UINT64_MAX;
269 // Sum of sample counts ending at this point
270 uint64_t EndCount = UINT64_MAX;
271 // Is the begin point of a zero range.
272 bool IsZeroRangeBegin = false;
273 // Is the end point of a zero range.
274 bool IsZeroRangeEnd = false;
275
276 void addBeginCount(uint64_t Count) {
277 if (BeginCount == UINT64_MAX)
278 BeginCount = 0;
279 BeginCount += Count;
280 }
281
282 void addEndCount(uint64_t Count) {
283 if (EndCount == UINT64_MAX)
284 EndCount = 0;
285 EndCount += Count;
286 }
287 };
288
289 /*
290 For the above example. With boundary points, follwing logic finds two
291 disjoint region of
292
293 [A,B]: 300
294 [B+1,C]: 200
295
296 If there is a boundary point that both begin and end, the point itself
297 becomes a separate disjoint region. For example, if we have original
298 ranges of
299
300 |<--- 100 --->|
301 |<--- 200 --->|
302 A B C
303
304 there are three boundary points with their begin/end counts of
305
306 A: (100, 0)
307 B: (200, 100)
308 C: (0, 200)
309
310 the disjoint ranges would be
311
312 [A, B-1]: 100
313 [B, B]: 300
314 [B+1, C]: 200.
315
316 Example for zero value range:
317
318 |<--- 100 --->|
319 |<--- 200 --->|
320 |<--------------- 0 ----------------->|
321 A B C D E F
322
323 [A, B-1] : 0
324 [B, C] : 100
325 [C+1, D-1]: 0
326 [D, E] : 200
327 [E+1, F] : 0
328 */
329 std::map<uint64_t, BoundaryPoint> Boundaries;
330
331 for (const auto &Item : Ranges) {
332 assert(Item.first.first <= Item.first.second &&
333 "Invalid instruction range");
334 auto &BeginPoint = Boundaries[Item.first.first];
335 auto &EndPoint = Boundaries[Item.first.second];
336 uint64_t Count = Item.second;
337
338 BeginPoint.addBeginCount(Count);
339 EndPoint.addEndCount(Count);
340 if (Count == 0) {
341 BeginPoint.IsZeroRangeBegin = true;
342 EndPoint.IsZeroRangeEnd = true;
343 }
344 }
345
346 // Use UINT64_MAX to indicate there is no existing range between BeginAddress
347 // and the next valid address
348 uint64_t BeginAddress = UINT64_MAX;
349 int ZeroRangeDepth = 0;
350 uint64_t Count = 0;
351 for (const auto &Item : Boundaries) {
352 uint64_t Address = Item.first;
353 const BoundaryPoint &Point = Item.second;
354 if (Point.BeginCount != UINT64_MAX) {
355 if (BeginAddress != UINT64_MAX)
356 DisjointRanges[{BeginAddress, Address - 1}] = Count;
357 Count += Point.BeginCount;
358 BeginAddress = Address;
359 ZeroRangeDepth += Point.IsZeroRangeBegin;
360 }
361 if (Point.EndCount != UINT64_MAX) {
362 assert((BeginAddress != UINT64_MAX) &&
363 "First boundary point cannot be 'end' point");
364 DisjointRanges[{BeginAddress, Address}] = Count;
365 assert(Count >= Point.EndCount && "Mismatched live ranges");
366 Count -= Point.EndCount;
367 BeginAddress = Address + 1;
368 ZeroRangeDepth -= Point.IsZeroRangeEnd;
369 // If the remaining count is zero and it's no longer in a zero range, this
370 // means we consume all the ranges before, thus mark BeginAddress as
371 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
372 // [<---- 10 ---->]
373 // [<---- 20 ---->]
374 // A B C D
375 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
376 // have the [B+1, C-1] zero range.
377 if (Count == 0 && ZeroRangeDepth == 0)
378 BeginAddress = UINT64_MAX;
379 }
380 }
381}
382
383void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
384 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
385 uint64_t Count) {
386 // Use the maximum count of samples with same line location
387 uint32_t Discriminator = getBaseDiscriminator(Discriminator: LeafLoc.Location.Discriminator);
388
389 // Use duplication factor to compensated for loop unroll/vectorization.
390 // Note that this is only needed when we're taking MAX of the counts at
391 // the location instead of SUM.
392 Count *= getDuplicationFactor(Discriminator: LeafLoc.Location.Discriminator);
393
394 ErrorOr<uint64_t> R =
395 FunctionProfile.findSamplesAt(LineOffset: LeafLoc.Location.LineOffset, Discriminator);
396
397 uint64_t PreviousCount = R ? R.get() : 0;
398 if (PreviousCount <= Count) {
399 FunctionProfile.addBodySamples(LineOffset: LeafLoc.Location.LineOffset, Discriminator,
400 Num: Count - PreviousCount);
401 }
402}
403
404void ProfileGeneratorBase::updateTotalSamples() {
405 for (auto &Item : ProfileMap) {
406 FunctionSamples &FunctionProfile = Item.second;
407 FunctionProfile.updateTotalSamples();
408 }
409}
410
411void ProfileGeneratorBase::updateCallsiteSamples() {
412 for (auto &Item : ProfileMap) {
413 FunctionSamples &FunctionProfile = Item.second;
414 FunctionProfile.updateCallsiteSamples();
415 }
416}
417
418void ProfileGeneratorBase::updateFunctionSamples() {
419 updateCallsiteSamples();
420
421 if (UpdateTotalSamples)
422 updateTotalSamples();
423}
424
425void ProfileGeneratorBase::collectProfiledFunctions() {
426 std::unordered_set<const BinaryFunction *> ProfiledFunctions;
427 if (collectFunctionsFromRawProfile(ProfiledFunctions))
428 Binary->setProfiledFunctions(ProfiledFunctions);
429 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
430 Binary->setProfiledFunctions(ProfiledFunctions);
431 else
432 llvm_unreachable("Unsupported input profile");
433}
434
435bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
436 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
437 if (!SampleCounters)
438 return false;
439 // Go through all the stacks, ranges and branches in sample counters, use
440 // the start of the range to look up the function it belongs and record the
441 // function.
442 for (const auto &CI : *SampleCounters) {
443 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr())) {
444 for (auto StackAddr : CtxKey->Context) {
445 if (FuncRange *FRange = Binary->findFuncRange(Address: StackAddr))
446 ProfiledFunctions.insert(x: FRange->Func);
447 }
448 }
449
450 for (auto Item : CI.second.RangeCounter) {
451 uint64_t StartAddress = Item.first.first;
452 if (FuncRange *FRange = Binary->findFuncRange(Address: StartAddress))
453 ProfiledFunctions.insert(x: FRange->Func);
454 }
455
456 for (auto Item : CI.second.BranchCounter) {
457 uint64_t SourceAddress = Item.first.first;
458 uint64_t TargetAddress = Item.first.second;
459 if (FuncRange *FRange = Binary->findFuncRange(Address: SourceAddress))
460 ProfiledFunctions.insert(x: FRange->Func);
461 if (FuncRange *FRange = Binary->findFuncRange(Address: TargetAddress))
462 ProfiledFunctions.insert(x: FRange->Func);
463 }
464 }
465 return true;
466}
467
468bool ProfileGenerator::collectFunctionsFromLLVMProfile(
469 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
470 for (const auto &FS : ProfileMap) {
471 if (auto *Func = Binary->getBinaryFunction(FName: FS.second.getFunction()))
472 ProfiledFunctions.insert(x: Func);
473 }
474 return true;
475}
476
477bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
478 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
479 for (auto *Node : ContextTracker) {
480 if (!Node->getFuncName().empty())
481 if (auto *Func = Binary->getBinaryFunction(FName: Node->getFuncName()))
482 ProfiledFunctions.insert(x: Func);
483 }
484 return true;
485}
486
487FunctionSamples &
488ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) {
489 SampleContext Context(FuncName);
490 return ProfileMap.create(Ctx: Context);
491}
492
493void ProfileGenerator::generateProfile() {
494 collectProfiledFunctions();
495
496 if (Binary->usePseudoProbes())
497 Binary->decodePseudoProbe();
498
499 if (SampleCounters) {
500 if (Binary->usePseudoProbes()) {
501 generateProbeBasedProfile();
502 } else {
503 generateLineNumBasedProfile();
504 }
505 }
506
507 postProcessProfiles();
508}
509
510void ProfileGenerator::postProcessProfiles() {
511 computeSummaryAndThreshold(ProfileMap);
512 trimColdProfiles(Profiles: ProfileMap, ColdCntThreshold: ColdCountThreshold);
513 filterAmbiguousProfile(Profiles&: ProfileMap);
514 calculateAndShowDensity(Profiles: ProfileMap);
515}
516
517void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
518 uint64_t ColdCntThreshold) {
519 if (!TrimColdProfile)
520 return;
521
522 // Move cold profiles into a tmp container.
523 std::vector<hash_code> ColdProfileHashes;
524 for (const auto &I : ProfileMap) {
525 if (I.second.getTotalSamples() < ColdCntThreshold)
526 ColdProfileHashes.emplace_back(args: I.first);
527 }
528
529 // Remove the cold profile from ProfileMap.
530 for (const auto &I : ColdProfileHashes)
531 ProfileMap.erase(Key: I);
532}
533
534void ProfileGenerator::generateLineNumBasedProfile() {
535 assert(SampleCounters->size() == 1 &&
536 "Must have one entry for profile generation.");
537 const SampleCounter &SC = SampleCounters->begin()->second;
538 // Fill in function body samples
539 populateBodySamplesForAllFunctions(RangeCounter: SC.RangeCounter);
540 // Fill in boundary sample counts as well as call site samples for calls
541 populateBoundarySamplesForAllFunctions(BranchCounters: SC.BranchCounter);
542
543 updateFunctionSamples();
544}
545
546void ProfileGenerator::generateProbeBasedProfile() {
547 assert(SampleCounters->size() == 1 &&
548 "Must have one entry for profile generation.");
549 // Enable pseudo probe functionalities in SampleProf
550 FunctionSamples::ProfileIsProbeBased = true;
551 const SampleCounter &SC = SampleCounters->begin()->second;
552 // Fill in function body samples
553 populateBodySamplesWithProbesForAllFunctions(RangeCounter: SC.RangeCounter);
554 // Fill in boundary sample counts as well as call site samples for calls
555 populateBoundarySamplesWithProbesForAllFunctions(BranchCounters: SC.BranchCounter);
556
557 updateFunctionSamples();
558}
559
560void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
561 const RangeSample &RangeCounter) {
562 ProbeCounterMap ProbeCounter;
563 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
564 // inside extractProbesFromRange.
565 extractProbesFromRange(RangeCounter: preprocessRangeCounter(RangeCounter), ProbeCounter,
566 FindDisjointRanges: false);
567
568 for (const auto &PI : ProbeCounter) {
569 const MCDecodedPseudoProbe *Probe = PI.first;
570 uint64_t Count = PI.second;
571 SampleContextFrameVector FrameVec;
572 Binary->getInlineContextForProbe(Probe, InlineContextStack&: FrameVec, IncludeLeaf: true);
573 FunctionSamples &FunctionProfile =
574 getLeafProfileAndAddTotalSamples(FrameVec, Count);
575 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
576 Num: Count);
577 if (Probe->isEntry())
578 FunctionProfile.addHeadSamples(Num: Count);
579 }
580}
581
582void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
583 const BranchSample &BranchCounters) {
584 for (const auto &Entry : BranchCounters) {
585 uint64_t SourceAddress = Entry.first.first;
586 uint64_t TargetAddress = Entry.first.second;
587 uint64_t Count = Entry.second;
588 assert(Count != 0 && "Unexpected zero weight branch");
589
590 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
591 if (CalleeName.size() == 0)
592 continue;
593
594 const MCDecodedPseudoProbe *CallProbe =
595 Binary->getCallProbeForAddr(Address: SourceAddress);
596 if (CallProbe == nullptr)
597 continue;
598
599 // Record called target sample and its count.
600 SampleContextFrameVector FrameVec;
601 Binary->getInlineContextForProbe(Probe: CallProbe, InlineContextStack&: FrameVec, IncludeLeaf: true);
602
603 if (!FrameVec.empty()) {
604 FunctionSamples &FunctionProfile =
605 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
606 FunctionProfile.addCalledTargetSamples(
607 LineOffset: FrameVec.back().Location.LineOffset,
608 Discriminator: FrameVec.back().Location.Discriminator,
609 Func: FunctionId(CalleeName), Num: Count);
610 }
611 }
612}
613
614FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
615 const SampleContextFrameVector &FrameVec, uint64_t Count) {
616 // Get top level profile
617 FunctionSamples *FunctionProfile =
618 &getTopLevelFunctionProfile(FuncName: FrameVec[0].Func);
619 FunctionProfile->addTotalSamples(Num: Count);
620 if (Binary->usePseudoProbes()) {
621 const auto *FuncDesc = Binary->getFuncDescForGUID(
622 GUID: FunctionProfile->getFunction().getHashCode());
623 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
624 }
625
626 for (size_t I = 1; I < FrameVec.size(); I++) {
627 LineLocation Callsite(
628 FrameVec[I - 1].Location.LineOffset,
629 getBaseDiscriminator(Discriminator: FrameVec[I - 1].Location.Discriminator));
630 FunctionSamplesMap &SamplesMap =
631 FunctionProfile->functionSamplesAt(Loc: Callsite);
632 auto Ret =
633 SamplesMap.emplace(args: FrameVec[I].Func, args: FunctionSamples());
634 if (Ret.second) {
635 SampleContext Context(FrameVec[I].Func);
636 Ret.first->second.setContext(Context);
637 }
638 FunctionProfile = &Ret.first->second;
639 FunctionProfile->addTotalSamples(Num: Count);
640 if (Binary->usePseudoProbes()) {
641 const auto *FuncDesc = Binary->getFuncDescForGUID(
642 GUID: FunctionProfile->getFunction().getHashCode());
643 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
644 }
645 }
646
647 return *FunctionProfile;
648}
649
650RangeSample
651ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
652 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
653 if (FillZeroForAllFuncs) {
654 for (auto &FuncI : Binary->getAllBinaryFunctions()) {
655 for (auto &R : FuncI.second.Ranges) {
656 Ranges[{R.first, R.second - 1}] += 0;
657 }
658 }
659 } else {
660 // For each range, we search for all ranges of the function it belongs to
661 // and initialize it with zero count, so it remains zero if doesn't hit any
662 // samples. This is to be consistent with compiler that interpret zero count
663 // as unexecuted(cold).
664 for (const auto &I : RangeCounter) {
665 uint64_t StartAddress = I.first.first;
666 for (const auto &Range : Binary->getRanges(Address: StartAddress))
667 Ranges[{Range.first, Range.second - 1}] += 0;
668 }
669 }
670 RangeSample DisjointRanges;
671 findDisjointRanges(DisjointRanges, Ranges);
672 return DisjointRanges;
673}
674
675void ProfileGenerator::populateBodySamplesForAllFunctions(
676 const RangeSample &RangeCounter) {
677 for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
678 uint64_t RangeBegin = Range.first.first;
679 uint64_t RangeEnd = Range.first.second;
680 uint64_t Count = Range.second;
681
682 InstructionPointer IP(Binary, RangeBegin, true);
683 // Disjoint ranges may have range in the middle of two instr,
684 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
685 // can be Addr1+1 to Addr2-1. We should ignore such range.
686 if (IP.Address > RangeEnd)
687 continue;
688
689 do {
690 const SampleContextFrameVector FrameVec =
691 Binary->getFrameLocationStack(Address: IP.Address);
692 if (!FrameVec.empty()) {
693 // FIXME: As accumulating total count per instruction caused some
694 // regression, we changed to accumulate total count per byte as a
695 // workaround. Tuning hotness threshold on the compiler side might be
696 // necessary in the future.
697 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
698 FrameVec, Count: Count * Binary->getInstSize(Address: IP.Address));
699 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: FrameVec.back(),
700 Count);
701 }
702 } while (IP.advance() && IP.Address <= RangeEnd);
703 }
704}
705
706StringRef
707ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
708 // Get the function range by branch target if it's a call branch.
709 auto *FRange = Binary->findFuncRangeForStartAddr(Address: TargetAddress);
710
711 // We won't accumulate sample count for a range whose start is not the real
712 // function entry such as outlined function or inner labels.
713 if (!FRange || !FRange->IsFuncEntry)
714 return StringRef();
715
716 return FunctionSamples::getCanonicalFnName(FnName: FRange->getFuncName());
717}
718
719void ProfileGenerator::populateBoundarySamplesForAllFunctions(
720 const BranchSample &BranchCounters) {
721 for (const auto &Entry : BranchCounters) {
722 uint64_t SourceAddress = Entry.first.first;
723 uint64_t TargetAddress = Entry.first.second;
724 uint64_t Count = Entry.second;
725 assert(Count != 0 && "Unexpected zero weight branch");
726
727 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
728 if (CalleeName.size() == 0)
729 continue;
730 // Record called target sample and its count.
731 const SampleContextFrameVector &FrameVec =
732 Binary->getCachedFrameLocationStack(Address: SourceAddress);
733 if (!FrameVec.empty()) {
734 FunctionSamples &FunctionProfile =
735 getLeafProfileAndAddTotalSamples(FrameVec, Count: 0);
736 FunctionProfile.addCalledTargetSamples(
737 LineOffset: FrameVec.back().Location.LineOffset,
738 Discriminator: getBaseDiscriminator(Discriminator: FrameVec.back().Location.Discriminator),
739 Func: FunctionId(CalleeName), Num: Count);
740 }
741 // Add head samples for callee.
742 FunctionSamples &CalleeProfile =
743 getTopLevelFunctionProfile(FuncName: FunctionId(CalleeName));
744 CalleeProfile.addHeadSamples(Num: Count);
745 }
746}
747
748void ProfileGeneratorBase::calculateBodySamplesAndSize(
749 const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
750 uint64_t &FuncBodySize) {
751 // Note that ideally the size should be the number of function instruction.
752 // However, for probe-based profile, we don't have the accurate instruction
753 // count for each probe, instead, the probe sample is the samples count for
754 // the block, which is equivelant to
755 // total_instruction_samples/num_of_instruction in one block. Hence, we use
756 // the number of probe as a proxy for the function's size.
757 FuncBodySize += FSamples.getBodySamples().size();
758
759 // The accumulated body samples re-calculated here could be different from the
760 // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
761 // based profile. The reason is that TotalSamples is the sum of all the
762 // samples of the machine instruction in one source-code line, however, the
763 // entry of Bodysamples is the only max number of them, so the TotalSamples is
764 // usually much bigger than the accumulated body samples as one souce-code
765 // line can emit many machine instructions. We observed a regression when we
766 // switched to use the accumulated body samples(by using
767 // -update-total-samples). Hence, it's safer to re-calculate here to avoid
768 // such discrepancy. There is no problem for probe-based profile, as the
769 // TotalSamples is exactly the same as the accumulated body samples.
770 for (const auto &I : FSamples.getBodySamples())
771 TotalBodySamples += I.second.getSamples();
772
773 for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
774 for (const auto &Callee : CallsiteSamples.second) {
775 // For binary-level density, the inlinees' samples and size should be
776 // included in the calculation.
777 calculateBodySamplesAndSize(FSamples: Callee.second, TotalBodySamples,
778 FuncBodySize);
779 }
780}
781
782// Calculate Profile-density:
783// Calculate the density for each function and sort them in descending order,
784// keep accumulating their total samples unitl it exceeds the
785// percentage_threshold(cut-off) of total profile samples, the profile-density
786// is the last(minimum) function-density of the processed functions, which means
787// all the functions hot to perf are on good density if the profile-density is
788// good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
789// depending on how much regression the system want to tolerate.
790double
791ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
792 double ProfileDensity = 0.0;
793
794 uint64_t TotalProfileSamples = 0;
795 // A list of the function profile density and its total samples.
796 std::vector<std::pair<double, uint64_t>> FuncDensityList;
797 for (const auto &I : Profiles) {
798 uint64_t TotalBodySamples = 0;
799 uint64_t FuncBodySize = 0;
800 calculateBodySamplesAndSize(FSamples: I.second, TotalBodySamples, FuncBodySize);
801
802 if (FuncBodySize == 0)
803 continue;
804
805 double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
806 TotalProfileSamples += TotalBodySamples;
807 FuncDensityList.emplace_back(args&: FuncDensity, args&: TotalBodySamples);
808 }
809
810 // Sorted by the density in descending order.
811 llvm::stable_sort(Range&: FuncDensityList, C: [&](const std::pair<double, uint64_t> &A,
812 const std::pair<double, uint64_t> &B) {
813 if (A.first != B.first)
814 return A.first > B.first;
815 return A.second < B.second;
816 });
817
818 uint64_t AccumulatedSamples = 0;
819 uint32_t I = 0;
820 assert(ProfileDensityCutOffHot <= 1000000 &&
821 "The cutoff value is greater than 1000000(100%)");
822 while (AccumulatedSamples < TotalProfileSamples *
823 static_cast<float>(ProfileDensityCutOffHot) /
824 1000000 &&
825 I < FuncDensityList.size()) {
826 AccumulatedSamples += FuncDensityList[I].second;
827 ProfileDensity = FuncDensityList[I].first;
828 I++;
829 }
830
831 return ProfileDensity;
832}
833
834void ProfileGeneratorBase::calculateAndShowDensity(
835 const SampleProfileMap &Profiles) {
836 double Density = calculateDensity(Profiles);
837 showDensitySuggestion(Density);
838}
839
840FunctionSamples *
841CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
842 bool WasLeafInlined) {
843 FunctionSamples *FProfile = ContextNode->getFunctionSamples();
844 if (!FProfile) {
845 FSamplesList.emplace_back();
846 FProfile = &FSamplesList.back();
847 FProfile->setFunction(ContextNode->getFuncName());
848 ContextNode->setFunctionSamples(FProfile);
849 }
850 // Update ContextWasInlined attribute for existing contexts.
851 // The current function can be called in two ways:
852 // - when processing a probe of the current frame
853 // - when processing the entry probe of an inlinee's frame, which
854 // is then used to update the callsite count of the current frame.
855 // The two can happen in any order, hence here we are making sure
856 // `ContextWasInlined` is always set as expected.
857 // TODO: Note that the former does not always happen if no probes of the
858 // current frame has samples, and if the latter happens, we could lose the
859 // attribute. This should be fixed.
860 if (WasLeafInlined)
861 FProfile->getContext().setAttribute(ContextWasInlined);
862 return FProfile;
863}
864
865ContextTrieNode *
866CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context,
867 bool WasLeafInlined) {
868 ContextTrieNode *ContextNode =
869 ContextTracker.getOrCreateContextPath(Context, AllowCreate: true);
870 getOrCreateFunctionSamples(ContextNode, WasLeafInlined);
871 return ContextNode;
872}
873
874void CSProfileGenerator::generateProfile() {
875 FunctionSamples::ProfileIsCS = true;
876
877 collectProfiledFunctions();
878
879 if (Binary->usePseudoProbes()) {
880 Binary->decodePseudoProbe();
881 if (InferMissingFrames)
882 initializeMissingFrameInferrer();
883 }
884
885 if (SampleCounters) {
886 if (Binary->usePseudoProbes()) {
887 generateProbeBasedProfile();
888 } else {
889 generateLineNumBasedProfile();
890 }
891 }
892
893 if (Binary->getTrackFuncContextSize())
894 computeSizeForProfiledFunctions();
895
896 postProcessProfiles();
897}
898
899void CSProfileGenerator::initializeMissingFrameInferrer() {
900 Binary->getMissingContextInferrer()->initialize(SampleCounters);
901}
902
903void CSProfileGenerator::inferMissingFrames(
904 const SmallVectorImpl<uint64_t> &Context,
905 SmallVectorImpl<uint64_t> &NewContext) {
906 Binary->inferMissingFrames(Context, NewContext);
907}
908
909void CSProfileGenerator::computeSizeForProfiledFunctions() {
910 for (auto *Func : Binary->getProfiledFunctions())
911 Binary->computeInlinedContextSizeForFunc(Func);
912
913 // Flush the symbolizer to save memory.
914 Binary->flushSymbolizer();
915}
916
917void CSProfileGenerator::updateFunctionSamples() {
918 for (auto *Node : ContextTracker) {
919 FunctionSamples *FSamples = Node->getFunctionSamples();
920 if (FSamples) {
921 if (UpdateTotalSamples)
922 FSamples->updateTotalSamples();
923 FSamples->updateCallsiteSamples();
924 }
925 }
926}
927
928void CSProfileGenerator::generateLineNumBasedProfile() {
929 for (const auto &CI : *SampleCounters) {
930 const auto *CtxKey = cast<StringBasedCtxKey>(Val: CI.first.getPtr());
931
932 ContextTrieNode *ContextNode = &getRootContext();
933 // Sample context will be empty if the jump is an external-to-internal call
934 // pattern, the head samples should be added for the internal function.
935 if (!CtxKey->Context.empty()) {
936 // Get or create function profile for the range
937 ContextNode =
938 getOrCreateContextNode(Context: CtxKey->Context, WasLeafInlined: CtxKey->WasLeafInlined);
939 // Fill in function body samples
940 populateBodySamplesForFunction(FunctionProfile&: *ContextNode->getFunctionSamples(),
941 RangeCounters: CI.second.RangeCounter);
942 }
943 // Fill in boundary sample counts as well as call site samples for calls
944 populateBoundarySamplesForFunction(CallerNode: ContextNode, BranchCounters: CI.second.BranchCounter);
945 }
946 // Fill in call site value sample for inlined calls and also use context to
947 // infer missing samples. Since we don't have call count for inlined
948 // functions, we estimate it from inlinee's profile using the entry of the
949 // body sample.
950 populateInferredFunctionSamples(Node&: getRootContext());
951
952 updateFunctionSamples();
953}
954
955void CSProfileGenerator::populateBodySamplesForFunction(
956 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
957 // Compute disjoint ranges first, so we can use MAX
958 // for calculating count for each location.
959 RangeSample Ranges;
960 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
961 for (const auto &Range : Ranges) {
962 uint64_t RangeBegin = Range.first.first;
963 uint64_t RangeEnd = Range.first.second;
964 uint64_t Count = Range.second;
965 // Disjoint ranges have introduce zero-filled gap that
966 // doesn't belong to current context, filter them out.
967 if (Count == 0)
968 continue;
969
970 InstructionPointer IP(Binary, RangeBegin, true);
971 // Disjoint ranges may have range in the middle of two instr,
972 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
973 // can be Addr1+1 to Addr2-1. We should ignore such range.
974 if (IP.Address > RangeEnd)
975 continue;
976
977 do {
978 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: IP.Address);
979 if (LeafLoc) {
980 // Recording body sample for this specific context
981 updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc: *LeafLoc, Count);
982 FunctionProfile.addTotalSamples(Num: Count);
983 }
984 } while (IP.advance() && IP.Address <= RangeEnd);
985 }
986}
987
988void CSProfileGenerator::populateBoundarySamplesForFunction(
989 ContextTrieNode *Node, const BranchSample &BranchCounters) {
990
991 for (const auto &Entry : BranchCounters) {
992 uint64_t SourceAddress = Entry.first.first;
993 uint64_t TargetAddress = Entry.first.second;
994 uint64_t Count = Entry.second;
995 assert(Count != 0 && "Unexpected zero weight branch");
996
997 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
998 if (CalleeName.size() == 0)
999 continue;
1000
1001 ContextTrieNode *CallerNode = Node;
1002 LineLocation CalleeCallSite(0, 0);
1003 if (CallerNode != &getRootContext()) {
1004 // Record called target sample and its count
1005 auto LeafLoc = Binary->getInlineLeafFrameLoc(Address: SourceAddress);
1006 if (LeafLoc) {
1007 CallerNode->getFunctionSamples()->addCalledTargetSamples(
1008 LineOffset: LeafLoc->Location.LineOffset,
1009 Discriminator: getBaseDiscriminator(Discriminator: LeafLoc->Location.Discriminator),
1010 Func: FunctionId(CalleeName),
1011 Num: Count);
1012 // Record head sample for called target(callee)
1013 CalleeCallSite = LeafLoc->Location;
1014 }
1015 }
1016
1017 ContextTrieNode *CalleeNode =
1018 CallerNode->getOrCreateChildContext(CallSite: CalleeCallSite,
1019 ChildName: FunctionId(CalleeName));
1020 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(ContextNode: CalleeNode);
1021 CalleeProfile->addHeadSamples(Num: Count);
1022 }
1023}
1024
1025void CSProfileGenerator::populateInferredFunctionSamples(
1026 ContextTrieNode &Node) {
1027 // There is no call jmp sample between the inliner and inlinee, we need to use
1028 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s
1029 // sample depends on child(inlinee)'s sample, so traverse the tree in
1030 // post-order.
1031 for (auto &It : Node.getAllChildContext())
1032 populateInferredFunctionSamples(Node&: It.second);
1033
1034 FunctionSamples *CalleeProfile = Node.getFunctionSamples();
1035 if (!CalleeProfile)
1036 return;
1037 // If we already have head sample counts, we must have value profile
1038 // for call sites added already. Skip to avoid double counting.
1039 if (CalleeProfile->getHeadSamples())
1040 return;
1041 ContextTrieNode *CallerNode = Node.getParentContext();
1042 // If we don't have context, nothing to do for caller's call site.
1043 // This could happen for entry point function.
1044 if (CallerNode == &getRootContext())
1045 return;
1046
1047 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc();
1048 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(ContextNode: CallerNode);
1049 // Since we don't have call count for inlined functions, we
1050 // estimate it from inlinee's profile using entry body sample.
1051 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate();
1052 // If we don't have samples with location, use 1 to indicate live.
1053 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size())
1054 EstimatedCallCount = 1;
1055 CallerProfile.addCalledTargetSamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1056 Discriminator: CallerLeafFrameLoc.Discriminator,
1057 Func: Node.getFuncName(), Num: EstimatedCallCount);
1058 CallerProfile.addBodySamples(LineOffset: CallerLeafFrameLoc.LineOffset,
1059 Discriminator: CallerLeafFrameLoc.Discriminator,
1060 Num: EstimatedCallCount);
1061 CallerProfile.addTotalSamples(Num: EstimatedCallCount);
1062}
1063
1064void CSProfileGenerator::convertToProfileMap(
1065 ContextTrieNode &Node, SampleContextFrameVector &Context) {
1066 FunctionSamples *FProfile = Node.getFunctionSamples();
1067 if (FProfile) {
1068 Context.emplace_back(Args: Node.getFuncName(), Args: LineLocation(0, 0));
1069 // Save the new context for future references.
1070 SampleContextFrames NewContext = *Contexts.insert(x: Context).first;
1071 auto Ret = ProfileMap.emplace(Args&: NewContext, Args: std::move(*FProfile));
1072 FunctionSamples &NewProfile = Ret.first->second;
1073 NewProfile.getContext().setContext(Context: NewContext);
1074 Context.pop_back();
1075 }
1076
1077 for (auto &It : Node.getAllChildContext()) {
1078 ContextTrieNode &ChildNode = It.second;
1079 Context.emplace_back(Args: Node.getFuncName(), Args: ChildNode.getCallSiteLoc());
1080 convertToProfileMap(Node&: ChildNode, Context);
1081 Context.pop_back();
1082 }
1083}
1084
1085void CSProfileGenerator::convertToProfileMap() {
1086 assert(ProfileMap.empty() &&
1087 "ProfileMap should be empty before converting from the trie");
1088 assert(IsProfileValidOnTrie &&
1089 "Do not convert the trie twice, it's already destroyed");
1090
1091 SampleContextFrameVector Context;
1092 for (auto &It : getRootContext().getAllChildContext())
1093 convertToProfileMap(Node&: It.second, Context);
1094
1095 IsProfileValidOnTrie = false;
1096}
1097
1098void CSProfileGenerator::postProcessProfiles() {
1099 // Compute hot/cold threshold based on profile. This will be used for cold
1100 // context profile merging/trimming.
1101 computeSummaryAndThreshold();
1102
1103 // Run global pre-inliner to adjust/merge context profile based on estimated
1104 // inline decisions.
1105 if (EnableCSPreInliner) {
1106 ContextTracker.populateFuncToCtxtMap();
1107 CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
1108 // Turn off the profile merger by default unless it is explicitly enabled.
1109 if (!CSProfMergeColdContext.getNumOccurrences())
1110 CSProfMergeColdContext = false;
1111 }
1112
1113 convertToProfileMap();
1114
1115 // Trim and merge cold context profile using cold threshold above.
1116 if (TrimColdProfile || CSProfMergeColdContext) {
1117 SampleContextTrimmer(ProfileMap)
1118 .trimAndMergeColdContextProfiles(
1119 ColdCountThreshold: HotCountThreshold, TrimColdContext: TrimColdProfile, MergeColdContext: CSProfMergeColdContext,
1120 ColdContextFrameLength: CSProfMaxColdContextDepth, TrimBaseProfileOnly: EnableCSPreInliner);
1121 }
1122
1123 if (GenCSNestedProfile) {
1124 ProfileConverter CSConverter(ProfileMap);
1125 CSConverter.convertCSProfiles();
1126 FunctionSamples::ProfileIsCS = false;
1127 }
1128 filterAmbiguousProfile(Profiles&: ProfileMap);
1129 ProfileGeneratorBase::calculateAndShowDensity(Profiles: ProfileMap);
1130}
1131
1132void ProfileGeneratorBase::computeSummaryAndThreshold(
1133 SampleProfileMap &Profiles) {
1134 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1135 Summary = Builder.computeSummaryForProfiles(Profiles);
1136 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
1137 DS: (Summary->getDetailedSummary()));
1138 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1139 DS: (Summary->getDetailedSummary()));
1140}
1141
1142void CSProfileGenerator::computeSummaryAndThreshold() {
1143 // Always merge and use context-less profile map to compute summary.
1144 SampleProfileMap ContextLessProfiles;
1145 ContextTracker.createContextLessProfileMap(ContextLessProfiles);
1146
1147 // Set the flag below to avoid merging the profile again in
1148 // computeSummaryAndThreshold
1149 FunctionSamples::ProfileIsCS = false;
1150 assert(
1151 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
1152 "Don't set --profile-summary-contextless to false for profile "
1153 "generation");
1154 ProfileGeneratorBase::computeSummaryAndThreshold(Profiles&: ContextLessProfiles);
1155 // Recover the old value.
1156 FunctionSamples::ProfileIsCS = true;
1157}
1158
1159void ProfileGeneratorBase::extractProbesFromRange(
1160 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
1161 bool FindDisjointRanges) {
1162 const RangeSample *PRanges = &RangeCounter;
1163 RangeSample Ranges;
1164 if (FindDisjointRanges) {
1165 findDisjointRanges(DisjointRanges&: Ranges, Ranges: RangeCounter);
1166 PRanges = &Ranges;
1167 }
1168
1169 for (const auto &Range : *PRanges) {
1170 uint64_t RangeBegin = Range.first.first;
1171 uint64_t RangeEnd = Range.first.second;
1172 uint64_t Count = Range.second;
1173
1174 InstructionPointer IP(Binary, RangeBegin, true);
1175 // Disjoint ranges may have range in the middle of two instr,
1176 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1177 // can be Addr1+1 to Addr2-1. We should ignore such range.
1178 if (IP.Address > RangeEnd)
1179 continue;
1180
1181 do {
1182 const AddressProbesMap &Address2ProbesMap =
1183 Binary->getAddress2ProbesMap();
1184 for (const MCDecodedPseudoProbe &Probe :
1185 Address2ProbesMap.find(Address: IP.Address)) {
1186 ProbeCounter[&Probe] += Count;
1187 }
1188 } while (IP.advance() && IP.Address <= RangeEnd);
1189 }
1190}
1191
1192static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1193 const SmallVectorImpl<uint64_t> &AddrVec,
1194 ProfiledBinary *Binary) {
1195 SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
1196 for (auto Address : reverse(C: AddrVec)) {
1197 const MCDecodedPseudoProbe *CallProbe =
1198 Binary->getCallProbeForAddr(Address);
1199 // These could be the cases when a probe is not found at a calliste. Cutting
1200 // off the context from here since the inliner will not know how to consume
1201 // a context with unknown callsites.
1202 // 1. for functions that are not sampled when
1203 // --decode-probe-for-profiled-functions-only is on.
1204 // 2. for a merged callsite. Callsite merging may cause the loss of original
1205 // probe IDs.
1206 // 3. for an external callsite.
1207 if (!CallProbe)
1208 break;
1209 Probes.push_back(Elt: CallProbe);
1210 }
1211
1212 std::reverse(first: Probes.begin(), last: Probes.end());
1213
1214 // Extract context stack for reusing, leaf context stack will be added
1215 // compressed while looking up function profile.
1216 for (const auto *P : Probes) {
1217 Binary->getInlineContextForProbe(Probe: P, InlineContextStack&: ContextStack, IncludeLeaf: true);
1218 }
1219}
1220
1221void CSProfileGenerator::generateProbeBasedProfile() {
1222 // Enable pseudo probe functionalities in SampleProf
1223 FunctionSamples::ProfileIsProbeBased = true;
1224 for (const auto &CI : *SampleCounters) {
1225 const AddrBasedCtxKey *CtxKey =
1226 dyn_cast<AddrBasedCtxKey>(Val: CI.first.getPtr());
1227 // Fill in function body samples from probes, also infer caller's samples
1228 // from callee's probe
1229 populateBodySamplesWithProbes(RangeCounter: CI.second.RangeCounter, CtxKey);
1230 // Fill in boundary samples for a call probe
1231 populateBoundarySamplesWithProbes(BranchCounter: CI.second.BranchCounter, CtxKey);
1232 }
1233}
1234
1235void CSProfileGenerator::populateBodySamplesWithProbes(
1236 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
1237 ProbeCounterMap ProbeCounter;
1238 // Extract the top frame probes by looking up each address among the range in
1239 // the Address2ProbeMap
1240 extractProbesFromRange(RangeCounter, ProbeCounter);
1241 std::unordered_map<MCDecodedPseudoProbeInlineTree *,
1242 std::unordered_set<FunctionSamples *>>
1243 FrameSamples;
1244 for (const auto &PI : ProbeCounter) {
1245 const MCDecodedPseudoProbe *Probe = PI.first;
1246 uint64_t Count = PI.second;
1247 // Disjoint ranges have introduce zero-filled gap that
1248 // doesn't belong to current context, filter them out.
1249 if (!Probe->isBlock() || Count == 0)
1250 continue;
1251
1252 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, LeafProbe: Probe);
1253 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1254 // Record the current frame and FunctionProfile whenever samples are
1255 // collected for non-danglie probes. This is for reporting all of the
1256 // zero count probes of the frame later.
1257 FrameSamples[Probe->getInlineTreeNode()].insert(x: &FunctionProfile);
1258 FunctionProfile.addBodySamples(LineOffset: Probe->getIndex(), Discriminator: Probe->getDiscriminator(),
1259 Num: Count);
1260 FunctionProfile.addTotalSamples(Num: Count);
1261 if (Probe->isEntry()) {
1262 FunctionProfile.addHeadSamples(Num: Count);
1263 // Look up for the caller's function profile
1264 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1265 ContextTrieNode *CallerNode = ContextNode->getParentContext();
1266 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
1267 // Since the context id will be compressed, we have to use callee's
1268 // context id to infer caller's context id to ensure they share the
1269 // same context prefix.
1270 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
1271 uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator;
1272 assert(CallerIndex &&
1273 "Inferred caller's location index shouldn't be zero!");
1274 assert(!CallerDiscriminator &&
1275 "Callsite probe should not have a discriminator!");
1276 FunctionSamples &CallerProfile =
1277 *getOrCreateFunctionSamples(ContextNode: CallerNode);
1278 CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1279 CallerProfile.addBodySamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator, Num: Count);
1280 CallerProfile.addTotalSamples(Num: Count);
1281 CallerProfile.addCalledTargetSamples(LineOffset: CallerIndex, Discriminator: CallerDiscriminator,
1282 Func: ContextNode->getFuncName(), Num: Count);
1283 }
1284 }
1285 }
1286
1287 // Assign zero count for remaining probes without sample hits to
1288 // differentiate from probes optimized away, of which the counts are unknown
1289 // and will be inferred by the compiler.
1290 for (auto &I : FrameSamples) {
1291 for (auto *FunctionProfile : I.second) {
1292 for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) {
1293 FunctionProfile->addBodySamples(LineOffset: Probe.getIndex(),
1294 Discriminator: Probe.getDiscriminator(), Num: 0);
1295 }
1296 }
1297 }
1298}
1299
1300void CSProfileGenerator::populateBoundarySamplesWithProbes(
1301 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) {
1302 for (const auto &BI : BranchCounter) {
1303 uint64_t SourceAddress = BI.first.first;
1304 uint64_t TargetAddress = BI.first.second;
1305 uint64_t Count = BI.second;
1306 const MCDecodedPseudoProbe *CallProbe =
1307 Binary->getCallProbeForAddr(Address: SourceAddress);
1308 if (CallProbe == nullptr)
1309 continue;
1310 FunctionSamples &FunctionProfile =
1311 getFunctionProfileForLeafProbe(CtxKey, LeafProbe: CallProbe);
1312 FunctionProfile.addBodySamples(LineOffset: CallProbe->getIndex(), Discriminator: 0, Num: Count);
1313 FunctionProfile.addTotalSamples(Num: Count);
1314 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1315 if (CalleeName.size() == 0)
1316 continue;
1317 FunctionProfile.addCalledTargetSamples(LineOffset: CallProbe->getIndex(),
1318 Discriminator: CallProbe->getDiscriminator(),
1319 Func: FunctionId(CalleeName), Num: Count);
1320 }
1321}
1322
1323ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe(
1324 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1325
1326 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context;
1327 SmallVector<uint64_t, 16> NewContext;
1328
1329 if (InferMissingFrames) {
1330 SmallVector<uint64_t, 16> Context = CtxKey->Context;
1331 // Append leaf frame for a complete inference.
1332 Context.push_back(Elt: LeafProbe->getAddress());
1333 inferMissingFrames(Context, NewContext);
1334 // Pop out the leaf probe that was pushed in above.
1335 NewContext.pop_back();
1336 PContext = &NewContext;
1337 }
1338
1339 SampleContextFrameVector ContextStack;
1340 extractPrefixContextStack(ContextStack, AddrVec: *PContext, Binary);
1341
1342 // Explicitly copy the context for appending the leaf context
1343 SampleContextFrameVector NewContextStack(ContextStack.begin(),
1344 ContextStack.end());
1345 Binary->getInlineContextForProbe(Probe: LeafProbe, InlineContextStack&: NewContextStack, IncludeLeaf: true);
1346 // For leaf inlined context with the top frame, we should strip off the top
1347 // frame's probe id, like:
1348 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1349 auto LeafFrame = NewContextStack.back();
1350 LeafFrame.Location = LineLocation(0, 0);
1351 NewContextStack.pop_back();
1352 // Compress the context string except for the leaf frame
1353 CSProfileGenerator::compressRecursionContext(Context&: NewContextStack);
1354 CSProfileGenerator::trimContext(S&: NewContextStack);
1355 NewContextStack.push_back(Elt: LeafFrame);
1356
1357 const auto *FuncDesc = Binary->getFuncDescForGUID(GUID: LeafProbe->getGuid());
1358 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1359 ContextTrieNode *ContextNode =
1360 getOrCreateContextNode(Context: NewContextStack, WasLeafInlined);
1361 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash);
1362 return ContextNode;
1363}
1364
1365FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1366 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1367 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples();
1368}
1369
1370} // end namespace sampleprof
1371} // end namespace llvm
1372