1 | //===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | #include "ProfileGenerator.h" |
9 | #include "ErrorHandling.h" |
10 | #include "MissingFrameInferrer.h" |
11 | #include "PerfReader.h" |
12 | #include "ProfiledBinary.h" |
13 | #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" |
14 | #include "llvm/ProfileData/ProfileCommon.h" |
15 | #include <algorithm> |
16 | #include <float.h> |
17 | #include <unordered_set> |
18 | #include <utility> |
19 | |
20 | cl::opt<std::string> OutputFilename("output" , cl::value_desc("output" ), |
21 | cl::Required, |
22 | cl::desc("Output profile file" )); |
23 | static cl::alias OutputA("o" , cl::desc("Alias for --output" ), |
24 | cl::aliasopt(OutputFilename)); |
25 | |
26 | static cl::opt<SampleProfileFormat> OutputFormat( |
27 | "format" , cl::desc("Format of output profile" ), cl::init(Val: SPF_Ext_Binary), |
28 | cl::values( |
29 | clEnumValN(SPF_Binary, "binary" , "Binary encoding (default)" ), |
30 | clEnumValN(SPF_Ext_Binary, "extbinary" , "Extensible binary encoding" ), |
31 | clEnumValN(SPF_Text, "text" , "Text encoding" ), |
32 | clEnumValN(SPF_GCC, "gcc" , |
33 | "GCC encoding (only meaningful for -sample)" ))); |
34 | |
35 | static cl::opt<bool> UseMD5( |
36 | "use-md5" , cl::Hidden, |
37 | cl::desc("Use md5 to represent function names in the output profile (only " |
38 | "meaningful for -extbinary)" )); |
39 | |
40 | static cl::opt<bool> PopulateProfileSymbolList( |
41 | "populate-profile-symbol-list" , cl::init(Val: false), cl::Hidden, |
42 | cl::desc("Populate profile symbol list (only meaningful for -extbinary)" )); |
43 | |
44 | static cl::opt<bool> FillZeroForAllFuncs( |
45 | "fill-zero-for-all-funcs" , cl::init(Val: false), cl::Hidden, |
46 | cl::desc("Attribute all functions' range with zero count " |
47 | "even it's not hit by any samples." )); |
48 | |
49 | static cl::opt<int32_t, true> RecursionCompression( |
50 | "compress-recursion" , |
51 | cl::desc("Compressing recursion by deduplicating adjacent frame " |
52 | "sequences up to the specified size. -1 means no size limit." ), |
53 | cl::Hidden, |
54 | cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
55 | |
56 | static cl::opt<bool> |
57 | TrimColdProfile("trim-cold-profile" , |
58 | cl::desc("If the total count of the profile is smaller " |
59 | "than threshold, it will be trimmed." )); |
60 | |
61 | static cl::opt<bool> CSProfMergeColdContext( |
62 | "csprof-merge-cold-context" , cl::init(Val: true), |
63 | cl::desc("If the total count of context profile is smaller than " |
64 | "the threshold, it will be merged into context-less base " |
65 | "profile." )); |
66 | |
67 | static cl::opt<uint32_t> CSProfMaxColdContextDepth( |
68 | "csprof-max-cold-context-depth" , cl::init(Val: 1), |
69 | cl::desc("Keep the last K contexts while merging cold profile. 1 means the " |
70 | "context-less base profile" )); |
71 | |
72 | static cl::opt<int, true> CSProfMaxContextDepth( |
73 | "csprof-max-context-depth" , |
74 | cl::desc("Keep the last K contexts while merging profile. -1 means no " |
75 | "depth limit." ), |
76 | cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
77 | |
78 | static cl::opt<double> ProfileDensityThreshold( |
79 | "profile-density-threshold" , llvm::cl::init(Val: 50), |
80 | llvm::cl::desc("If the profile density is below the given threshold, it " |
81 | "will be suggested to increase the sampling rate." ), |
82 | llvm::cl::Optional); |
83 | static cl::opt<bool> ShowDensity("show-density" , llvm::cl::init(Val: false), |
84 | llvm::cl::desc("show profile density details" ), |
85 | llvm::cl::Optional); |
86 | static cl::opt<int> ProfileDensityCutOffHot( |
87 | "profile-density-cutoff-hot" , llvm::cl::init(Val: 990000), |
88 | llvm::cl::desc("Total samples cutoff for functions used to calculate " |
89 | "profile density." )); |
90 | |
91 | static cl::opt<bool> UpdateTotalSamples( |
92 | "update-total-samples" , llvm::cl::init(Val: false), |
93 | llvm::cl::desc( |
94 | "Update total samples by accumulating all its body samples." ), |
95 | llvm::cl::Optional); |
96 | |
97 | static cl::opt<bool> GenCSNestedProfile( |
98 | "gen-cs-nested-profile" , cl::Hidden, cl::init(Val: true), |
99 | cl::desc("Generate nested function profiles for CSSPGO" )); |
100 | |
101 | cl::opt<bool> InferMissingFrames( |
102 | "infer-missing-frames" , llvm::cl::init(Val: true), |
103 | llvm::cl::desc( |
104 | "Infer missing call frames due to compiler tail call elimination." ), |
105 | llvm::cl::Optional); |
106 | |
107 | extern cl::opt<bool> LeadingIPOnly; |
108 | |
109 | using namespace llvm; |
110 | using namespace sampleprof; |
111 | |
112 | namespace llvm { |
113 | extern cl::opt<int> ProfileSummaryCutoffHot; |
114 | extern cl::opt<bool> UseContextLessSummary; |
115 | |
116 | namespace sampleprof { |
117 | |
118 | // Initialize the MaxCompressionSize to -1 which means no size limit |
119 | int32_t CSProfileGenerator::MaxCompressionSize = -1; |
120 | |
121 | int CSProfileGenerator::MaxContextDepth = -1; |
122 | |
123 | bool ProfileGeneratorBase::UseFSDiscriminator = false; |
124 | |
125 | std::unique_ptr<ProfileGeneratorBase> |
126 | ProfileGeneratorBase::create(ProfiledBinary *Binary, |
127 | const ContextSampleCounterMap *SampleCounters, |
128 | bool ProfileIsCS) { |
129 | std::unique_ptr<ProfileGeneratorBase> Generator; |
130 | if (ProfileIsCS) { |
131 | Generator.reset(new CSProfileGenerator(Binary, SampleCounters));
132 | } else {
133 | Generator.reset(new ProfileGenerator(Binary, SampleCounters));
134 | } |
135 | ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); |
136 | FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); |
137 | |
138 | return Generator; |
139 | } |
140 | |
141 | std::unique_ptr<ProfileGeneratorBase> |
142 | ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles, |
143 | bool ProfileIsCS) { |
144 | std::unique_ptr<ProfileGeneratorBase> Generator; |
145 | if (ProfileIsCS) { |
146 | Generator.reset(new CSProfileGenerator(Binary, Profiles));
147 | } else {
148 | Generator.reset(new ProfileGenerator(Binary, std::move(Profiles)));
149 | } |
150 | ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); |
151 | FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); |
152 | |
153 | return Generator; |
154 | } |
155 | |
156 | void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer, |
157 | SampleProfileMap &ProfileMap) { |
158 | // Populate profile symbol list if extended binary format is used. |
159 | ProfileSymbolList SymbolList; |
160 | |
161 | if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { |
162 | Binary->populateSymbolListFromDWARF(SymbolList); |
163 | Writer->setProfileSymbolList(&SymbolList); |
164 | } |
165 | |
166 | if (std::error_code EC = Writer->write(ProfileMap)) |
167 | exitWithError(std::move(EC));
168 | } |
169 | |
170 | void ProfileGeneratorBase::write() { |
171 | auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
172 | if (std::error_code EC = WriterOrErr.getError())
173 | exitWithError(EC, OutputFilename);
174 | |
175 | if (UseMD5) { |
176 | if (OutputFormat != SPF_Ext_Binary) |
177 | WithColor::warning() << "-use-md5 is ignored. Specify " |
178 | "--format=extbinary to enable it\n" ; |
179 | else |
180 | WriterOrErr.get()->setUseMD5(); |
181 | } |
182 | |
183 | write(std::move(WriterOrErr.get()), ProfileMap);
184 | } |
185 | |
186 | void ProfileGeneratorBase::showDensitySuggestion(double Density) { |
187 | if (Density == 0.0) |
188 | WithColor::warning() << "The output profile is empty or the " |
189 | "--profile-density-cutoff-hot option is " |
190 | "set too low. Please check your command.\n" ; |
191 | else if (Density < ProfileDensityThreshold) |
192 | WithColor::warning() |
193 | << "Sample PGO is estimated to optimize better with " |
194 | << format("%.1f" , ProfileDensityThreshold / Density)
195 | << "x more samples. Please consider increasing sampling rate or " |
196 | "profiling for longer duration to get more samples.\n" ; |
197 | |
198 | if (ShowDensity) |
199 | outs() << "Functions with density >= " << format(Fmt: "%.1f" , Vals: Density) |
200 | << " account for " |
201 | << format(Fmt: "%.2f" , |
202 | Vals: static_cast<double>(ProfileDensityCutOffHot) / 10000) |
203 | << "% total sample counts.\n" ; |
204 | } |
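// Illustrative sketch of the suggestion arithmetic above (hypothetical
// numbers, not part of the upstream logic): with the default
// --profile-density-threshold of 50 and a computed density of 10.0, the
// warning is emitted with format("%.1f", 50.0 / 10.0), i.e. it suggests
// collecting roughly "5.0x more samples".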
205 | |
206 | bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) { |
207 | for (const auto &Prefix : FuncPrefixsToFilter) { |
208 | if (FS.getFuncName().starts_with(Prefix)) |
209 | return true; |
210 | } |
211 | |
212 | // Filter the function profiles for the inlinees. This is useful for fuzzy
213 | // profile matching, which flattens the profile so that inlinees' samples
214 | // are merged into the top-level function.
215 | for (auto &Callees : |
216 | const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) { |
217 | auto &CalleesMap = Callees.second; |
218 | for (auto I = CalleesMap.begin(); I != CalleesMap.end();) { |
219 | auto FS = I++; |
220 | if (filterAmbiguousProfile(FS->second))
221 | CalleesMap.erase(FS);
222 | } |
223 | } |
224 | return false; |
225 | } |
226 | |
227 | // For built-in local initialization functions such as __cxx_global_var_init
228 | // and __tls_init prefixed functions, there can be multiple versions of the
229 | // function in the final binary. However, during profile generation we call
230 | // getCanonicalFnName to canonicalize the names, which strips the suffixes.
231 | // Therefore, samples from the different versions query the same profile and
232 | // the samples are merged. As the functions are essentially different, entries
233 | // of the merged profile are ambiguous. In the sample loader, the IR from one
234 | // version would be attributed towards the merged entries, which is inaccurate.
235 | // Especially for fuzzy profile matching, one callsite is matched against
236 | // multiple callsites (from different functions), which misleads the matching
237 | // and causes many false-positive reports. Hence, we want to filter them out
238 | // from the profile map during profile generation time. The profiles are all
239 | // for cold functions, so this won't have a perf impact.
240 | void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) { |
241 | for (auto I = ProfileMap.begin(); I != ProfileMap.end();) { |
242 | auto FS = I++; |
243 | if (filterAmbiguousProfile(FS->second))
244 | ProfileMap.erase(FS);
245 | } |
246 | } |
247 | |
248 | void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, |
249 | const RangeSample &Ranges) { |
250 | |
251 | /* |
252 | Regions may overlap with each other. Using the boundary info, find all |
253 | disjoint ranges and their sample count. BoundaryPoint contains the counts of
254 | the samples that begin/end at each point.
255 | |
256 | |<--100-->| Sample1 |
257 | |<------200------>| Sample2 |
258 | A B C |
259 | |
260 | In the example above, |
261 | Sample1 begins at A, ends at B, its value is 100. |
262 | Sample2 begins at A, ends at C, its value is 200.
263 | For A, BeginCount is the sum of samples beginning at A, which is 300, and no
264 | samples end at A, so EndCount is 0.
265 | Then boundary points A, B, and C with begin/end counts are: |
266 | A: (300, 0) |
267 | B: (0, 100) |
268 | C: (0, 200) |
269 | */ |
270 | struct BoundaryPoint { |
271 | // Sum of sample counts beginning at this point |
272 | uint64_t BeginCount = UINT64_MAX; |
273 | // Sum of sample counts ending at this point |
274 | uint64_t EndCount = UINT64_MAX; |
275 | // Is the begin point of a zero range. |
276 | bool IsZeroRangeBegin = false; |
277 | // Is the end point of a zero range. |
278 | bool IsZeroRangeEnd = false; |
279 | |
280 | void addBeginCount(uint64_t Count) { |
281 | if (BeginCount == UINT64_MAX) |
282 | BeginCount = 0; |
283 | BeginCount += Count; |
284 | } |
285 | |
286 | void addEndCount(uint64_t Count) { |
287 | if (EndCount == UINT64_MAX) |
288 | EndCount = 0; |
289 | EndCount += Count; |
290 | } |
291 | }; |
292 | |
293 | /* |
294 | For the above example, with the boundary points, the following logic finds two
295 | disjoint regions:
296 | |
297 | [A,B]: 300 |
298 | [B+1,C]: 200 |
299 | |
300 | If there is a boundary point that is both a begin and an end point, the point
301 | itself becomes a separate disjoint region. For example, if we have original
302 | ranges of |
303 | |
304 | |<--- 100 --->| |
305 | |<--- 200 --->| |
306 | A B C |
307 | |
308 | there are three boundary points with their begin/end counts of |
309 | |
310 | A: (100, 0) |
311 | B: (200, 100) |
312 | C: (0, 200) |
313 | |
314 | the disjoint ranges would be |
315 | |
316 | [A, B-1]: 100 |
317 | [B, B]: 300 |
318 | [B+1, C]: 200. |
319 | |
320 | Example for zero value range: |
321 | |
322 | |<--- 100 --->| |
323 | |<--- 200 --->| |
324 | |<--------------- 0 ----------------->| |
325 | A B C D E F |
326 | |
327 | [A, B-1] : 0 |
328 | [B, C] : 100 |
329 | [C+1, D-1]: 0 |
330 | [D, E] : 200 |
331 | [E+1, F] : 0 |
332 | */ |
333 | std::map<uint64_t, BoundaryPoint> Boundaries; |
334 | |
335 | for (const auto &Item : Ranges) { |
336 | assert(Item.first.first <= Item.first.second && |
337 | "Invalid instruction range" ); |
338 | auto &BeginPoint = Boundaries[Item.first.first]; |
339 | auto &EndPoint = Boundaries[Item.first.second]; |
340 | uint64_t Count = Item.second; |
341 | |
342 | BeginPoint.addBeginCount(Count); |
343 | EndPoint.addEndCount(Count); |
344 | if (Count == 0) { |
345 | BeginPoint.IsZeroRangeBegin = true; |
346 | EndPoint.IsZeroRangeEnd = true; |
347 | } |
348 | } |
349 | |
350 | // Use UINT64_MAX to indicate there is no existing range between BeginAddress |
351 | // and the next valid address |
352 | uint64_t BeginAddress = UINT64_MAX; |
353 | int ZeroRangeDepth = 0; |
354 | uint64_t Count = 0; |
355 | for (const auto &Item : Boundaries) { |
356 | uint64_t Address = Item.first; |
357 | const BoundaryPoint &Point = Item.second; |
358 | if (Point.BeginCount != UINT64_MAX) { |
359 | if (BeginAddress != UINT64_MAX) |
360 | DisjointRanges[{BeginAddress, Address - 1}] = Count; |
361 | Count += Point.BeginCount; |
362 | BeginAddress = Address; |
363 | ZeroRangeDepth += Point.IsZeroRangeBegin; |
364 | } |
365 | if (Point.EndCount != UINT64_MAX) { |
366 | assert((BeginAddress != UINT64_MAX) && |
367 | "First boundary point cannot be 'end' point" ); |
368 | DisjointRanges[{BeginAddress, Address}] = Count; |
369 | assert(Count >= Point.EndCount && "Mismatched live ranges" ); |
370 | Count -= Point.EndCount; |
371 | BeginAddress = Address + 1; |
372 | ZeroRangeDepth -= Point.IsZeroRangeEnd; |
373 | // If the remaining count is zero and we are no longer in a zero range, it
374 | // means all the preceding ranges have been consumed, thus mark BeginAddress
375 | // as UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
376 | // [<---- 10 ---->] |
377 | // [<---- 20 ---->] |
378 | // A B C D |
379 | // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't |
380 | // have the [B+1, C-1] zero range. |
381 | if (Count == 0 && ZeroRangeDepth == 0) |
382 | BeginAddress = UINT64_MAX; |
383 | } |
384 | } |
385 | } |
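// Illustrative usage sketch (hypothetical values, mirroring the comments
// above):
//   RangeSample Ranges = {{{0x10, 0x20}, 100}, {{0x10, 0x30}, 200}};
//   RangeSample Disjoint;
//   findDisjointRanges(Disjoint, Ranges);
// would be expected to yield {[0x10, 0x20]: 300, [0x21, 0x30]: 200}, matching
// the boundary-point walk described in the comments of this function.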
386 | |
387 | void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( |
388 | FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, |
389 | uint64_t Count) { |
390 | // Use the maximum count of samples with same line location |
391 | uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
392 | |
393 | if (LeadingIPOnly) { |
394 | // When computing an IP-based profile we take the SUM of counts at the |
395 | // location instead of applying duplication factors and taking the MAX. |
396 | FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
397 | Count);
398 | } else { |
399 | // Otherwise, use duplication factor to compensate for loop |
400 | // unroll/vectorization. Note that this is only needed when we're taking |
401 | // MAX of the counts at the location instead of SUM. |
402 | Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
403 | |
404 | ErrorOr<uint64_t> R = FunctionProfile.findSamplesAt(
405 | LeafLoc.Location.LineOffset, Discriminator);
406 | |
407 | uint64_t PreviousCount = R ? R.get() : 0; |
408 | if (PreviousCount <= Count) { |
409 | FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
410 | Count - PreviousCount);
411 | } |
412 | } |
413 | } |
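// Illustrative sketch of the two policies above (hypothetical numbers): with a
// raw Count of 10 and a discriminator encoding a duplication factor of 4, the
// default path scales Count to 40 and then adds 40 - PreviousCount, so the body
// sample converges to the MAX of the scaled counts seen at that location. When
// LeadingIPOnly is set, the same location simply accumulates 10 per call, i.e.
// the SUM of the raw counts, and no duplication factor is applied.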
414 | |
415 | void ProfileGeneratorBase::updateTotalSamples() { |
416 | for (auto &Item : ProfileMap) { |
417 | FunctionSamples &FunctionProfile = Item.second; |
418 | FunctionProfile.updateTotalSamples(); |
419 | } |
420 | } |
421 | |
422 | void ProfileGeneratorBase::updateCallsiteSamples() { |
423 | for (auto &Item : ProfileMap) { |
424 | FunctionSamples &FunctionProfile = Item.second; |
425 | FunctionProfile.updateCallsiteSamples(); |
426 | } |
427 | } |
428 | |
429 | void ProfileGeneratorBase::updateFunctionSamples() { |
430 | updateCallsiteSamples(); |
431 | |
432 | if (UpdateTotalSamples) |
433 | updateTotalSamples(); |
434 | } |
435 | |
436 | void ProfileGeneratorBase::collectProfiledFunctions() { |
437 | std::unordered_set<const BinaryFunction *> ProfiledFunctions; |
438 | if (collectFunctionsFromRawProfile(ProfiledFunctions)) |
439 | Binary->setProfiledFunctions(ProfiledFunctions); |
440 | else if (collectFunctionsFromLLVMProfile(ProfiledFunctions)) |
441 | Binary->setProfiledFunctions(ProfiledFunctions); |
442 | else |
443 | llvm_unreachable("Unsupported input profile" ); |
444 | } |
445 | |
446 | bool ProfileGeneratorBase::collectFunctionsFromRawProfile( |
447 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { |
448 | if (!SampleCounters) |
449 | return false; |
450 | // Go through all the stacks, ranges and branches in sample counters, use
451 | // the start of the range to look up the function it belongs to and record
452 | // the function.
453 | for (const auto &CI : *SampleCounters) { |
454 | if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) {
455 | for (auto StackAddr : CtxKey->Context) {
456 | if (FuncRange *FRange = Binary->findFuncRange(StackAddr))
457 | ProfiledFunctions.insert(FRange->Func);
458 | } |
459 | } |
460 | |
461 | for (auto Item : CI.second.RangeCounter) { |
462 | uint64_t StartAddress = Item.first.first; |
463 | if (FuncRange *FRange = Binary->findFuncRange(StartAddress))
464 | ProfiledFunctions.insert(FRange->Func);
465 | } |
466 | |
467 | for (auto Item : CI.second.BranchCounter) { |
468 | uint64_t SourceAddress = Item.first.first; |
469 | uint64_t TargetAddress = Item.first.second; |
470 | if (FuncRange *FRange = Binary->findFuncRange(SourceAddress))
471 | ProfiledFunctions.insert(FRange->Func);
472 | if (FuncRange *FRange = Binary->findFuncRange(TargetAddress))
473 | ProfiledFunctions.insert(FRange->Func);
474 | } |
475 | } |
476 | return true; |
477 | } |
478 | |
479 | bool ProfileGenerator::collectFunctionsFromLLVMProfile( |
480 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { |
481 | for (const auto &FS : ProfileMap) { |
482 | if (auto *Func = Binary->getBinaryFunction(FS.second.getFunction()))
483 | ProfiledFunctions.insert(Func);
484 | } |
485 | return true; |
486 | } |
487 | |
488 | bool CSProfileGenerator::collectFunctionsFromLLVMProfile( |
489 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { |
490 | for (auto *Node : ContextTracker) { |
491 | if (!Node->getFuncName().empty()) |
492 | if (auto *Func = Binary->getBinaryFunction(Node->getFuncName()))
493 | ProfiledFunctions.insert(Func);
494 | } |
495 | return true; |
496 | } |
497 | |
498 | FunctionSamples & |
499 | ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) { |
500 | SampleContext Context(FuncName); |
501 | return ProfileMap.create(Context);
502 | } |
503 | |
504 | void ProfileGenerator::generateProfile() { |
505 | collectProfiledFunctions(); |
506 | |
507 | if (Binary->usePseudoProbes()) |
508 | Binary->decodePseudoProbe(); |
509 | |
510 | if (SampleCounters) { |
511 | if (Binary->usePseudoProbes()) { |
512 | generateProbeBasedProfile(); |
513 | } else { |
514 | generateLineNumBasedProfile(); |
515 | } |
516 | } |
517 | |
518 | postProcessProfiles(); |
519 | } |
520 | |
521 | void ProfileGenerator::postProcessProfiles() { |
522 | computeSummaryAndThreshold(ProfileMap); |
523 | trimColdProfiles(ProfileMap, ColdCountThreshold);
524 | filterAmbiguousProfile(ProfileMap);
525 | calculateAndShowDensity(ProfileMap);
526 | } |
527 | |
528 | void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, |
529 | uint64_t ColdCntThreshold) { |
530 | if (!TrimColdProfile) |
531 | return; |
532 | |
533 | // Move cold profiles into a tmp container. |
534 | std::vector<hash_code> ColdProfileHashes; |
535 | for (const auto &I : ProfileMap) { |
536 | if (I.second.getTotalSamples() < ColdCntThreshold) |
537 | ColdProfileHashes.emplace_back(I.first);
538 | } |
539 | |
540 | // Remove the cold profile from ProfileMap. |
541 | for (const auto &I : ColdProfileHashes) |
542 | ProfileMap.erase(I);
543 | } |
544 | |
545 | void ProfileGenerator::generateLineNumBasedProfile() { |
546 | assert(SampleCounters->size() == 1 && |
547 | "Must have one entry for profile generation." ); |
548 | const SampleCounter &SC = SampleCounters->begin()->second; |
549 | // Fill in function body samples |
550 | populateBodySamplesForAllFunctions(SC.RangeCounter);
551 | // Fill in boundary sample counts as well as call site samples for calls
552 | populateBoundarySamplesForAllFunctions(SC.BranchCounter);
553 | |
554 | updateFunctionSamples(); |
555 | } |
556 | |
557 | void ProfileGenerator::generateProbeBasedProfile() { |
558 | assert(SampleCounters->size() == 1 && |
559 | "Must have one entry for profile generation." ); |
560 | // Enable pseudo probe functionalities in SampleProf |
561 | FunctionSamples::ProfileIsProbeBased = true; |
562 | const SampleCounter &SC = SampleCounters->begin()->second; |
563 | // Fill in function body samples |
564 | populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter);
565 | // Fill in boundary sample counts as well as call site samples for calls
566 | populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter);
567 | |
568 | updateFunctionSamples(); |
569 | } |
570 | |
571 | void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( |
572 | const RangeSample &RangeCounter) { |
573 | ProbeCounterMap ProbeCounter; |
574 | // preprocessRangeCounter returns disjoint ranges, so there is no need to
575 | // redo that inside extractProbesFromRange.
576 | extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter,
577 | false);
578 | |
579 | for (const auto &PI : ProbeCounter) { |
580 | const MCDecodedPseudoProbe *Probe = PI.first; |
581 | uint64_t Count = PI.second; |
582 | SampleContextFrameVector FrameVec; |
583 | Binary->getInlineContextForProbe(Probe, FrameVec, true);
584 | FunctionSamples &FunctionProfile = |
585 | getLeafProfileAndAddTotalSamples(FrameVec, Count); |
586 | FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(),
587 | Count);
588 | if (Probe->isEntry())
589 | FunctionProfile.addHeadSamples(Count);
590 | } |
591 | } |
592 | |
593 | void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( |
594 | const BranchSample &BranchCounters) { |
595 | for (const auto &Entry : BranchCounters) { |
596 | uint64_t SourceAddress = Entry.first.first; |
597 | uint64_t TargetAddress = Entry.first.second; |
598 | uint64_t Count = Entry.second; |
599 | assert(Count != 0 && "Unexpected zero weight branch" ); |
600 | |
601 | StringRef CalleeName = getCalleeNameForAddress(TargetAddress); |
602 | if (CalleeName.size() == 0) |
603 | continue; |
604 | |
605 | const MCDecodedPseudoProbe *CallProbe = |
606 | Binary->getCallProbeForAddr(SourceAddress);
607 | if (CallProbe == nullptr) |
608 | continue; |
609 | |
610 | // Record called target sample and its count. |
611 | SampleContextFrameVector FrameVec; |
612 | Binary->getInlineContextForProbe(CallProbe, FrameVec, true);
613 | |
614 | if (!FrameVec.empty()) { |
615 | FunctionSamples &FunctionProfile = |
616 | getLeafProfileAndAddTotalSamples(FrameVec, 0);
617 | FunctionProfile.addCalledTargetSamples(
618 | FrameVec.back().Location.LineOffset,
619 | FrameVec.back().Location.Discriminator,
620 | FunctionId(CalleeName), Count);
621 | } |
622 | } |
623 | } |
624 | |
625 | FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( |
626 | const SampleContextFrameVector &FrameVec, uint64_t Count) { |
627 | // Get top level profile |
628 | FunctionSamples *FunctionProfile = |
629 | &getTopLevelFunctionProfile(FrameVec[0].Func);
630 | FunctionProfile->addTotalSamples(Count);
631 | if (Binary->usePseudoProbes()) {
632 | const auto *FuncDesc = Binary->getFuncDescForGUID(
633 | FunctionProfile->getFunction().getHashCode());
634 | FunctionProfile->setFunctionHash(FuncDesc->FuncHash); |
635 | } |
636 | |
637 | for (size_t I = 1; I < FrameVec.size(); I++) { |
638 | LineLocation Callsite( |
639 | FrameVec[I - 1].Location.LineOffset, |
640 | getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator));
641 | FunctionSamplesMap &SamplesMap =
642 | FunctionProfile->functionSamplesAt(Callsite);
643 | auto Ret =
644 | SamplesMap.emplace(FrameVec[I].Func, FunctionSamples());
645 | if (Ret.second) { |
646 | SampleContext Context(FrameVec[I].Func); |
647 | Ret.first->second.setContext(Context); |
648 | } |
649 | FunctionProfile = &Ret.first->second; |
650 | FunctionProfile->addTotalSamples(Count);
651 | if (Binary->usePseudoProbes()) {
652 | const auto *FuncDesc = Binary->getFuncDescForGUID(
653 | FunctionProfile->getFunction().getHashCode());
654 | FunctionProfile->setFunctionHash(FuncDesc->FuncHash); |
655 | } |
656 | } |
657 | |
658 | return *FunctionProfile; |
659 | } |
660 | |
661 | RangeSample |
662 | ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { |
663 | RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); |
664 | if (FillZeroForAllFuncs) { |
665 | for (auto &FuncI : Binary->getAllBinaryFunctions()) { |
666 | for (auto &R : FuncI.second.Ranges) { |
667 | Ranges[{R.first, R.second - 1}] += 0; |
668 | } |
669 | } |
670 | } else { |
671 | // For each range, we search for all ranges of the function it belongs to
672 | // and initialize them with zero count, so they remain zero if not hit by
673 | // any samples. This is to be consistent with the compiler, which interprets
674 | // a zero count as unexecuted (cold).
675 | for (const auto &I : RangeCounter) {
676 | uint64_t StartAddress = I.first.first;
677 | for (const auto &Range : Binary->getRanges(StartAddress))
678 | Ranges[{Range.first, Range.second - 1}] += 0; |
679 | } |
680 | } |
681 | RangeSample DisjointRanges; |
682 | findDisjointRanges(DisjointRanges, Ranges); |
683 | return DisjointRanges; |
684 | } |
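// Illustrative sketch (hypothetical addresses): if a function covers the range
// [A, D] and the raw RangeCounter only contains {[B, C]: 100}, the zero-filling
// above plus findDisjointRanges produce {[A, B-1]: 0, [B, C]: 100, [C+1, D]: 0},
// so the unexecuted portions of the function are explicitly reported as cold
// instead of being absent from the profile.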
685 | |
686 | void ProfileGenerator::populateBodySamplesForAllFunctions( |
687 | const RangeSample &RangeCounter) { |
688 | for (const auto &Range : preprocessRangeCounter(RangeCounter)) { |
689 | uint64_t RangeBegin = Range.first.first; |
690 | uint64_t RangeEnd = Range.first.second; |
691 | uint64_t Count = Range.second; |
692 | |
693 | InstructionPointer IP(Binary, RangeBegin, true); |
694 | // Disjoint ranges may produce a range that lies between two instructions,
695 | // e.g. if Instr1 is at Addr1 and Instr2 is at Addr2, a disjoint range
696 | // can be Addr1+1 to Addr2-1. We should ignore such a range.
697 | if (IP.Address > RangeEnd) |
698 | continue; |
699 | |
700 | do { |
701 | const SampleContextFrameVector FrameVec = |
702 | Binary->getFrameLocationStack(IP.Address);
703 | if (!FrameVec.empty()) { |
704 | // FIXME: As accumulating total count per instruction caused some |
705 | // regression, we changed to accumulate total count per byte as a |
706 | // workaround. Tuning hotness threshold on the compiler side might be |
707 | // necessary in the future. |
708 | FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( |
709 | FrameVec, Count * Binary->getInstSize(IP.Address));
710 | updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(),
711 | Count); |
712 | } |
713 | } while (IP.advance() && IP.Address <= RangeEnd); |
714 | } |
715 | } |
716 | |
717 | StringRef |
718 | ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) { |
719 | // Get the function range by branch target if it's a call branch. |
720 | auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress);
721 | |
722 | // We won't accumulate sample count for a range whose start is not the real
723 | // function entry, such as an outlined function or an inner label.
724 | if (!FRange || !FRange->IsFuncEntry) |
725 | return StringRef(); |
726 | |
727 | return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
728 | } |
729 | |
730 | void ProfileGenerator::populateBoundarySamplesForAllFunctions( |
731 | const BranchSample &BranchCounters) { |
732 | for (const auto &Entry : BranchCounters) { |
733 | uint64_t SourceAddress = Entry.first.first; |
734 | uint64_t TargetAddress = Entry.first.second; |
735 | uint64_t Count = Entry.second; |
736 | assert(Count != 0 && "Unexpected zero weight branch" ); |
737 | |
738 | StringRef CalleeName = getCalleeNameForAddress(TargetAddress); |
739 | if (CalleeName.size() == 0) |
740 | continue; |
741 | // Record called target sample and its count. |
742 | const SampleContextFrameVector &FrameVec = |
743 | Binary->getCachedFrameLocationStack(SourceAddress);
744 | if (!FrameVec.empty()) {
745 | FunctionSamples &FunctionProfile =
746 | getLeafProfileAndAddTotalSamples(FrameVec, 0);
747 | FunctionProfile.addCalledTargetSamples(
748 | FrameVec.back().Location.LineOffset,
749 | getBaseDiscriminator(FrameVec.back().Location.Discriminator),
750 | FunctionId(CalleeName), Count);
751 | } |
752 | // Add head samples for callee. |
753 | FunctionSamples &CalleeProfile = |
754 | getTopLevelFunctionProfile(FunctionId(CalleeName));
755 | CalleeProfile.addHeadSamples(Count);
756 | } |
757 | } |
758 | |
759 | void ProfileGeneratorBase::calculateBodySamplesAndSize( |
760 | const FunctionSamples &FSamples, uint64_t &TotalBodySamples, |
761 | uint64_t &FuncBodySize) { |
762 | // Note that ideally the size should be the number of function instructions.
763 | // However, for a probe-based profile, we don't have an accurate instruction
764 | // count for each probe; instead, the probe sample is the sample count for
765 | // the whole block, which is equivalent to
766 | // total_instruction_samples/num_of_instructions in one block. Hence, we use
767 | // the number of probes as a proxy for the function's size.
768 | FuncBodySize += FSamples.getBodySamples().size(); |
769 | |
770 | // The accumulated body samples re-calculated here can differ from the
771 | // TotalSamples (getTotalSamples) field of FunctionSamples for a line-number
772 | // based profile. The reason is that TotalSamples is the sum of the samples of
773 | // all machine instructions in one source-code line, whereas the body-sample
774 | // entry only keeps the maximum of them, so TotalSamples is usually much
775 | // bigger than the accumulated body samples, as one source-code line can emit
776 | // many machine instructions. We observed a regression when we switched to
777 | // using the accumulated body samples (by using -update-total-samples). Hence,
778 | // it's safer to re-calculate here to avoid such discrepancy. There is no
779 | // problem for a probe-based profile, as its TotalSamples is exactly the same
780 | // as the accumulated body samples.
781 | for (const auto &I : FSamples.getBodySamples()) |
782 | TotalBodySamples += I.second.getSamples(); |
783 | |
784 | for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) |
785 | for (const auto &Callee : CallsiteSamples.second) { |
786 | // For binary-level density, the inlinees' samples and size should be |
787 | // included in the calculation. |
788 | calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
789 | FuncBodySize); |
790 | } |
791 | } |
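// Illustrative sketch (hypothetical numbers): for a probe-based FunctionSamples
// with body samples {probe1: 100, probe2: 0, probe3: 50} and one inlinee with
// body samples {probe1: 20}, the recursion above accumulates
// TotalBodySamples += 170 and FuncBodySize += 4 (three probes plus one inlinee
// probe), which is the samples/size pair later used for the density ratio.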
792 | |
793 | // Calculate profile density:
794 | // Calculate the density for each function and sort them in descending order,
795 | // then keep accumulating their total samples until the sum exceeds the
796 | // percentage threshold (cut-off) of the total profile samples. The profile
797 | // density is the last (minimum) function density among the processed
798 | // functions, which means all the functions hot to perf have a good density if
799 | // the profile density is good. The threshold (--profile-density-cutoff-hot)
800 | // is configurable depending on how much regression the system can tolerate.
801 | double |
802 | ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) { |
803 | double ProfileDensity = 0.0; |
804 | |
805 | uint64_t TotalProfileSamples = 0; |
806 | // A list of the function profile density and its total samples. |
807 | std::vector<std::pair<double, uint64_t>> FuncDensityList; |
808 | for (const auto &I : Profiles) { |
809 | uint64_t TotalBodySamples = 0; |
810 | uint64_t FuncBodySize = 0; |
811 | calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
812 | |
813 | if (FuncBodySize == 0) |
814 | continue; |
815 | |
816 | double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize; |
817 | TotalProfileSamples += TotalBodySamples; |
818 | FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
819 | } |
820 | |
821 | // Sorted by the density in descending order. |
822 | llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
823 | const std::pair<double, uint64_t> &B) { |
824 | if (A.first != B.first) |
825 | return A.first > B.first; |
826 | return A.second < B.second; |
827 | }); |
828 | |
829 | uint64_t AccumulatedSamples = 0; |
830 | uint32_t I = 0; |
831 | assert(ProfileDensityCutOffHot <= 1000000 && |
832 | "The cutoff value is greater than 1000000(100%)" ); |
833 | while (AccumulatedSamples < TotalProfileSamples * |
834 | static_cast<float>(ProfileDensityCutOffHot) / |
835 | 1000000 && |
836 | I < FuncDensityList.size()) { |
837 | AccumulatedSamples += FuncDensityList[I].second; |
838 | ProfileDensity = FuncDensityList[I].first; |
839 | I++; |
840 | } |
841 | |
842 | return ProfileDensity; |
843 | } |
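// Illustrative walk-through of the loop above (hypothetical numbers): with
// three functions whose (density, samples) pairs are (10.0, 900), (2.0, 90) and
// (0.5, 10), TotalProfileSamples is 1000 and the default cutoff of 990000
// corresponds to 990 samples. The loop accumulates 900, then 990, and stops
// before the third entry, so the reported profile density is 2.0, the minimum
// density among the functions covering the hottest 99% of the samples.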
844 | |
845 | void ProfileGeneratorBase::calculateAndShowDensity( |
846 | const SampleProfileMap &Profiles) { |
847 | double Density = calculateDensity(Profiles); |
848 | showDensitySuggestion(Density); |
849 | } |
850 | |
851 | FunctionSamples * |
852 | CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode, |
853 | bool WasLeafInlined) { |
854 | FunctionSamples *FProfile = ContextNode->getFunctionSamples(); |
855 | if (!FProfile) { |
856 | FSamplesList.emplace_back(); |
857 | FProfile = &FSamplesList.back(); |
858 | FProfile->setFunction(ContextNode->getFuncName()); |
859 | ContextNode->setFunctionSamples(FProfile); |
860 | } |
861 | // Update ContextWasInlined attribute for existing contexts. |
862 | // The current function can be called in two ways: |
863 | // - when processing a probe of the current frame |
864 | // - when processing the entry probe of an inlinee's frame, which |
865 | // is then used to update the callsite count of the current frame. |
866 | // The two can happen in any order, hence here we are making sure |
867 | // `ContextWasInlined` is always set as expected. |
868 | // TODO: Note that the former does not always happen if no probes of the
869 | // current frame have samples, and if only the latter happens, we could lose
870 | // the attribute. This should be fixed.
871 | if (WasLeafInlined) |
872 | FProfile->getContext().setAttribute(ContextWasInlined); |
873 | return FProfile; |
874 | } |
875 | |
876 | ContextTrieNode * |
877 | CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, |
878 | bool WasLeafInlined) { |
879 | ContextTrieNode *ContextNode = |
880 | ContextTracker.getOrCreateContextPath(Context, true);
881 | getOrCreateFunctionSamples(ContextNode, WasLeafInlined); |
882 | return ContextNode; |
883 | } |
884 | |
885 | void CSProfileGenerator::generateProfile() { |
886 | FunctionSamples::ProfileIsCS = true; |
887 | |
888 | collectProfiledFunctions(); |
889 | |
890 | if (Binary->usePseudoProbes()) { |
891 | Binary->decodePseudoProbe(); |
892 | if (InferMissingFrames) |
893 | initializeMissingFrameInferrer(); |
894 | } |
895 | |
896 | if (SampleCounters) { |
897 | if (Binary->usePseudoProbes()) { |
898 | generateProbeBasedProfile(); |
899 | } else { |
900 | generateLineNumBasedProfile(); |
901 | } |
902 | } |
903 | |
904 | if (Binary->getTrackFuncContextSize()) |
905 | computeSizeForProfiledFunctions(); |
906 | |
907 | postProcessProfiles(); |
908 | } |
909 | |
910 | void CSProfileGenerator::initializeMissingFrameInferrer() { |
911 | Binary->getMissingContextInferrer()->initialize(SampleCounters); |
912 | } |
913 | |
914 | void CSProfileGenerator::inferMissingFrames( |
915 | const SmallVectorImpl<uint64_t> &Context, |
916 | SmallVectorImpl<uint64_t> &NewContext) { |
917 | Binary->inferMissingFrames(Context, NewContext); |
918 | } |
919 | |
920 | void CSProfileGenerator::computeSizeForProfiledFunctions() { |
921 | for (auto *Func : Binary->getProfiledFunctions()) |
922 | Binary->computeInlinedContextSizeForFunc(Func); |
923 | |
924 | // Flush the symbolizer to save memory. |
925 | Binary->flushSymbolizer(); |
926 | } |
927 | |
928 | void CSProfileGenerator::updateFunctionSamples() { |
929 | for (auto *Node : ContextTracker) { |
930 | FunctionSamples *FSamples = Node->getFunctionSamples(); |
931 | if (FSamples) { |
932 | if (UpdateTotalSamples) |
933 | FSamples->updateTotalSamples(); |
934 | FSamples->updateCallsiteSamples(); |
935 | } |
936 | } |
937 | } |
938 | |
939 | void CSProfileGenerator::generateLineNumBasedProfile() { |
940 | for (const auto &CI : *SampleCounters) { |
941 | const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr());
942 | |
943 | ContextTrieNode *ContextNode = &getRootContext(); |
944 | // Sample context will be empty if the jump is an external-to-internal call;
945 | // in that case the head samples should be added for the internal function.
946 | if (!CtxKey->Context.empty()) { |
947 | // Get or create function profile for the range |
948 | ContextNode = |
949 | getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined);
950 | // Fill in function body samples |
951 | populateBodySamplesForFunction(*ContextNode->getFunctionSamples(),
952 | CI.second.RangeCounter);
953 | }
954 | // Fill in boundary sample counts as well as call site samples for calls
955 | populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter);
956 | } |
957 | // Fill in call site value sample for inlined calls and also use context to |
958 | // infer missing samples. Since we don't have call count for inlined |
959 | // functions, we estimate it from inlinee's profile using the entry of the |
960 | // body sample. |
961 | populateInferredFunctionSamples(getRootContext());
962 | |
963 | updateFunctionSamples(); |
964 | } |
965 | |
966 | void CSProfileGenerator::populateBodySamplesForFunction( |
967 | FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { |
968 | // Compute disjoint ranges first, so we can use MAX |
969 | // for calculating count for each location. |
970 | RangeSample Ranges; |
971 | findDisjointRanges(Ranges, RangeCounter);
972 | for (const auto &Range : Ranges) { |
973 | uint64_t RangeBegin = Range.first.first; |
974 | uint64_t RangeEnd = Range.first.second; |
975 | uint64_t Count = Range.second; |
976 | // Disjoint ranges may introduce zero-filled gaps that
977 | // don't belong to the current context; filter them out.
978 | if (Count == 0) |
979 | continue; |
980 | |
981 | InstructionPointer IP(Binary, RangeBegin, true); |
982 | // Disjoint ranges may produce a range that lies between two instructions,
983 | // e.g. if Instr1 is at Addr1 and Instr2 is at Addr2, a disjoint range
984 | // can be Addr1+1 to Addr2-1. We should ignore such a range.
985 | if (IP.Address > RangeEnd) |
986 | continue; |
987 | |
988 | do { |
989 | auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address);
990 | if (LeafLoc) {
991 | // Recording body sample for this specific context
992 | updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count);
993 | FunctionProfile.addTotalSamples(Count);
994 | } |
995 | } while (IP.advance() && IP.Address <= RangeEnd); |
996 | } |
997 | } |
998 | |
999 | void CSProfileGenerator::populateBoundarySamplesForFunction( |
1000 | ContextTrieNode *Node, const BranchSample &BranchCounters) { |
1001 | |
1002 | for (const auto &Entry : BranchCounters) { |
1003 | uint64_t SourceAddress = Entry.first.first; |
1004 | uint64_t TargetAddress = Entry.first.second; |
1005 | uint64_t Count = Entry.second; |
1006 | assert(Count != 0 && "Unexpected zero weight branch" ); |
1007 | |
1008 | StringRef CalleeName = getCalleeNameForAddress(TargetAddress); |
1009 | if (CalleeName.size() == 0) |
1010 | continue; |
1011 | |
1012 | ContextTrieNode *CallerNode = Node; |
1013 | LineLocation CalleeCallSite(0, 0); |
1014 | if (CallerNode != &getRootContext()) { |
1015 | // Record called target sample and its count |
1016 | auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress);
1017 | if (LeafLoc) {
1018 | CallerNode->getFunctionSamples()->addCalledTargetSamples(
1019 | LeafLoc->Location.LineOffset,
1020 | getBaseDiscriminator(LeafLoc->Location.Discriminator),
1021 | FunctionId(CalleeName),
1022 | Count);
1023 | // Record head sample for called target(callee) |
1024 | CalleeCallSite = LeafLoc->Location; |
1025 | } |
1026 | } |
1027 | |
1028 | ContextTrieNode *CalleeNode = |
1029 | CallerNode->getOrCreateChildContext(CalleeCallSite,
1030 | FunctionId(CalleeName));
1031 | FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode);
1032 | CalleeProfile->addHeadSamples(Count);
1033 | } |
1034 | } |
1035 | |
1036 | void CSProfileGenerator::populateInferredFunctionSamples( |
1037 | ContextTrieNode &Node) { |
1038 | // There is no call jmp sample between the inliner and inlinee, we need to use |
1039 | // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s |
1040 | // sample depends on child(inlinee)'s sample, so traverse the tree in |
1041 | // post-order. |
1042 | for (auto &It : Node.getAllChildContext()) |
1043 | populateInferredFunctionSamples(It.second);
1044 | |
1045 | FunctionSamples *CalleeProfile = Node.getFunctionSamples(); |
1046 | if (!CalleeProfile) |
1047 | return; |
1048 | // If we already have head sample counts, we must have value profile |
1049 | // for call sites added already. Skip to avoid double counting. |
1050 | if (CalleeProfile->getHeadSamples()) |
1051 | return; |
1052 | ContextTrieNode *CallerNode = Node.getParentContext(); |
1053 | // If we don't have context, nothing to do for caller's call site. |
1054 | // This could happen for entry point function. |
1055 | if (CallerNode == &getRootContext()) |
1056 | return; |
1057 | |
1058 | LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); |
1059 | FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode);
1060 | // Since we don't have call count for inlined functions, we |
1061 | // estimate it from inlinee's profile using entry body sample. |
1062 | uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); |
1063 | // If we don't have samples with location, use 1 to indicate live. |
1064 | if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) |
1065 | EstimatedCallCount = 1; |
1066 | CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset,
1067 | CallerLeafFrameLoc.Discriminator,
1068 | Node.getFuncName(), EstimatedCallCount);
1069 | CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset,
1070 | CallerLeafFrameLoc.Discriminator,
1071 | EstimatedCallCount);
1072 | CallerProfile.addTotalSamples(EstimatedCallCount);
1073 | } |
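// Illustrative sketch (hypothetical numbers): if inlinee "bar" was inlined into
// "foo" at callsite offset 3 and bar's profile yields a head-sample estimate of
// 120, the code above credits foo with 120 called-target samples and 120 body
// samples at offset 3 and adds 120 to foo's total samples. If bar has neither a
// head-sample estimate nor any body samples, a count of 1 is used instead,
// simply to mark the callsite as live.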
1074 | |
1075 | void CSProfileGenerator::convertToProfileMap( |
1076 | ContextTrieNode &Node, SampleContextFrameVector &Context) { |
1077 | FunctionSamples *FProfile = Node.getFunctionSamples(); |
1078 | if (FProfile) { |
1079 | Context.emplace_back(Node.getFuncName(), LineLocation(0, 0));
1080 | // Save the new context for future references.
1081 | SampleContextFrames NewContext = *Contexts.insert(Context).first;
1082 | auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile));
1083 | FunctionSamples &NewProfile = Ret.first->second;
1084 | NewProfile.getContext().setContext(NewContext);
1085 | Context.pop_back(); |
1086 | } |
1087 | |
1088 | for (auto &It : Node.getAllChildContext()) { |
1089 | ContextTrieNode &ChildNode = It.second; |
1090 | Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc());
1091 | convertToProfileMap(ChildNode, Context);
1092 | Context.pop_back(); |
1093 | } |
1094 | } |
1095 | |
1096 | void CSProfileGenerator::convertToProfileMap() { |
1097 | assert(ProfileMap.empty() && |
1098 | "ProfileMap should be empty before converting from the trie" ); |
1099 | assert(IsProfileValidOnTrie && |
1100 | "Do not convert the trie twice, it's already destroyed" ); |
1101 | |
1102 | SampleContextFrameVector Context; |
1103 | for (auto &It : getRootContext().getAllChildContext()) |
1104 | convertToProfileMap(It.second, Context);
1105 | |
1106 | IsProfileValidOnTrie = false; |
1107 | } |
1108 | |
1109 | void CSProfileGenerator::postProcessProfiles() { |
1110 | // Compute hot/cold threshold based on profile. This will be used for cold |
1111 | // context profile merging/trimming. |
1112 | computeSummaryAndThreshold(); |
1113 | |
1114 | // Run global pre-inliner to adjust/merge context profile based on estimated |
1115 | // inline decisions. |
1116 | if (EnableCSPreInliner) { |
1117 | ContextTracker.populateFuncToCtxtMap(); |
1118 | CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); |
1119 | // Turn off the profile merger by default unless it is explicitly enabled. |
1120 | if (!CSProfMergeColdContext.getNumOccurrences()) |
1121 | CSProfMergeColdContext = false; |
1122 | } |
1123 | |
1124 | convertToProfileMap(); |
1125 | |
1126 | // Trim and merge cold context profile using cold threshold above. |
1127 | if (TrimColdProfile || CSProfMergeColdContext) { |
1128 | SampleContextTrimmer(ProfileMap) |
1129 | .trimAndMergeColdContextProfiles( |
1130 | HotCountThreshold, TrimColdProfile, CSProfMergeColdContext,
1131 | CSProfMaxColdContextDepth, EnableCSPreInliner);
1132 | } |
1133 | |
1134 | if (GenCSNestedProfile) { |
1135 | ProfileConverter CSConverter(ProfileMap); |
1136 | CSConverter.convertCSProfiles(); |
1137 | FunctionSamples::ProfileIsCS = false; |
1138 | } |
1139 | filterAmbiguousProfile(ProfileMap);
1140 | ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
1141 | } |
1142 | |
1143 | void ProfileGeneratorBase::computeSummaryAndThreshold( |
1144 | SampleProfileMap &Profiles) { |
1145 | SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); |
1146 | Summary = Builder.computeSummaryForProfiles(Profiles); |
1147 | HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( |
1148 | (Summary->getDetailedSummary()));
1149 | ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1150 | (Summary->getDetailedSummary()));
1151 | } |
1152 | |
1153 | void CSProfileGenerator::computeSummaryAndThreshold() { |
1154 | // Always merge and use context-less profile map to compute summary. |
1155 | SampleProfileMap ContextLessProfiles; |
1156 | ContextTracker.createContextLessProfileMap(ContextLessProfiles); |
1157 | |
1158 | // Set the flag below to avoid merging the profile again in |
1159 | // computeSummaryAndThreshold |
1160 | FunctionSamples::ProfileIsCS = false; |
1161 | assert( |
1162 | (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) && |
1163 | "Don't set --profile-summary-contextless to false for profile " |
1164 | "generation" ); |
1165 | ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles);
1166 | // Recover the old value. |
1167 | FunctionSamples::ProfileIsCS = true; |
1168 | } |
1169 | |
1170 | void ProfileGeneratorBase::extractProbesFromRange(
1171 | const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter, |
1172 | bool FindDisjointRanges) { |
1173 | const RangeSample *PRanges = &RangeCounter; |
1174 | RangeSample Ranges; |
1175 | if (FindDisjointRanges) { |
1176 | findDisjointRanges(Ranges, RangeCounter);
1177 | PRanges = &Ranges; |
1178 | } |
1179 | |
1180 | for (const auto &Range : *PRanges) { |
1181 | uint64_t RangeBegin = Range.first.first; |
1182 | uint64_t RangeEnd = Range.first.second; |
1183 | uint64_t Count = Range.second; |
1184 | |
1185 | InstructionPointer IP(Binary, RangeBegin, true); |
1186 | // Disjoint ranges may produce a range that lies between two instructions,
1187 | // e.g. if Instr1 is at Addr1 and Instr2 is at Addr2, a disjoint range
1188 | // can be Addr1+1 to Addr2-1. We should ignore such a range.
1189 | if (IP.Address > RangeEnd) |
1190 | continue; |
1191 | |
1192 | do { |
1193 | const AddressProbesMap &Address2ProbesMap = |
1194 | Binary->getAddress2ProbesMap(); |
1195 | auto It = Address2ProbesMap.find(IP.Address);
1196 | if (It != Address2ProbesMap.end()) { |
1197 | for (const auto &Probe : It->second) { |
1198 | ProbeCounter[&Probe] += Count; |
1199 | } |
1200 | } |
1201 | } while (IP.advance() && IP.Address <= RangeEnd); |
1202 | } |
1203 | } |
1204 | |
1205 | static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1206 | const SmallVectorImpl<uint64_t> &AddrVec, |
1207 | ProfiledBinary *Binary) { |
1208 | SmallVector<const MCDecodedPseudoProbe *, 16> Probes; |
1209 | for (auto Address : reverse(AddrVec)) {
1210 | const MCDecodedPseudoProbe *CallProbe = |
1211 | Binary->getCallProbeForAddr(Address); |
1212 | // These could be the cases when a probe is not found at a callsite. Cut
1213 | // off the context from here since the inliner will not know how to consume
1214 | // a context with unknown callsites.
1215 | // 1. for functions that are not sampled when |
1216 | // --decode-probe-for-profiled-functions-only is on. |
1217 | // 2. for a merged callsite. Callsite merging may cause the loss of original |
1218 | // probe IDs. |
1219 | // 3. for an external callsite. |
1220 | if (!CallProbe) |
1221 | break; |
1222 | Probes.push_back(CallProbe);
1223 | } |
1224 | |
1225 | std::reverse(Probes.begin(), Probes.end());
1226 | |
1227 | // Extract the context stack for reuse; the leaf context stack will be added
1228 | // and compressed while looking up the function profile.
1229 | for (const auto *P : Probes) { |
1230 | Binary->getInlineContextForProbe(P, ContextStack, true);
1231 | } |
1232 | } |
1233 | |
1234 | void CSProfileGenerator::generateProbeBasedProfile() { |
1235 | // Enable pseudo probe functionalities in SampleProf |
1236 | FunctionSamples::ProfileIsProbeBased = true; |
1237 | for (const auto &CI : *SampleCounters) { |
1238 | const AddrBasedCtxKey *CtxKey = |
1239 | dyn_cast<AddrBasedCtxKey>(CI.first.getPtr());
1240 | // Fill in function body samples from probes, also infer caller's samples
1241 | // from callee's probe
1242 | populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey);
1243 | // Fill in boundary samples for a call probe
1244 | populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey);
1245 | } |
1246 | } |
1247 | |
1248 | void CSProfileGenerator::populateBodySamplesWithProbes( |
1249 | const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) { |
1250 | ProbeCounterMap ProbeCounter; |
1251 | // Extract the top frame probes by looking up each address among the range in |
1252 | // the Address2ProbeMap |
1253 | extractProbesFromRange(RangeCounter, ProbeCounter); |
1254 | std::unordered_map<MCDecodedPseudoProbeInlineTree *, |
1255 | std::unordered_set<FunctionSamples *>> |
1256 | FrameSamples; |
1257 | for (const auto &PI : ProbeCounter) { |
1258 | const MCDecodedPseudoProbe *Probe = PI.first; |
1259 | uint64_t Count = PI.second; |
1260 | // Disjoint ranges may introduce zero-filled gaps that
1261 | // don't belong to the current context; filter them out.
1262 | if (!Probe->isBlock() || Count == 0) |
1263 | continue; |
1264 | |
1265 | ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe);
1266 | FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1267 | // Record the current frame and FunctionProfile whenever samples are
1268 | // collected for non-dangling probes. This is for reporting all of the
1269 | // zero count probes of the frame later.
1270 | FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
1271 | FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(),
1272 | Count);
1273 | FunctionProfile.addTotalSamples(Count);
1274 | if (Probe->isEntry()) {
1275 | FunctionProfile.addHeadSamples(Count);
1276 | // Look up for the caller's function profile |
1277 | const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); |
1278 | ContextTrieNode *CallerNode = ContextNode->getParentContext(); |
1279 | if (InlinerDesc != nullptr && CallerNode != &getRootContext()) { |
1280 | // Since the context id will be compressed, we have to use callee's |
1281 | // context id to infer caller's context id to ensure they share the |
1282 | // same context prefix. |
1283 | uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset; |
1284 | uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator; |
1285 | assert(CallerIndex && |
1286 | "Inferred caller's location index shouldn't be zero!" ); |
1287 | assert(!CallerDiscriminator && |
1288 | "Callsite probe should not have a discriminator!" ); |
1289 | FunctionSamples &CallerProfile = |
1290 | *getOrCreateFunctionSamples(CallerNode);
1291 | CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1292 | CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count);
1293 | CallerProfile.addTotalSamples(Count);
1294 | CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator,
1295 | ContextNode->getFuncName(), Count);
1296 | } |
1297 | } |
1298 | } |
1299 | |
1300 | // Assign zero count for remaining probes without sample hits to |
1301 | // differentiate from probes optimized away, of which the counts are unknown |
1302 | // and will be inferred by the compiler. |
1303 | for (auto &I : FrameSamples) { |
1304 | for (auto *FunctionProfile : I.second) { |
1305 | for (auto *Probe : I.first->getProbes()) { |
1306 | FunctionProfile->addBodySamples(Probe->getIndex(),
1307 | Probe->getDiscriminator(), 0);
1308 | } |
1309 | } |
1310 | } |
1311 | } |
1312 | |
1313 | void CSProfileGenerator::populateBoundarySamplesWithProbes( |
1314 | const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) { |
1315 | for (const auto &BI : BranchCounter) { |
1316 | uint64_t SourceAddress = BI.first.first; |
1317 | uint64_t TargetAddress = BI.first.second; |
1318 | uint64_t Count = BI.second; |
1319 | const MCDecodedPseudoProbe *CallProbe = |
1320 | Binary->getCallProbeForAddr(SourceAddress);
1321 | if (CallProbe == nullptr)
1322 | continue;
1323 | FunctionSamples &FunctionProfile =
1324 | getFunctionProfileForLeafProbe(CtxKey, CallProbe);
1325 | FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count);
1326 | FunctionProfile.addTotalSamples(Count);
1327 | StringRef CalleeName = getCalleeNameForAddress(TargetAddress); |
1328 | if (CalleeName.size() == 0) |
1329 | continue; |
1330 | FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(),
1331 | CallProbe->getDiscriminator(),
1332 | FunctionId(CalleeName), Count);
1333 | } |
1334 | } |
1335 | |
1336 | ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( |
1337 | const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { |
1338 | |
1339 | const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context; |
1340 | SmallVector<uint64_t, 16> NewContext; |
1341 | |
1342 | if (InferMissingFrames) { |
1343 | SmallVector<uint64_t, 16> Context = CtxKey->Context; |
1344 | // Append leaf frame for a complete inference. |
1345 | Context.push_back(LeafProbe->getAddress());
1346 | inferMissingFrames(Context, NewContext); |
1347 | // Pop out the leaf probe that was pushed in above. |
1348 | NewContext.pop_back(); |
1349 | PContext = &NewContext; |
1350 | } |
1351 | |
1352 | SampleContextFrameVector ContextStack; |
1353 | extractPrefixContextStack(ContextStack, *PContext, Binary);
1354 | |
1355 | // Explicitly copy the context for appending the leaf context |
1356 | SampleContextFrameVector NewContextStack(ContextStack.begin(), |
1357 | ContextStack.end()); |
1358 | Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true);
1359 | // For leaf inlined context with the top frame, we should strip off the top |
1360 | // frame's probe id, like: |
1361 | // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" |
1362 | auto LeafFrame = NewContextStack.back(); |
1363 | LeafFrame.Location = LineLocation(0, 0); |
1364 | NewContextStack.pop_back(); |
1365 | // Compress the context string except for the leaf frame |
1366 | CSProfileGenerator::compressRecursionContext(NewContextStack);
1367 | CSProfileGenerator::trimContext(NewContextStack);
1368 | NewContextStack.push_back(LeafFrame);
1369 | |
1370 | const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid());
1371 | bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1372 | ContextTrieNode *ContextNode =
1373 | getOrCreateContextNode(NewContextStack, WasLeafInlined);
1374 | ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); |
1375 | return ContextNode; |
1376 | } |
1377 | |
1378 | FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( |
1379 | const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { |
1380 | return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples(); |
1381 | } |
1382 | |
1383 | } // end namespace sampleprof |
1384 | } // end namespace llvm |
1385 | |