1 | //===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
10 | #define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
11 | #include "CSPreInliner.h" |
12 | #include "ErrorHandling.h" |
13 | #include "PerfReader.h" |
14 | #include "ProfiledBinary.h" |
15 | #include "llvm/IR/DebugInfoMetadata.h" |
16 | #include "llvm/ProfileData/SampleProfWriter.h" |
17 | #include <memory> |
18 | #include <unordered_set> |
19 | |
20 | using namespace llvm; |
21 | using namespace sampleprof; |
22 | |
23 | namespace llvm { |
24 | namespace sampleprof { |
25 | |
26 | using ProbeCounterMap = |
27 | std::unordered_map<const MCDecodedPseudoProbe *, uint64_t>; |
28 | |
29 | // This base class for profile generation of sample-based PGO. We reuse all |
30 | // structures relating to function profiles and profile writers as seen in |
31 | // /ProfileData/SampleProf.h. |
32 | class ProfileGeneratorBase { |
33 | |
34 | public: |
35 | ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; |
36 | ProfileGeneratorBase(ProfiledBinary *Binary, |
37 | const ContextSampleCounterMap *Counters) |
38 | : Binary(Binary), SampleCounters(Counters){}; |
39 | ProfileGeneratorBase(ProfiledBinary *Binary, |
40 | const SampleProfileMap &&Profiles) |
41 | : Binary(Binary), ProfileMap(std::move(Profiles)){}; |
42 | |
43 | virtual ~ProfileGeneratorBase() = default; |
44 | static std::unique_ptr<ProfileGeneratorBase> |
45 | create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, |
46 | bool profileIsCS); |
47 | static std::unique_ptr<ProfileGeneratorBase> |
48 | create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, |
49 | bool profileIsCS); |
50 | virtual void generateProfile() = 0; |
51 | void write(); |
52 | |
53 | static uint32_t |
54 | getDuplicationFactor(unsigned Discriminator, |
55 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
56 | return UseFSD ? 1 |
57 | : llvm::DILocation::getDuplicationFactorFromDiscriminator( |
58 | D: Discriminator); |
59 | } |
60 | |
61 | static uint32_t |
62 | getBaseDiscriminator(unsigned Discriminator, |
63 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
64 | return UseFSD ? Discriminator |
65 | : DILocation::getBaseDiscriminatorFromDiscriminator( |
66 | D: Discriminator, /* IsFSDiscriminator */ IsFSDiscriminator: false); |
67 | } |
68 | |
69 | static bool UseFSDiscriminator; |
70 | |
71 | protected: |
72 | // Use SampleProfileWriter to serialize profile map |
73 | void write(std::unique_ptr<SampleProfileWriter> Writer, |
74 | SampleProfileMap &ProfileMap); |
75 | /* |
76 | For each region boundary point, mark if it is begin or end (or both) of |
77 | the region. Boundary points are inclusive. Log the sample count as well |
78 | so we can use it when we compute the sample count of each disjoint region |
79 | later. Note that there might be multiple ranges with different sample |
80 | count that share same begin/end point. We need to accumulate the sample |
81 | count for the boundary point for such case, because for the example |
82 | below, |
83 | |
84 | |<--100-->| |
85 | |<------200------>| |
86 | A B C |
87 | |
88 | sample count for disjoint region [A,B] would be 300. |
89 | */ |
90 | void findDisjointRanges(RangeSample &DisjointRanges, |
91 | const RangeSample &Ranges); |
92 | |
93 | // Go through each address from range to extract the top frame probe by |
94 | // looking up in the Address2ProbeMap |
95 | void (const RangeSample &RangeCounter, |
96 | ProbeCounterMap &ProbeCounter, |
97 | bool FindDisjointRanges = true); |
98 | |
99 | // Helper function for updating body sample for a leaf location in |
100 | // FunctionProfile |
101 | void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, |
102 | const SampleContextFrame &LeafLoc, |
103 | uint64_t Count); |
104 | |
105 | void updateFunctionSamples(); |
106 | |
107 | void updateTotalSamples(); |
108 | |
109 | void updateCallsiteSamples(); |
110 | |
111 | void filterAmbiguousProfile(SampleProfileMap &Profiles); |
112 | |
113 | bool filterAmbiguousProfile(FunctionSamples &FS); |
114 | |
115 | StringRef getCalleeNameForAddress(uint64_t TargetAddress); |
116 | |
117 | void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); |
118 | |
119 | void calculateBodySamplesAndSize(const FunctionSamples &FSamples, |
120 | uint64_t &TotalBodySamples, |
121 | uint64_t &FuncBodySize); |
122 | |
123 | double calculateDensity(const SampleProfileMap &Profiles); |
124 | |
125 | void calculateAndShowDensity(const SampleProfileMap &Profiles); |
126 | |
127 | void showDensitySuggestion(double Density); |
128 | |
129 | void collectProfiledFunctions(); |
130 | |
131 | bool collectFunctionsFromRawProfile( |
132 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions); |
133 | |
134 | // Collect profiled Functions for llvm sample profile input. |
135 | virtual bool collectFunctionsFromLLVMProfile( |
136 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) = 0; |
137 | |
138 | // List of function prefix to filter out. |
139 | static constexpr const char *FuncPrefixsToFilter[] = {"__cxx_global_var_init" , |
140 | "__tls_init" }; |
141 | |
142 | // Thresholds from profile summary to answer isHotCount/isColdCount queries. |
143 | uint64_t HotCountThreshold; |
144 | |
145 | uint64_t ColdCountThreshold; |
146 | |
147 | ProfiledBinary *Binary = nullptr; |
148 | |
149 | std::unique_ptr<ProfileSummary> Summary; |
150 | |
151 | // Used by SampleProfileWriter |
152 | SampleProfileMap ProfileMap; |
153 | |
154 | const ContextSampleCounterMap *SampleCounters = nullptr; |
155 | }; |
156 | |
157 | class ProfileGenerator : public ProfileGeneratorBase { |
158 | |
159 | public: |
160 | ProfileGenerator(ProfiledBinary *Binary, |
161 | const ContextSampleCounterMap *Counters) |
162 | : ProfileGeneratorBase(Binary, Counters){}; |
163 | ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) |
164 | : ProfileGeneratorBase(Binary, std::move(Profiles)){}; |
165 | void generateProfile() override; |
166 | |
167 | private: |
168 | void generateLineNumBasedProfile(); |
169 | void generateProbeBasedProfile(); |
170 | RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); |
171 | FunctionSamples &getTopLevelFunctionProfile(FunctionId FuncName); |
172 | // Helper function to get the leaf frame's FunctionProfile by traversing the |
173 | // inline stack and meanwhile it adds the total samples for each frame's |
174 | // function profile. |
175 | FunctionSamples & |
176 | getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, |
177 | uint64_t Count); |
178 | void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); |
179 | void |
180 | populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); |
181 | void |
182 | populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); |
183 | void populateBoundarySamplesWithProbesForAllFunctions( |
184 | const BranchSample &BranchCounters); |
185 | void postProcessProfiles(); |
186 | void trimColdProfiles(const SampleProfileMap &Profiles, |
187 | uint64_t ColdCntThreshold); |
188 | bool collectFunctionsFromLLVMProfile( |
189 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
190 | }; |
191 | |
192 | class CSProfileGenerator : public ProfileGeneratorBase { |
193 | public: |
194 | CSProfileGenerator(ProfiledBinary *Binary, |
195 | const ContextSampleCounterMap *Counters) |
196 | : ProfileGeneratorBase(Binary, Counters){}; |
197 | CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles) |
198 | : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){}; |
199 | void generateProfile() override; |
200 | |
201 | // Trim the context stack at a given depth. |
202 | template <typename T> |
203 | static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) { |
204 | if (Depth < 0 || static_cast<size_t>(Depth) >= S.size()) |
205 | return; |
206 | std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(), |
207 | S.begin()); |
208 | S.resize(Depth); |
209 | } |
210 | |
211 | // Remove adjacent repeated context sequences up to a given sequence length, |
212 | // -1 means no size limit. Note that repeated sequences are identified based |
213 | // on the exact call site, this is finer granularity than function recursion. |
214 | template <typename T> |
215 | static void compressRecursionContext(SmallVectorImpl<T> &Context, |
216 | int32_t CSize = MaxCompressionSize) { |
217 | uint32_t I = 1; |
218 | uint32_t HS = static_cast<uint32_t>(Context.size() / 2); |
219 | uint32_t MaxDedupSize = |
220 | CSize == -1 ? HS : std::min(a: static_cast<uint32_t>(CSize), b: HS); |
221 | auto BeginIter = Context.begin(); |
222 | // Use an in-place algorithm to save memory copy |
223 | // End indicates the end location of current iteration's data |
224 | uint32_t End = 0; |
225 | // Deduplicate from length 1 to the max possible size of a repeated |
226 | // sequence. |
227 | while (I <= MaxDedupSize) { |
228 | // This is a linear algorithm that deduplicates adjacent repeated |
229 | // sequences of size I. The deduplication detection runs on a sliding |
230 | // window whose size is 2*I and it keeps sliding the window to deduplicate |
231 | // the data inside. Once duplication is detected, deduplicate it by |
232 | // skipping the right half part of the window, otherwise just copy back |
233 | // the new one by appending them at the back of End pointer(for the next |
234 | // iteration). |
235 | // |
236 | // For example: |
237 | // Input: [a1, a2, b1, b2] |
238 | // (Added index to distinguish the same char, the origin is [a, a, b, |
239 | // b], the size of the dedup window is 2(I = 1) at the beginning) |
240 | // |
241 | // 1) The initial status is a dummy window[null, a1], then just copy the |
242 | // right half of the window(End = 0), then slide the window. |
243 | // Result: [a1], a2, b1, b2 (End points to the element right before ], |
244 | // after ] is the data of the previous iteration) |
245 | // |
246 | // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of |
247 | // the window i.e the duplication happen. Only slide the window. |
248 | // Result: [a1], a2, b1, b2 |
249 | // |
250 | // 3) Next window is [a2, b1], copy the right half of the window(b1 is |
251 | // new) to the End and slide the window. |
252 | // Result: [a1, b1], b1, b2 |
253 | // |
254 | // 4) Next window is [b1, b2], same to 2), skip b2. |
255 | // Result: [a1, b1], b1, b2 |
256 | // After resize, it will be [a, b] |
257 | |
258 | // Use pointers like below to do comparison inside the window |
259 | // [a b c a b c] |
260 | // | | | | | |
261 | // LeftBoundary Left Right Left+I Right+I |
262 | // A duplication found if Left < LeftBoundry. |
263 | |
264 | int32_t Right = I - 1; |
265 | End = I; |
266 | int32_t LeftBoundary = 0; |
267 | while (Right + I < Context.size()) { |
268 | // To avoids scanning a part of a sequence repeatedly, it finds out |
269 | // the common suffix of two hald in the window. The common suffix will |
270 | // serve as the common prefix of next possible pair of duplicate |
271 | // sequences. The non-common part will be ignored and never scanned |
272 | // again. |
273 | |
274 | // For example. |
275 | // Input: [a, b1], c1, b2, c2 |
276 | // I = 2 |
277 | // |
278 | // 1) For the window [a, b1, c1, b2], non-common-suffix for the right |
279 | // part is 'c1', copy it and only slide the window 1 step. |
280 | // Result: [a, b1, c1], b2, c2 |
281 | // |
282 | // 2) Next window is [b1, c1, b2, c2], so duplication happen. |
283 | // Result after resize: [a, b, c] |
284 | |
285 | int32_t Left = Right; |
286 | while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { |
287 | // Find the longest suffix inside the window. When stops, Left points |
288 | // at the diverging point in the current sequence. |
289 | Left--; |
290 | } |
291 | |
292 | bool DuplicationFound = (Left < LeftBoundary); |
293 | // Don't need to recheck the data before Right |
294 | LeftBoundary = Right + 1; |
295 | if (DuplicationFound) { |
296 | // Duplication found, skip right half of the window. |
297 | Right += I; |
298 | } else { |
299 | // Copy the non-common-suffix part of the adjacent sequence. |
300 | std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, |
301 | BeginIter + End); |
302 | End += Left + I - Right; |
303 | // Only slide the window by the size of non-common-suffix |
304 | Right = Left + I; |
305 | } |
306 | } |
307 | // Don't forget the remaining part that's not scanned. |
308 | std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); |
309 | End += Context.size() - Right - 1; |
310 | I++; |
311 | Context.resize(End); |
312 | MaxDedupSize = std::min(a: static_cast<uint32_t>(End / 2), b: MaxDedupSize); |
313 | } |
314 | } |
315 | |
316 | private: |
317 | void generateLineNumBasedProfile(); |
318 | |
319 | FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode, |
320 | bool WasLeafInlined = false); |
321 | |
322 | // Lookup or create ContextTrieNode for the context, FunctionSamples is |
323 | // created inside this function. |
324 | ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context, |
325 | bool WasLeafInlined = false); |
326 | |
327 | // For profiled only functions, on-demand compute their inline context |
328 | // function byte size which is used by the pre-inliner. |
329 | void computeSizeForProfiledFunctions(); |
330 | // Post processing for profiles before writing out, such as mermining |
331 | // and trimming cold profiles, running preinliner on profiles. |
332 | void postProcessProfiles(); |
333 | |
334 | void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, |
335 | const RangeSample &RangeCounters); |
336 | |
337 | void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, |
338 | const BranchSample &BranchCounters); |
339 | |
340 | void populateInferredFunctionSamples(ContextTrieNode &Node); |
341 | |
342 | void updateFunctionSamples(); |
343 | |
344 | void generateProbeBasedProfile(); |
345 | |
346 | // Fill in function body samples from probes |
347 | void populateBodySamplesWithProbes(const RangeSample &RangeCounter, |
348 | const AddrBasedCtxKey *CtxKey); |
349 | // Fill in boundary samples for a call probe |
350 | void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, |
351 | const AddrBasedCtxKey *CtxKey); |
352 | |
353 | ContextTrieNode * |
354 | getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, |
355 | const MCDecodedPseudoProbe *LeafProbe); |
356 | |
357 | // Helper function to get FunctionSamples for the leaf probe |
358 | FunctionSamples & |
359 | getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, |
360 | const MCDecodedPseudoProbe *LeafProbe); |
361 | |
362 | void convertToProfileMap(ContextTrieNode &Node, |
363 | SampleContextFrameVector &Context); |
364 | |
365 | void convertToProfileMap(); |
366 | |
367 | void computeSummaryAndThreshold(); |
368 | |
369 | bool collectFunctionsFromLLVMProfile( |
370 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
371 | |
372 | void initializeMissingFrameInferrer(); |
373 | |
374 | // Given an input `Context`, output `NewContext` with inferred missing tail |
375 | // call frames. |
376 | void inferMissingFrames(const SmallVectorImpl<uint64_t> &Context, |
377 | SmallVectorImpl<uint64_t> &NewContext); |
378 | |
379 | ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; |
380 | |
381 | // The container for holding the FunctionSamples used by context trie. |
382 | std::list<FunctionSamples> FSamplesList; |
383 | |
384 | // Underlying context table serves for sample profile writer. |
385 | std::unordered_set<SampleContextFrameVector, SampleContextFrameHash> Contexts; |
386 | |
387 | SampleContextTracker ContextTracker; |
388 | |
389 | bool IsProfileValidOnTrie = true; |
390 | |
391 | public: |
392 | // Deduplicate adjacent repeated context sequences up to a given sequence |
393 | // length. -1 means no size limit. |
394 | static int32_t MaxCompressionSize; |
395 | static int MaxContextDepth; |
396 | }; |
397 | |
398 | } // end namespace sampleprof |
399 | } // end namespace llvm |
400 | |
401 | #endif |
402 | |