| 1 | //===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
| 10 | #define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
| 11 | #include "CSPreInliner.h" |
| 12 | #include "ErrorHandling.h" |
| 13 | #include "PerfReader.h" |
| 14 | #include "ProfiledBinary.h" |
| 15 | #include "llvm/IR/DebugInfoMetadata.h" |
| 16 | #include "llvm/ProfileData/SampleProfWriter.h" |
| 17 | #include <memory> |
| 18 | #include <unordered_set> |
| 19 | |
| 20 | using namespace llvm; |
| 21 | using namespace sampleprof; |
| 22 | |
| 23 | namespace llvm { |
| 24 | namespace sampleprof { |
| 25 | |
| 26 | using ProbeCounterMap = |
| 27 | std::unordered_map<const MCDecodedPseudoProbe *, uint64_t>; |
| 28 | |
| 29 | // This base class for profile generation of sample-based PGO. We reuse all |
| 30 | // structures relating to function profiles and profile writers as seen in |
| 31 | // /ProfileData/SampleProf.h. |
| 32 | class ProfileGeneratorBase { |
| 33 | |
| 34 | public: |
| 35 | ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; |
| 36 | ProfileGeneratorBase(ProfiledBinary *Binary, |
| 37 | const ContextSampleCounterMap *Counters) |
| 38 | : Binary(Binary), SampleCounters(Counters){}; |
| 39 | ProfileGeneratorBase(ProfiledBinary *Binary, |
| 40 | const SampleProfileMap &&Profiles) |
| 41 | : Binary(Binary), ProfileMap(std::move(Profiles)){}; |
| 42 | |
| 43 | virtual ~ProfileGeneratorBase() = default; |
| 44 | static std::unique_ptr<ProfileGeneratorBase> |
| 45 | create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, |
| 46 | bool profileIsCS); |
| 47 | static std::unique_ptr<ProfileGeneratorBase> |
| 48 | create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, |
| 49 | bool profileIsCS); |
| 50 | virtual void generateProfile() = 0; |
| 51 | void write(); |
| 52 | |
| 53 | static uint32_t |
| 54 | getDuplicationFactor(unsigned Discriminator, |
| 55 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
| 56 | return UseFSD ? 1 |
| 57 | : llvm::DILocation::getDuplicationFactorFromDiscriminator( |
| 58 | D: Discriminator); |
| 59 | } |
| 60 | |
| 61 | static uint32_t |
| 62 | getBaseDiscriminator(unsigned Discriminator, |
| 63 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
| 64 | return UseFSD ? Discriminator |
| 65 | : DILocation::getBaseDiscriminatorFromDiscriminator( |
| 66 | D: Discriminator, /* IsFSDiscriminator */ IsFSDiscriminator: false); |
| 67 | } |
| 68 | |
| 69 | static bool UseFSDiscriminator; |
| 70 | |
| 71 | protected: |
| 72 | // Use SampleProfileWriter to serialize profile map |
| 73 | void write(std::unique_ptr<SampleProfileWriter> Writer, |
| 74 | SampleProfileMap &ProfileMap); |
| 75 | /* |
| 76 | For each region boundary point, mark if it is begin or end (or both) of |
| 77 | the region. Boundary points are inclusive. Log the sample count as well |
| 78 | so we can use it when we compute the sample count of each disjoint region |
| 79 | later. Note that there might be multiple ranges with different sample |
| 80 | count that share same begin/end point. We need to accumulate the sample |
| 81 | count for the boundary point for such case, because for the example |
| 82 | below, |
| 83 | |
| 84 | |<--100-->| |
| 85 | |<------200------>| |
| 86 | A B C |
| 87 | |
| 88 | sample count for disjoint region [A,B] would be 300. |
| 89 | */ |
| 90 | void findDisjointRanges(RangeSample &DisjointRanges, |
| 91 | const RangeSample &Ranges); |
| 92 | |
| 93 | // Go through each address from range to extract the top frame probe by |
| 94 | // looking up in the Address2ProbeMap |
| 95 | void (const RangeSample &RangeCounter, |
| 96 | ProbeCounterMap &ProbeCounter, |
| 97 | bool FindDisjointRanges = true); |
| 98 | |
| 99 | // Helper function for updating body sample for a leaf location in |
| 100 | // FunctionProfile |
| 101 | void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, |
| 102 | const SampleContextFrame &LeafLoc, |
| 103 | uint64_t Count); |
| 104 | |
| 105 | void updateFunctionSamples(); |
| 106 | |
| 107 | void updateTotalSamples(); |
| 108 | |
| 109 | void updateCallsiteSamples(); |
| 110 | |
| 111 | void filterAmbiguousProfile(SampleProfileMap &Profiles); |
| 112 | |
| 113 | bool filterAmbiguousProfile(FunctionSamples &FS); |
| 114 | |
| 115 | StringRef getCalleeNameForAddress(uint64_t TargetAddress); |
| 116 | |
| 117 | void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); |
| 118 | |
| 119 | void calculateBodySamplesAndSize(const FunctionSamples &FSamples, |
| 120 | uint64_t &TotalBodySamples, |
| 121 | uint64_t &FuncBodySize); |
| 122 | |
| 123 | double calculateDensity(const SampleProfileMap &Profiles); |
| 124 | |
| 125 | void calculateAndShowDensity(const SampleProfileMap &Profiles); |
| 126 | |
| 127 | void showDensitySuggestion(double Density); |
| 128 | |
| 129 | void collectProfiledFunctions(); |
| 130 | |
| 131 | bool collectFunctionsFromRawProfile( |
| 132 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions); |
| 133 | |
| 134 | // Collect profiled Functions for llvm sample profile input. |
| 135 | virtual bool collectFunctionsFromLLVMProfile( |
| 136 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) = 0; |
| 137 | |
| 138 | // List of function prefix to filter out. |
| 139 | static constexpr const char *FuncPrefixsToFilter[] = {"__cxx_global_var_init" , |
| 140 | "__tls_init" }; |
| 141 | |
| 142 | // Thresholds from profile summary to answer isHotCount/isColdCount queries. |
| 143 | uint64_t HotCountThreshold; |
| 144 | |
| 145 | uint64_t ColdCountThreshold; |
| 146 | |
| 147 | ProfiledBinary *Binary = nullptr; |
| 148 | |
| 149 | std::unique_ptr<ProfileSummary> Summary; |
| 150 | |
| 151 | // Used by SampleProfileWriter |
| 152 | SampleProfileMap ProfileMap; |
| 153 | |
| 154 | const ContextSampleCounterMap *SampleCounters = nullptr; |
| 155 | }; |
| 156 | |
| 157 | class ProfileGenerator : public ProfileGeneratorBase { |
| 158 | |
| 159 | public: |
| 160 | ProfileGenerator(ProfiledBinary *Binary, |
| 161 | const ContextSampleCounterMap *Counters) |
| 162 | : ProfileGeneratorBase(Binary, Counters){}; |
| 163 | ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) |
| 164 | : ProfileGeneratorBase(Binary, std::move(Profiles)){}; |
| 165 | void generateProfile() override; |
| 166 | |
| 167 | private: |
| 168 | void generateLineNumBasedProfile(); |
| 169 | void generateProbeBasedProfile(); |
| 170 | RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); |
| 171 | FunctionSamples &getTopLevelFunctionProfile(FunctionId FuncName); |
| 172 | // Helper function to get the leaf frame's FunctionProfile by traversing the |
| 173 | // inline stack and meanwhile it adds the total samples for each frame's |
| 174 | // function profile. |
| 175 | FunctionSamples & |
| 176 | getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, |
| 177 | uint64_t Count); |
| 178 | void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); |
| 179 | void |
| 180 | populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); |
| 181 | void |
| 182 | populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); |
| 183 | void populateBoundarySamplesWithProbesForAllFunctions( |
| 184 | const BranchSample &BranchCounters); |
| 185 | void postProcessProfiles(); |
| 186 | void trimColdProfiles(const SampleProfileMap &Profiles, |
| 187 | uint64_t ColdCntThreshold); |
| 188 | bool collectFunctionsFromLLVMProfile( |
| 189 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
| 190 | }; |
| 191 | |
| 192 | class CSProfileGenerator : public ProfileGeneratorBase { |
| 193 | public: |
| 194 | CSProfileGenerator(ProfiledBinary *Binary, |
| 195 | const ContextSampleCounterMap *Counters) |
| 196 | : ProfileGeneratorBase(Binary, Counters){}; |
| 197 | CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles) |
| 198 | : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){}; |
| 199 | void generateProfile() override; |
| 200 | |
| 201 | // Trim the context stack at a given depth. |
| 202 | template <typename T> |
| 203 | static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) { |
| 204 | if (Depth < 0 || static_cast<size_t>(Depth) >= S.size()) |
| 205 | return; |
| 206 | std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(), |
| 207 | S.begin()); |
| 208 | S.resize(Depth); |
| 209 | } |
| 210 | |
| 211 | // Remove adjacent repeated context sequences up to a given sequence length, |
| 212 | // -1 means no size limit. Note that repeated sequences are identified based |
| 213 | // on the exact call site, this is finer granularity than function recursion. |
| 214 | template <typename T> |
| 215 | static void compressRecursionContext(SmallVectorImpl<T> &Context, |
| 216 | int32_t CSize = MaxCompressionSize) { |
| 217 | uint32_t I = 1; |
| 218 | uint32_t HS = static_cast<uint32_t>(Context.size() / 2); |
| 219 | uint32_t MaxDedupSize = |
| 220 | CSize == -1 ? HS : std::min(a: static_cast<uint32_t>(CSize), b: HS); |
| 221 | auto BeginIter = Context.begin(); |
| 222 | // Use an in-place algorithm to save memory copy |
| 223 | // End indicates the end location of current iteration's data |
| 224 | uint32_t End = 0; |
| 225 | // Deduplicate from length 1 to the max possible size of a repeated |
| 226 | // sequence. |
| 227 | while (I <= MaxDedupSize) { |
| 228 | // This is a linear algorithm that deduplicates adjacent repeated |
| 229 | // sequences of size I. The deduplication detection runs on a sliding |
| 230 | // window whose size is 2*I and it keeps sliding the window to deduplicate |
| 231 | // the data inside. Once duplication is detected, deduplicate it by |
| 232 | // skipping the right half part of the window, otherwise just copy back |
| 233 | // the new one by appending them at the back of End pointer(for the next |
| 234 | // iteration). |
| 235 | // |
| 236 | // For example: |
| 237 | // Input: [a1, a2, b1, b2] |
| 238 | // (Added index to distinguish the same char, the origin is [a, a, b, |
| 239 | // b], the size of the dedup window is 2(I = 1) at the beginning) |
| 240 | // |
| 241 | // 1) The initial status is a dummy window[null, a1], then just copy the |
| 242 | // right half of the window(End = 0), then slide the window. |
| 243 | // Result: [a1], a2, b1, b2 (End points to the element right before ], |
| 244 | // after ] is the data of the previous iteration) |
| 245 | // |
| 246 | // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of |
| 247 | // the window i.e the duplication happen. Only slide the window. |
| 248 | // Result: [a1], a2, b1, b2 |
| 249 | // |
| 250 | // 3) Next window is [a2, b1], copy the right half of the window(b1 is |
| 251 | // new) to the End and slide the window. |
| 252 | // Result: [a1, b1], b1, b2 |
| 253 | // |
| 254 | // 4) Next window is [b1, b2], same to 2), skip b2. |
| 255 | // Result: [a1, b1], b1, b2 |
| 256 | // After resize, it will be [a, b] |
| 257 | |
| 258 | // Use pointers like below to do comparison inside the window |
| 259 | // [a b c a b c] |
| 260 | // | | | | | |
| 261 | // LeftBoundary Left Right Left+I Right+I |
| 262 | // A duplication found if Left < LeftBoundry. |
| 263 | |
| 264 | int32_t Right = I - 1; |
| 265 | End = I; |
| 266 | int32_t LeftBoundary = 0; |
| 267 | while (Right + I < Context.size()) { |
| 268 | // To avoids scanning a part of a sequence repeatedly, it finds out |
| 269 | // the common suffix of two hald in the window. The common suffix will |
| 270 | // serve as the common prefix of next possible pair of duplicate |
| 271 | // sequences. The non-common part will be ignored and never scanned |
| 272 | // again. |
| 273 | |
| 274 | // For example. |
| 275 | // Input: [a, b1], c1, b2, c2 |
| 276 | // I = 2 |
| 277 | // |
| 278 | // 1) For the window [a, b1, c1, b2], non-common-suffix for the right |
| 279 | // part is 'c1', copy it and only slide the window 1 step. |
| 280 | // Result: [a, b1, c1], b2, c2 |
| 281 | // |
| 282 | // 2) Next window is [b1, c1, b2, c2], so duplication happen. |
| 283 | // Result after resize: [a, b, c] |
| 284 | |
| 285 | int32_t Left = Right; |
| 286 | while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { |
| 287 | // Find the longest suffix inside the window. When stops, Left points |
| 288 | // at the diverging point in the current sequence. |
| 289 | Left--; |
| 290 | } |
| 291 | |
| 292 | bool DuplicationFound = (Left < LeftBoundary); |
| 293 | // Don't need to recheck the data before Right |
| 294 | LeftBoundary = Right + 1; |
| 295 | if (DuplicationFound) { |
| 296 | // Duplication found, skip right half of the window. |
| 297 | Right += I; |
| 298 | } else { |
| 299 | // Copy the non-common-suffix part of the adjacent sequence. |
| 300 | std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, |
| 301 | BeginIter + End); |
| 302 | End += Left + I - Right; |
| 303 | // Only slide the window by the size of non-common-suffix |
| 304 | Right = Left + I; |
| 305 | } |
| 306 | } |
| 307 | // Don't forget the remaining part that's not scanned. |
| 308 | std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); |
| 309 | End += Context.size() - Right - 1; |
| 310 | I++; |
| 311 | Context.resize(End); |
| 312 | MaxDedupSize = std::min(a: static_cast<uint32_t>(End / 2), b: MaxDedupSize); |
| 313 | } |
| 314 | } |
| 315 | |
| 316 | private: |
| 317 | void generateLineNumBasedProfile(); |
| 318 | |
| 319 | FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode, |
| 320 | bool WasLeafInlined = false); |
| 321 | |
| 322 | // Lookup or create ContextTrieNode for the context, FunctionSamples is |
| 323 | // created inside this function. |
| 324 | ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context, |
| 325 | bool WasLeafInlined = false); |
| 326 | |
| 327 | // For profiled only functions, on-demand compute their inline context |
| 328 | // function byte size which is used by the pre-inliner. |
| 329 | void computeSizeForProfiledFunctions(); |
| 330 | // Post processing for profiles before writing out, such as mermining |
| 331 | // and trimming cold profiles, running preinliner on profiles. |
| 332 | void postProcessProfiles(); |
| 333 | |
| 334 | void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, |
| 335 | const RangeSample &RangeCounters); |
| 336 | |
| 337 | void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, |
| 338 | const BranchSample &BranchCounters); |
| 339 | |
| 340 | void populateInferredFunctionSamples(ContextTrieNode &Node); |
| 341 | |
| 342 | void updateFunctionSamples(); |
| 343 | |
| 344 | void generateProbeBasedProfile(); |
| 345 | |
| 346 | // Fill in function body samples from probes |
| 347 | void populateBodySamplesWithProbes(const RangeSample &RangeCounter, |
| 348 | const AddrBasedCtxKey *CtxKey); |
| 349 | // Fill in boundary samples for a call probe |
| 350 | void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, |
| 351 | const AddrBasedCtxKey *CtxKey); |
| 352 | |
| 353 | ContextTrieNode * |
| 354 | getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, |
| 355 | const MCDecodedPseudoProbe *LeafProbe); |
| 356 | |
| 357 | // Helper function to get FunctionSamples for the leaf probe |
| 358 | FunctionSamples & |
| 359 | getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, |
| 360 | const MCDecodedPseudoProbe *LeafProbe); |
| 361 | |
| 362 | void convertToProfileMap(ContextTrieNode &Node, |
| 363 | SampleContextFrameVector &Context); |
| 364 | |
| 365 | void convertToProfileMap(); |
| 366 | |
| 367 | void computeSummaryAndThreshold(); |
| 368 | |
| 369 | bool collectFunctionsFromLLVMProfile( |
| 370 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
| 371 | |
| 372 | void initializeMissingFrameInferrer(); |
| 373 | |
| 374 | // Given an input `Context`, output `NewContext` with inferred missing tail |
| 375 | // call frames. |
| 376 | void inferMissingFrames(const SmallVectorImpl<uint64_t> &Context, |
| 377 | SmallVectorImpl<uint64_t> &NewContext); |
| 378 | |
| 379 | ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; |
| 380 | |
| 381 | // The container for holding the FunctionSamples used by context trie. |
| 382 | std::list<FunctionSamples> FSamplesList; |
| 383 | |
| 384 | // Underlying context table serves for sample profile writer. |
| 385 | std::unordered_set<SampleContextFrameVector, SampleContextFrameHash> Contexts; |
| 386 | |
| 387 | SampleContextTracker ContextTracker; |
| 388 | |
| 389 | bool IsProfileValidOnTrie = true; |
| 390 | |
| 391 | public: |
| 392 | // Deduplicate adjacent repeated context sequences up to a given sequence |
| 393 | // length. -1 means no size limit. |
| 394 | static int32_t MaxCompressionSize; |
| 395 | static int MaxContextDepth; |
| 396 | }; |
| 397 | |
| 398 | } // end namespace sampleprof |
| 399 | } // end namespace llvm |
| 400 | |
| 401 | #endif |
| 402 | |