| 1 | //===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
| 10 | #define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
| 11 | #include "CSPreInliner.h" |
| 12 | #include "ErrorHandling.h" |
| 13 | #include "PerfReader.h" |
| 14 | #include "ProfiledBinary.h" |
| 15 | #include "llvm/IR/DebugInfoMetadata.h" |
| 16 | #include "llvm/ProfileData/SampleProfWriter.h" |
| 17 | #include <memory> |
| 18 | #include <unordered_set> |
| 19 | |
| 20 | namespace llvm { |
| 21 | namespace sampleprof { |
| 22 | |
| 23 | using ProbeCounterMap = |
| 24 | std::unordered_map<const MCDecodedPseudoProbe *, uint64_t>; |
| 25 | |
| 26 | // This base class for profile generation of sample-based PGO. We reuse all |
| 27 | // structures relating to function profiles and profile writers as seen in |
| 28 | // /ProfileData/SampleProf.h. |
| 29 | class ProfileGeneratorBase { |
| 30 | |
| 31 | public: |
| 32 | ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; |
| 33 | ProfileGeneratorBase(ProfiledBinary *Binary, |
| 34 | const ContextSampleCounterMap *Counters) |
| 35 | : Binary(Binary), SampleCounters(Counters){}; |
| 36 | ProfileGeneratorBase(ProfiledBinary *Binary, |
| 37 | const SampleProfileMap &&Profiles) |
| 38 | : Binary(Binary), ProfileMap(std::move(Profiles)){}; |
| 39 | |
| 40 | virtual ~ProfileGeneratorBase() = default; |
| 41 | static std::unique_ptr<ProfileGeneratorBase> |
| 42 | create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, |
| 43 | bool profileIsCS); |
| 44 | static std::unique_ptr<ProfileGeneratorBase> |
| 45 | create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, |
| 46 | bool profileIsCS); |
| 47 | virtual void generateProfile() = 0; |
| 48 | void write(); |
| 49 | |
| 50 | static uint32_t |
| 51 | getDuplicationFactor(unsigned Discriminator, |
| 52 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
| 53 | return UseFSD ? 1 |
| 54 | : llvm::DILocation::getDuplicationFactorFromDiscriminator( |
| 55 | D: Discriminator); |
| 56 | } |
| 57 | |
| 58 | static uint32_t |
| 59 | getBaseDiscriminator(unsigned Discriminator, |
| 60 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
| 61 | return UseFSD ? Discriminator |
| 62 | : DILocation::getBaseDiscriminatorFromDiscriminator( |
| 63 | D: Discriminator, /* IsFSDiscriminator */ IsFSDiscriminator: false); |
| 64 | } |
| 65 | |
| 66 | static bool UseFSDiscriminator; |
| 67 | |
| 68 | protected: |
| 69 | // Use SampleProfileWriter to serialize profile map |
| 70 | void write(std::unique_ptr<SampleProfileWriter> Writer, |
| 71 | SampleProfileMap &ProfileMap); |
| 72 | /* |
| 73 | For each region boundary point, mark if it is begin or end (or both) of |
| 74 | the region. Boundary points are inclusive. Log the sample count as well |
| 75 | so we can use it when we compute the sample count of each disjoint region |
| 76 | later. Note that there might be multiple ranges with different sample |
| 77 | count that share same begin/end point. We need to accumulate the sample |
| 78 | count for the boundary point for such case, because for the example |
| 79 | below, |
| 80 | |
| 81 | |<--100-->| |
| 82 | |<------200------>| |
| 83 | A B C |
| 84 | |
| 85 | sample count for disjoint region [A,B] would be 300. |
| 86 | */ |
| 87 | void findDisjointRanges(RangeSample &DisjointRanges, |
| 88 | const RangeSample &Ranges); |
| 89 | |
| 90 | // Go through each address from range to extract the top frame probe by |
| 91 | // looking up in the Address2ProbeMap |
| 92 | void (const RangeSample &RangeCounter, |
| 93 | ProbeCounterMap &ProbeCounter, |
| 94 | bool FindDisjointRanges = true); |
| 95 | |
| 96 | // Helper function for updating body sample for a leaf location in |
| 97 | // FunctionProfile |
| 98 | void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, |
| 99 | const SampleContextFrame &LeafLoc, |
| 100 | uint64_t Count); |
| 101 | |
| 102 | void updateFunctionSamples(); |
| 103 | |
| 104 | void updateTotalSamples(); |
| 105 | |
| 106 | void updateCallsiteSamples(); |
| 107 | |
| 108 | void filterAmbiguousProfile(SampleProfileMap &Profiles); |
| 109 | |
| 110 | bool filterAmbiguousProfile(FunctionSamples &FS); |
| 111 | |
| 112 | StringRef getCalleeNameForAddress(uint64_t TargetAddress); |
| 113 | |
| 114 | void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); |
| 115 | |
| 116 | void calculateBodySamplesAndSize(const FunctionSamples &FSamples, |
| 117 | uint64_t &TotalBodySamples, |
| 118 | uint64_t &FuncBodySize); |
| 119 | |
| 120 | double calculateDensity(const SampleProfileMap &Profiles); |
| 121 | |
| 122 | void calculateAndShowDensity(const SampleProfileMap &Profiles); |
| 123 | |
| 124 | void showDensitySuggestion(double Density); |
| 125 | |
| 126 | void markAllContextPreinlined(SampleProfileMap &ProfileMap); |
| 127 | |
| 128 | void collectProfiledFunctions(); |
| 129 | |
| 130 | bool collectFunctionsFromRawProfile( |
| 131 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions); |
| 132 | |
| 133 | // Collect profiled Functions for llvm sample profile input. |
| 134 | virtual bool collectFunctionsFromLLVMProfile( |
| 135 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) = 0; |
| 136 | |
| 137 | // List of function prefix to filter out. |
| 138 | static constexpr const char *FuncPrefixsToFilter[] = {"__cxx_global_var_init" , |
| 139 | "__tls_init" }; |
| 140 | |
| 141 | // Thresholds from profile summary to answer isHotCount/isColdCount queries. |
| 142 | uint64_t HotCountThreshold; |
| 143 | |
| 144 | uint64_t ColdCountThreshold; |
| 145 | |
| 146 | ProfiledBinary *Binary = nullptr; |
| 147 | |
| 148 | std::unique_ptr<ProfileSummary> Summary; |
| 149 | |
| 150 | // Used by SampleProfileWriter |
| 151 | SampleProfileMap ProfileMap; |
| 152 | |
| 153 | const ContextSampleCounterMap *SampleCounters = nullptr; |
| 154 | }; |
| 155 | |
| 156 | class ProfileGenerator : public ProfileGeneratorBase { |
| 157 | |
| 158 | public: |
| 159 | ProfileGenerator(ProfiledBinary *Binary, |
| 160 | const ContextSampleCounterMap *Counters) |
| 161 | : ProfileGeneratorBase(Binary, Counters){}; |
| 162 | ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) |
| 163 | : ProfileGeneratorBase(Binary, std::move(Profiles)){}; |
| 164 | void generateProfile() override; |
| 165 | |
| 166 | private: |
| 167 | void generateLineNumBasedProfile(); |
| 168 | void generateProbeBasedProfile(); |
| 169 | RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); |
| 170 | FunctionSamples &getTopLevelFunctionProfile(FunctionId FuncName); |
| 171 | // Helper function to get the leaf frame's FunctionProfile by traversing the |
| 172 | // inline stack and meanwhile it adds the total samples for each frame's |
| 173 | // function profile. |
| 174 | FunctionSamples & |
| 175 | getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, |
| 176 | uint64_t Count); |
| 177 | void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); |
| 178 | void |
| 179 | populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); |
| 180 | void |
| 181 | populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); |
| 182 | void populateBoundarySamplesWithProbesForAllFunctions( |
| 183 | const BranchSample &BranchCounters); |
| 184 | void |
| 185 | populateTypeSamplesForAllFunctions(const DataAccessSample &DataAccessSamples); |
| 186 | void postProcessProfiles(); |
| 187 | void trimColdProfiles(const SampleProfileMap &Profiles, |
| 188 | uint64_t ColdCntThreshold); |
| 189 | bool collectFunctionsFromLLVMProfile( |
| 190 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
| 191 | }; |
| 192 | |
| 193 | class CSProfileGenerator : public ProfileGeneratorBase { |
| 194 | public: |
| 195 | CSProfileGenerator(ProfiledBinary *Binary, |
| 196 | const ContextSampleCounterMap *Counters) |
| 197 | : ProfileGeneratorBase(Binary, Counters){}; |
| 198 | CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles) |
| 199 | : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){}; |
| 200 | void generateProfile() override; |
| 201 | |
| 202 | // Trim the context stack at a given depth. |
| 203 | template <typename T> |
| 204 | static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) { |
| 205 | if (Depth < 0 || static_cast<size_t>(Depth) >= S.size()) |
| 206 | return; |
| 207 | std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(), |
| 208 | S.begin()); |
| 209 | S.resize(Depth); |
| 210 | } |
| 211 | |
| 212 | // Remove adjacent repeated context sequences up to a given sequence length, |
| 213 | // -1 means no size limit. Note that repeated sequences are identified based |
| 214 | // on the exact call site, this is finer granularity than function recursion. |
| 215 | template <typename T> |
| 216 | static void compressRecursionContext(SmallVectorImpl<T> &Context, |
| 217 | int32_t CSize = MaxCompressionSize) { |
| 218 | uint32_t I = 1; |
| 219 | uint32_t HS = static_cast<uint32_t>(Context.size() / 2); |
| 220 | uint32_t MaxDedupSize = |
| 221 | CSize == -1 ? HS : std::min(a: static_cast<uint32_t>(CSize), b: HS); |
| 222 | auto BeginIter = Context.begin(); |
| 223 | // Use an in-place algorithm to save memory copy |
| 224 | // End indicates the end location of current iteration's data |
| 225 | uint32_t End = 0; |
| 226 | // Deduplicate from length 1 to the max possible size of a repeated |
| 227 | // sequence. |
| 228 | while (I <= MaxDedupSize) { |
| 229 | // This is a linear algorithm that deduplicates adjacent repeated |
| 230 | // sequences of size I. The deduplication detection runs on a sliding |
| 231 | // window whose size is 2*I and it keeps sliding the window to deduplicate |
| 232 | // the data inside. Once duplication is detected, deduplicate it by |
| 233 | // skipping the right half part of the window, otherwise just copy back |
| 234 | // the new one by appending them at the back of End pointer(for the next |
| 235 | // iteration). |
| 236 | // |
| 237 | // For example: |
| 238 | // Input: [a1, a2, b1, b2] |
| 239 | // (Added index to distinguish the same char, the origin is [a, a, b, |
| 240 | // b], the size of the dedup window is 2(I = 1) at the beginning) |
| 241 | // |
| 242 | // 1) The initial status is a dummy window[null, a1], then just copy the |
| 243 | // right half of the window(End = 0), then slide the window. |
| 244 | // Result: [a1], a2, b1, b2 (End points to the element right before ], |
| 245 | // after ] is the data of the previous iteration) |
| 246 | // |
| 247 | // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of |
| 248 | // the window i.e the duplication happen. Only slide the window. |
| 249 | // Result: [a1], a2, b1, b2 |
| 250 | // |
| 251 | // 3) Next window is [a2, b1], copy the right half of the window(b1 is |
| 252 | // new) to the End and slide the window. |
| 253 | // Result: [a1, b1], b1, b2 |
| 254 | // |
| 255 | // 4) Next window is [b1, b2], same to 2), skip b2. |
| 256 | // Result: [a1, b1], b1, b2 |
| 257 | // After resize, it will be [a, b] |
| 258 | |
| 259 | // Use pointers like below to do comparison inside the window |
| 260 | // [a b c a b c] |
| 261 | // | | | | | |
| 262 | // LeftBoundary Left Right Left+I Right+I |
| 263 | // A duplication found if Left < LeftBoundry. |
| 264 | |
| 265 | int32_t Right = I - 1; |
| 266 | End = I; |
| 267 | int32_t LeftBoundary = 0; |
| 268 | while (Right + I < Context.size()) { |
| 269 | // To avoids scanning a part of a sequence repeatedly, it finds out |
| 270 | // the common suffix of two hald in the window. The common suffix will |
| 271 | // serve as the common prefix of next possible pair of duplicate |
| 272 | // sequences. The non-common part will be ignored and never scanned |
| 273 | // again. |
| 274 | |
| 275 | // For example. |
| 276 | // Input: [a, b1], c1, b2, c2 |
| 277 | // I = 2 |
| 278 | // |
| 279 | // 1) For the window [a, b1, c1, b2], non-common-suffix for the right |
| 280 | // part is 'c1', copy it and only slide the window 1 step. |
| 281 | // Result: [a, b1, c1], b2, c2 |
| 282 | // |
| 283 | // 2) Next window is [b1, c1, b2, c2], so duplication happen. |
| 284 | // Result after resize: [a, b, c] |
| 285 | |
| 286 | int32_t Left = Right; |
| 287 | while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { |
| 288 | // Find the longest suffix inside the window. When stops, Left points |
| 289 | // at the diverging point in the current sequence. |
| 290 | Left--; |
| 291 | } |
| 292 | |
| 293 | bool DuplicationFound = (Left < LeftBoundary); |
| 294 | // Don't need to recheck the data before Right |
| 295 | LeftBoundary = Right + 1; |
| 296 | if (DuplicationFound) { |
| 297 | // Duplication found, skip right half of the window. |
| 298 | Right += I; |
| 299 | } else { |
| 300 | // Copy the non-common-suffix part of the adjacent sequence. |
| 301 | std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, |
| 302 | BeginIter + End); |
| 303 | End += Left + I - Right; |
| 304 | // Only slide the window by the size of non-common-suffix |
| 305 | Right = Left + I; |
| 306 | } |
| 307 | } |
| 308 | // Don't forget the remaining part that's not scanned. |
| 309 | std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); |
| 310 | End += Context.size() - Right - 1; |
| 311 | I++; |
| 312 | Context.resize(End); |
| 313 | MaxDedupSize = std::min(a: static_cast<uint32_t>(End / 2), b: MaxDedupSize); |
| 314 | } |
| 315 | } |
| 316 | |
| 317 | private: |
| 318 | void generateLineNumBasedProfile(); |
| 319 | |
| 320 | FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode, |
| 321 | bool WasLeafInlined = false); |
| 322 | |
| 323 | // Lookup or create ContextTrieNode for the context, FunctionSamples is |
| 324 | // created inside this function. |
| 325 | ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context, |
| 326 | bool WasLeafInlined = false); |
| 327 | |
| 328 | // For profiled only functions, on-demand compute their inline context |
| 329 | // function byte size which is used by the pre-inliner. |
| 330 | void computeSizeForProfiledFunctions(); |
| 331 | // Post processing for profiles before writing out, such as mermining |
| 332 | // and trimming cold profiles, running preinliner on profiles. |
| 333 | void postProcessProfiles(); |
| 334 | |
| 335 | void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, |
| 336 | const RangeSample &RangeCounters); |
| 337 | |
| 338 | void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, |
| 339 | const BranchSample &BranchCounters); |
| 340 | |
| 341 | void populateInferredFunctionSamples(ContextTrieNode &Node); |
| 342 | |
| 343 | void updateFunctionSamples(); |
| 344 | |
| 345 | void generateProbeBasedProfile(); |
| 346 | |
| 347 | // Fill in function body samples from probes |
| 348 | void populateBodySamplesWithProbes(const RangeSample &RangeCounter, |
| 349 | const AddrBasedCtxKey *CtxKey); |
| 350 | // Fill in boundary samples for a call probe |
| 351 | void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, |
| 352 | const AddrBasedCtxKey *CtxKey); |
| 353 | |
| 354 | ContextTrieNode * |
| 355 | getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, |
| 356 | const MCDecodedPseudoProbe *LeafProbe); |
| 357 | |
| 358 | // Helper function to get FunctionSamples for the leaf probe |
| 359 | FunctionSamples & |
| 360 | getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, |
| 361 | const MCDecodedPseudoProbe *LeafProbe); |
| 362 | |
| 363 | void convertToProfileMap(ContextTrieNode &Node, |
| 364 | SampleContextFrameVector &Context); |
| 365 | |
| 366 | void convertToProfileMap(); |
| 367 | |
| 368 | void computeSummaryAndThreshold(); |
| 369 | |
| 370 | bool collectFunctionsFromLLVMProfile( |
| 371 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
| 372 | |
| 373 | void initializeMissingFrameInferrer(); |
| 374 | |
| 375 | // Given an input `Context`, output `NewContext` with inferred missing tail |
| 376 | // call frames. |
| 377 | void inferMissingFrames(const SmallVectorImpl<uint64_t> &Context, |
| 378 | SmallVectorImpl<uint64_t> &NewContext); |
| 379 | |
| 380 | ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; |
| 381 | |
| 382 | // The container for holding the FunctionSamples used by context trie. |
| 383 | std::list<FunctionSamples> FSamplesList; |
| 384 | |
| 385 | // Underlying context table serves for sample profile writer. |
| 386 | std::unordered_set<SampleContextFrameVector, SampleContextFrameHash> Contexts; |
| 387 | |
| 388 | SampleContextTracker ContextTracker; |
| 389 | |
| 390 | bool IsProfileValidOnTrie = true; |
| 391 | |
| 392 | public: |
| 393 | // Deduplicate adjacent repeated context sequences up to a given sequence |
| 394 | // length. -1 means no size limit. |
| 395 | static int32_t MaxCompressionSize; |
| 396 | static int MaxContextDepth; |
| 397 | }; |
| 398 | |
| 399 | } // end namespace sampleprof |
| 400 | } // end namespace llvm |
| 401 | |
| 402 | #endif |
| 403 | |