1//===- MemProfUse.cpp - memory allocation profile use pass --*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the MemProfUsePass which reads memory profiling data
10// and uses it to add metadata to instructions to guide optimization.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Transforms/Instrumentation/MemProfUse.h"
15#include "llvm/ADT/SmallVector.h"
16#include "llvm/ADT/Statistic.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/Analysis/MemoryProfileInfo.h"
19#include "llvm/Analysis/OptimizationRemarkEmitter.h"
20#include "llvm/Analysis/StaticDataProfileInfo.h"
21#include "llvm/Analysis/TargetLibraryInfo.h"
22#include "llvm/IR/DiagnosticInfo.h"
23#include "llvm/IR/Function.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/Module.h"
26#include "llvm/ProfileData/DataAccessProf.h"
27#include "llvm/ProfileData/InstrProf.h"
28#include "llvm/ProfileData/InstrProfReader.h"
29#include "llvm/ProfileData/MemProfCommon.h"
30#include "llvm/Support/BLAKE3.h"
31#include "llvm/Support/CommandLine.h"
32#include "llvm/Support/Debug.h"
33#include "llvm/Support/HashBuilder.h"
34#include "llvm/Support/VirtualFileSystem.h"
35#include "llvm/Transforms/Utils/LongestCommonSequence.h"
36#include <map>
37#include <set>
38
39using namespace llvm;
40using namespace llvm::memprof;
41
42#define DEBUG_TYPE "memprof"
43
namespace llvm {
// Shared PGO warning controls, declared extern here and defined elsewhere in
// LLVM (not visible in this file); MemProf reuses the same warning behavior
// as regular profile use.
extern cl::opt<bool> PGOWarnMissing;
extern cl::opt<bool> NoPGOWarnMismatch;
extern cl::opt<bool> NoPGOWarnMismatchComdatWeak;
extern cl::opt<bool> AnnotateStringLiteralSectionPrefix;
} // namespace llvm
50
// By default disable matching of allocation profiles onto operator new that
// already explicitly pass a hot/cold hint, since we don't currently
// override these hints anyway.
static cl::opt<bool> ClMemProfMatchHotColdNew(
    "memprof-match-hot-cold-new",
    cl::desc(
        "Match allocation profiles onto existing hot/cold operator new calls"),
    cl::Hidden, cl::init(false));

// Debugging aid: print a summary for each matched allocation context.
static cl::opt<bool>
    ClPrintMemProfMatchInfo("memprof-print-match-info",
                            cl::desc("Print matching stats for each allocation "
                                     "context in this module's profiles"),
                            cl::Hidden, cl::init(false));

// Extends -memprof-print-match-info with the full profiled call stack of
// each matched allocation context.
static cl::opt<bool> PrintMatchedAllocStack(
    "memprof-print-matched-alloc-stack",
    cl::desc("Print full stack context for matched "
             "allocations with -memprof-print-match-info."),
    cl::Hidden, cl::init(false));

// Debugging aid: dump the GUID computed for each function during matching.
static cl::opt<bool>
    PrintFunctionGuids("memprof-print-function-guids",
                       cl::desc("Print function GUIDs computed for matching"),
                       cl::Hidden, cl::init(false));

// When enabled, undrift source locations in the profile before matching, so
// that a stale profile can still be applied to drifted code.
static cl::opt<bool>
    SalvageStaleProfile("memprof-salvage-stale-profile",
                        cl::desc("Salvage stale MemProf profile"),
                        cl::init(false), cl::Hidden);

// Attach profiled callee GUIDs to indirect calls as value-profile metadata;
// see addVPMetadata below.
static cl::opt<bool> ClMemProfAttachCalleeGuids(
    "memprof-attach-calleeguids",
    cl::desc(
        "Attach calleeguids as value profile metadata for indirect calls."),
    cl::init(true), cl::Hidden);

// Threshold (percent of matched bytes that are cold) at which an allocation
// is hinted cold without context sensitivity; 100 effectively disables this.
static cl::opt<unsigned> MinMatchedColdBytePercent(
    "memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
    cl::desc("Min percent of cold bytes matched to hint allocation cold"));

// When enabled, annotate eligible global variables with a section prefix.
static cl::opt<bool> AnnotateStaticDataSectionPrefix(
    "memprof-annotate-static-data-prefix", cl::init(false), cl::Hidden,
    cl::desc("If true, annotate the static data section prefix"));
95
// Matching statistics (reported via the LLVM -stats machinery).
STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
STATISTIC(NumOfMemProfMismatch,
          "Number of functions having mismatched memory profile hash.");
STATISTIC(NumOfMemProfFunc, "Number of functions having valid memory profile.");
STATISTIC(NumOfMemProfAllocContextProfiles,
          "Number of alloc contexts in memory profile.");
STATISTIC(NumOfMemProfCallSiteProfiles,
          "Number of callsites in memory profile.");
STATISTIC(NumOfMemProfMatchedAllocContexts,
          "Number of matched memory profile alloc contexts.");
STATISTIC(NumOfMemProfMatchedAllocs,
          "Number of matched memory profile allocs.");
STATISTIC(NumOfMemProfMatchedCallSites,
          "Number of matched memory profile callsites.");
STATISTIC(NumOfMemProfHotGlobalVars,
          "Number of global vars annotated with 'hot' section prefix.");
STATISTIC(NumOfMemProfColdGlobalVars,
          "Number of global vars annotated with 'unlikely' section prefix.");
STATISTIC(NumOfMemProfUnknownGlobalVars,
          "Number of global vars with unknown hotness (no section prefix).");
STATISTIC(NumOfMemProfExplicitSectionGlobalVars,
          "Number of global vars with user-specified section (not annotated).");
119
120static void addCallsiteMetadata(Instruction &I,
121 ArrayRef<uint64_t> InlinedCallStack,
122 LLVMContext &Ctx) {
123 I.setMetadata(KindID: LLVMContext::MD_callsite,
124 Node: buildCallstackMetadata(CallStack: InlinedCallStack, Ctx));
125}
126
127static uint64_t computeStackId(GlobalValue::GUID Function, uint32_t LineOffset,
128 uint32_t Column) {
129 llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
130 HashBuilder;
131 HashBuilder.add(Args: Function, Args: LineOffset, Args: Column);
132 llvm::BLAKE3Result<8> Hash = HashBuilder.final();
133 uint64_t Id;
134 std::memcpy(dest: &Id, src: Hash.data(), n: sizeof(Hash));
135 return Id;
136}
137
138static uint64_t computeStackId(const memprof::Frame &Frame) {
139 return computeStackId(Function: Frame.Function, LineOffset: Frame.LineOffset, Column: Frame.Column);
140}
141
142static AllocationType getAllocType(const AllocationInfo *AllocInfo) {
143 return getAllocType(TotalLifetimeAccessDensity: AllocInfo->Info.getTotalLifetimeAccessDensity(),
144 AllocCount: AllocInfo->Info.getAllocCount(),
145 TotalLifetime: AllocInfo->Info.getTotalLifetime());
146}
147
148static AllocationType addCallStack(CallStackTrie &AllocTrie,
149 const AllocationInfo *AllocInfo,
150 uint64_t FullStackId) {
151 SmallVector<uint64_t> StackIds;
152 for (const auto &StackFrame : AllocInfo->CallStack)
153 StackIds.push_back(Elt: computeStackId(Frame: StackFrame));
154 auto AllocType = getAllocType(AllocInfo);
155 std::vector<ContextTotalSize> ContextSizeInfo;
156 if (recordContextSizeInfoForAnalysis()) {
157 auto TotalSize = AllocInfo->Info.getTotalSize();
158 assert(TotalSize);
159 assert(FullStackId != 0);
160 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
161 }
162 AllocTrie.addCallStack(AllocType, StackIds, ContextSizeInfo: std::move(ContextSizeInfo));
163 return AllocType;
164}
165
166// Return true if InlinedCallStack, computed from a call instruction's debug
167// info, is a prefix of ProfileCallStack, a list of Frames from profile data
168// (either the allocation data or a callsite).
169static bool
170stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack,
171 ArrayRef<uint64_t> InlinedCallStack) {
172 return ProfileCallStack.size() >= InlinedCallStack.size() &&
173 llvm::equal(LRange: ProfileCallStack.take_front(N: InlinedCallStack.size()),
174 RRange&: InlinedCallStack, P: [](const Frame &F, uint64_t StackId) {
175 return computeStackId(Frame: F) == StackId;
176 });
177}
178
// Return true if Callee is a recognized heap allocation routine for which a
// hot/cold-hinted variant exists and which is therefore eligible for
// MemProf hinting. Calls that already pass a __hot_cold_t hint are only
// eligible when -memprof-match-hot-cold-new is set.
static bool isAllocationWithHotColdVariant(const Function *Callee,
                                           const TargetLibraryInfo &TLI) {
  if (!Callee)
    return false;
  LibFunc Func;
  if (!TLI.getLibFunc(*Callee, Func))
    return false;
  switch (Func) {
  // operator new / operator new[] (plus nothrow/aligned and size-returning
  // variants) without an explicit hint: always eligible.
  case LibFunc_Znwm:
  case LibFunc_ZnwmRKSt9nothrow_t:
  case LibFunc_ZnwmSt11align_val_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
  case LibFunc_Znam:
  case LibFunc_ZnamRKSt9nothrow_t:
  case LibFunc_ZnamSt11align_val_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
  case LibFunc_size_returning_new:
  case LibFunc_size_returning_new_aligned:
    return true;
  // Variants that already take a __hot_cold_t hint: only matched when
  // explicitly requested, since existing hints are not overridden.
  case LibFunc_Znwm12__hot_cold_t:
  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_Znam12__hot_cold_t:
  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_size_returning_new_hot_cold:
  case LibFunc_size_returning_new_aligned_hot_cold:
    return ClMemProfMatchHotColdNew;
  default:
    return false;
  }
}
213
// Record (via statistics and debug output) why global variable GVar is not
// eligible for section-prefix annotation. Kind must not be AnnotationOK.
static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
                                             AnnotationKind Kind) {
  assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK &&
         "Should not handle AnnotationOK here");
  SmallString<32> Reason;
  switch (Kind) {
  case llvm::memprof::AnnotationKind::ExplicitSection:
    // User already placed the variable in a specific section; counted
    // separately since it is the common expected reason.
    ++NumOfMemProfExplicitSectionGlobalVars;
    Reason.append("explicit section name");
    break;
  case llvm::memprof::AnnotationKind::DeclForLinker:
    Reason.append("linker declaration");
    break;
  case llvm::memprof::AnnotationKind::ReservedName:
    Reason.append("name starts with `llvm.`");
    break;
  default:
    llvm_unreachable("Unexpected annotation kind");
  }
  LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
                    << Reason << ".\n");
}
236
237// Computes the LLVM version of MD5 hash for the content of a string
238// literal.
239static std::optional<uint64_t>
240getStringContentHash(const GlobalVariable &GVar) {
241 auto *Initializer = GVar.getInitializer();
242 if (!Initializer)
243 return std::nullopt;
244 if (auto *C = dyn_cast<ConstantDataSequential>(Val: Initializer))
245 if (C->isString()) {
246 // Note the hash computed for the literal would include the null byte.
247 return llvm::MD5Hash(Str: C->getAsString());
248 }
249 return std::nullopt;
250}
251
// Structure for tracking info about matched allocation contexts for use with
// -memprof-print-match-info and -memprof-print-matched-alloc-stack.
struct AllocMatchInfo {
  // Total size in bytes of matched context.
  uint64_t TotalSize = 0;
  // Matched allocation's type.
  AllocationType AllocType = AllocationType::None;
  // Number of frames matched to the allocation itself (values will be >1 in
  // cases where allocation was already inlined). Use a set because there can
  // be multiple inlined instances and each may have a different inline depth.
  // Use std::set to iterate in sorted order when printing.
  std::set<unsigned> MatchedFramesSet;
  // The full call stack of the allocation, for cases where requested via
  // -memprof-print-matched-alloc-stack.
  std::vector<Frame> CallStack;

  // Caller responsible for inserting the matched frames and the call stack
  // when appropriate; the constructor records only size and type.
  AllocMatchInfo(uint64_t TotalSize, AllocationType AllocType)
      : TotalSize(TotalSize), AllocType(AllocType) {}
};
273
// Collect, for each caller GUID, the sorted and deduplicated list of
// (source location, callee GUID) pairs for every direct call in the module,
// including calls reconstructed from inlined debug locations. Frames on the
// inline stack leading to a profiled heap allocation routine are recorded
// with callee GUID 0 (see comments below). The result serves as the IR-side
// anchors for stale-profile (undrift) matching.
DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>>
memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI,
                            function_ref<bool(uint64_t)> IsPresentInProfile) {
  DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> Calls;

  // Line offset of DIL relative to the start of its subprogram, masked to
  // 16 bits.
  auto GetOffset = [](const DILocation *DIL) {
    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
           0xffff;
  };

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
          continue;

        auto *CB = dyn_cast<CallBase>(&I);
        auto *CalledFunction = CB->getCalledFunction();
        // Disregard indirect calls and intrinsics.
        if (!CalledFunction || CalledFunction->isIntrinsic())
          continue;

        StringRef CalleeName = CalledFunction->getName();
        // True if we are calling a heap allocation function that supports
        // hot/cold variants.
        bool IsAlloc = isAllocationWithHotColdVariant(CalledFunction, TLI);
        // True for the first iteration below, indicating that we are looking
        // at a leaf node.
        bool IsLeaf = true;
        // Walk up the inline stack, emitting one caller->callee edge per
        // inline frame.
        for (const DILocation *DIL = I.getDebugLoc(); DIL;
             DIL = DIL->getInlinedAt()) {
          StringRef CallerName = DIL->getSubprogramLinkageName();
          assert(!CallerName.empty() &&
                 "Be sure to enable -fdebug-info-for-profiling");
          uint64_t CallerGUID = memprof::getGUID(CallerName);
          uint64_t CalleeGUID = memprof::getGUID(CalleeName);
          // Pretend that we are calling a function with GUID == 0 if we are
          // in the inline stack leading to a heap allocation function.
          if (IsAlloc) {
            if (IsLeaf) {
              // For leaf nodes, set CalleeGUID to 0 without consulting
              // IsPresentInProfile.
              CalleeGUID = 0;
            } else if (!IsPresentInProfile(CalleeGUID)) {
              // In addition to the leaf case above, continue to set
              // CalleeGUID to 0 as long as we don't see CalleeGUID in the
              // profile.
              CalleeGUID = 0;
            } else {
              // Once we encounter a callee that exists in the profile, stop
              // setting CalleeGUID to 0.
              IsAlloc = false;
            }
          }

          LineLocation Loc = {GetOffset(DIL), DIL->getColumn()};
          Calls[CallerGUID].emplace_back(Loc, CalleeGUID);
          CalleeName = CallerName;
          IsLeaf = false;
        }
      }
    }
  }

  // Sort each call list by the source location and drop duplicates.
  for (auto &[CallerGUID, CallList] : Calls) {
    llvm::sort(CallList);
    CallList.erase(llvm::unique(CallList), CallList.end());
  }

  return Calls;
}
348
// Build, for each caller GUID, a map from profile source locations to the
// corresponding IR source locations, by running longest-common-sequence
// matching between the caller/callee anchors extracted from the profile and
// those extracted from the IR. Used to undrift stale profiles.
DenseMap<uint64_t, LocToLocMap>
memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
                           const TargetLibraryInfo &TLI) {
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;

  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile =
      MemProfReader->getMemProfCallerCalleePairs();
  // Only treat a callee as "present in profile" if the profile has a record
  // for it as a caller.
  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR =
      extractCallsFromIR(M, TLI, [&](uint64_t GUID) {
        return CallsFromProfile.contains(GUID);
      });

  // Compute an undrift map for each CallerGUID.
  for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) {
    auto It = CallsFromProfile.find(CallerGUID);
    if (It == CallsFromProfile.end())
      continue;
    const auto &ProfileAnchors = It->second;

    LocToLocMap Matchings;
    // Anchors agreeing on callee GUID are matched; each match records
    // profile location -> IR location.
    longestCommonSequence<LineLocation, GlobalValue::GUID>(
        ProfileAnchors, IRAnchors, std::equal_to<GlobalValue::GUID>(),
        [&](LineLocation A, LineLocation B) { Matchings.try_emplace(A, B); });
    [[maybe_unused]] bool Inserted =
        UndriftMaps.try_emplace(CallerGUID, std::move(Matchings)).second;

    // The insertion must succeed because we visit each GUID exactly once.
    assert(Inserted);
  }

  return UndriftMaps;
}
381
// Given a MemProfRecord, undrift all the source locations present in the
// record in place, using the per-function maps produced by
// computeUndriftMap. Frames without a mapping are left untouched.
static void
undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
                     memprof::MemProfRecord &MemProfRec) {
  // Undrift a call stack in place.
  auto UndriftCallStack = [&](std::vector<Frame> &CallStack) {
    for (auto &F : CallStack) {
      // Look up the undrift map for the frame's function ...
      auto I = UndriftMaps.find(F.Function);
      if (I == UndriftMaps.end())
        continue;
      // ... then the mapping for this specific (line offset, column).
      auto J = I->second.find(LineLocation(F.LineOffset, F.Column));
      if (J == I->second.end())
        continue;
      auto &NewLoc = J->second;
      F.LineOffset = NewLoc.LineOffset;
      F.Column = NewLoc.Column;
    }
  };

  // Apply to both allocation contexts and callsite frame lists.
  for (auto &AS : MemProfRec.AllocSites)
    UndriftCallStack(AS.CallStack);

  for (auto &CS : MemProfRec.CallSites)
    UndriftCallStack(CS.Frames);
}
408
// Helper function to process CalleeGuids and create value profile metadata
// on indirect call I, merging with any value-profile entries already present
// on the instruction. Newly added targets get a weight of 1. No-op when
// -memprof-attach-calleeguids is disabled or CalleeGuids is empty.
static void addVPMetadata(Module &M, Instruction &I,
                          ArrayRef<GlobalValue::GUID> CalleeGuids) {
  if (!ClMemProfAttachCalleeGuids || CalleeGuids.empty())
    return;

  // Prepare the vector of value data, initializing from any existing
  // value-profile metadata present on the instruction so that we merge the
  // new CalleeGuids into the existing entries.
  SmallVector<InstrProfValueData> VDs;
  uint64_t TotalCount = 0;

  if (I.getMetadata(LLVMContext::MD_prof)) {
    // Read all existing entries so we can merge them. Use a large
    // MaxNumValueData to retrieve all existing entries.
    VDs = getValueProfDataFromInst(I, IPVK_IndirectCallTarget,
                                   /*MaxNumValueData=*/UINT32_MAX, TotalCount);
  }

  // Save the original size for use later in detecting whether any were added.
  const size_t OriginalSize = VDs.size();

  // Initialize the set of existing guids with the original list.
  DenseSet<uint64_t> ExistingValues(
      llvm::from_range,
      llvm::map_range(
          VDs, [](const InstrProfValueData &Entry) { return Entry.Value; }));

  // Merge CalleeGuids into list of existing VDs, by appending any that are
  // not already included.
  VDs.reserve(OriginalSize + CalleeGuids.size());
  for (auto G : CalleeGuids) {
    if (!ExistingValues.insert(G).second)
      continue;
    InstrProfValueData NewEntry;
    NewEntry.Value = G;
    // For MemProf, we don't have actual call counts, so we assign
    // a weight of 1 to each potential target.
    // TODO: Consider making this weight configurable or increasing it to
    // improve effectiveness for ICP.
    NewEntry.Count = 1;
    TotalCount += NewEntry.Count;
    VDs.push_back(NewEntry);
  }

  // Update the VP metadata if we added any new callee GUIDs to the list.
  assert(VDs.size() >= OriginalSize);
  if (VDs.size() == OriginalSize)
    return;

  // First clear the existing !prof.
  I.setMetadata(LLVMContext::MD_prof, nullptr);

  // No need to sort the updated VDs as all appended entries have the same
  // count of 1, which is no larger than any existing entries. The incoming
  // list of CalleeGuids should already be deterministic for a given profile.
  annotateValueSite(M, I, VDs, TotalCount, IPVK_IndirectCallTarget, VDs.size());
}
467
// Match instruction I's inlined call stack against the candidate allocation
// contexts in AllocInfoSet (all sharing I's leaf location), and attach
// !memprof/!callsite metadata -- or a single alloc-type attribute when all
// matched contexts behave identically. Matched-context details are
// accumulated into FullStackIdToAllocMatchInfo when printing is requested.
static void handleAllocSite(
    Instruction &I, CallBase *CI, ArrayRef<uint64_t> InlinedCallStack,
    LLVMContext &Ctx, OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize,
    const std::set<const AllocationInfo *> &AllocInfoSet,
    std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo) {
  // TODO: Remove this once the profile creation logic deduplicates contexts
  // that are the same other than the IsInlineFrame bool. Until then, keep the
  // largest.
  DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo;
  for (auto *AllocInfo : AllocInfoSet) {
    auto FullStackId = computeFullStackId(AllocInfo->CallStack);
    auto [It, Inserted] =
        UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo});
    // If inserted entry, done.
    if (Inserted)
      continue;
    // Keep the larger one, or the noncold one if they are the same size.
    auto CurSize = It->second->Info.getTotalSize();
    auto NewSize = AllocInfo->Info.getTotalSize();
    if ((CurSize > NewSize) ||
        (CurSize == NewSize &&
         getAllocType(AllocInfo) != AllocationType::NotCold))
      continue;
    It->second = AllocInfo;
  }
  // We may match this instruction's location list to multiple MIB
  // contexts. Add them to a Trie specialized for trimming the contexts to
  // the minimal needed to disambiguate contexts with unique behavior.
  CallStackTrie AllocTrie(&ORE, MaxColdSize);
  uint64_t TotalSize = 0;
  uint64_t TotalColdSize = 0;
  for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) {
    // Check the full inlined call stack against this one.
    // If we found and thus matched all frames on the call, include
    // this MIB.
    if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                           InlinedCallStack)) {
      NumOfMemProfMatchedAllocContexts++;
      auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
      TotalSize += AllocInfo->Info.getTotalSize();
      if (AllocType == AllocationType::Cold)
        TotalColdSize += AllocInfo->Info.getTotalSize();
      // Record information about the allocation if match info printing
      // was requested.
      if (ClPrintMemProfMatchInfo) {
        assert(FullStackId != 0);
        auto [Iter, Inserted] = FullStackIdToAllocMatchInfo.try_emplace(
            FullStackId,
            AllocMatchInfo(AllocInfo->Info.getTotalSize(), AllocType));
        // Always insert the new matched frame count, since it may differ.
        Iter->second.MatchedFramesSet.insert(InlinedCallStack.size());
        // Copy the full profiled stack only once, on first insertion.
        if (Inserted && PrintMatchedAllocStack)
          Iter->second.CallStack.insert(Iter->second.CallStack.begin(),
                                        AllocInfo->CallStack.begin(),
                                        AllocInfo->CallStack.end());
      }
      ORE.emit(
          OptimizationRemark(DEBUG_TYPE, "MemProfUse", CI)
          << ore::NV("AllocationCall", CI) << " in function "
          << ore::NV("Caller", CI->getFunction())
          << " matched alloc context with alloc type "
          << ore::NV("Attribute", getAllocTypeAttributeString(AllocType))
          << " total size " << ore::NV("Size", AllocInfo->Info.getTotalSize())
          << " full context id " << ore::NV("Context", FullStackId)
          << " frame count " << ore::NV("Frames", InlinedCallStack.size()));
    }
  }
  // If the threshold for the percent of cold bytes is less than 100%,
  // and not all bytes are cold, see if we should still hint this
  // allocation as cold without context sensitivity.
  if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
      TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
    AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold, "dominant");
    return;
  }

  // We might not have matched any to the full inlined call stack.
  // But if we did, create and attach metadata, or a function attribute if
  // all contexts have identical profiled behavior.
  if (!AllocTrie.empty()) {
    NumOfMemProfMatchedAllocs++;
    // MemprofMDAttached will be false if a function attribute was
    // attached.
    bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
    assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
    if (MemprofMDAttached) {
      // Add callsite metadata for the instruction's location list so that
      // it simpler later on to identify which part of the MIB contexts
      // are from this particular instruction (including during inlining,
      // when the callsite metadata will be updated appropriately).
      // FIXME: can this be changed to strip out the matching stack
      // context ids from the MIB contexts and not add any callsite
      // metadata here to save space?
      addCallsiteMetadata(I, InlinedCallStack, Ctx);
    }
  }
}
565
// Helper struct for maintaining refs to callsite data. As an alternative we
// could store a pointer to the CallSiteInfo struct but we also need the frame
// index. Using ArrayRefs instead makes it a little easier to read. Both
// members are non-owning views into profile data that must outlive this
// entry.
struct CallSiteEntry {
  // Subset of frames for the corresponding CallSiteInfo.
  ArrayRef<Frame> Frames;
  // Potential targets for indirect calls.
  ArrayRef<GlobalValue::GUID> CalleeGuids;
};
575
// Match instruction I's inlined call stack against the candidate callsite
// entries (all sharing I's leaf location) and, on a match, attach !callsite
// metadata. For indirect calls, callee GUIDs from all matching entries are
// accumulated and attached as value-profile metadata.
static void handleCallSite(Instruction &I, const Function *CalledFunction,
                           ArrayRef<uint64_t> InlinedCallStack,
                           const std::vector<CallSiteEntry> &CallSiteEntries,
                           Module &M,
                           std::set<std::vector<uint64_t>> &MatchedCallSites,
                           OptimizationRemarkEmitter &ORE) {
  auto &Ctx = M.getContext();
  // Set of Callee GUIDs to attach to indirect calls. We accumulate all of
  // them to support cases where the instuction's inlined frames match
  // multiple call site entries, which can happen if the profile was collected
  // from a binary where this instruction was eventually inlined into multiple
  // callers.
  SetVector<GlobalValue::GUID> CalleeGuids;
  bool CallsiteMDAdded = false;
  for (const auto &CallSiteEntry : CallSiteEntries) {
    // If we found and thus matched all frames on the call, create and
    // attach call stack metadata.
    if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
                                           InlinedCallStack)) {
      NumOfMemProfMatchedCallSites++;
      // Only need to find one with a matching call stack and add a single
      // callsite metadata.
      if (!CallsiteMDAdded) {
        addCallsiteMetadata(I, InlinedCallStack, Ctx);

        // Accumulate call site matching information upon request.
        if (ClPrintMemProfMatchInfo) {
          std::vector<uint64_t> CallStack;
          append_range(CallStack, InlinedCallStack);
          MatchedCallSites.insert(std::move(CallStack));
        }
        OptimizationRemark Remark(DEBUG_TYPE, "MemProfUse", &I);
        Remark << ore::NV("CallSite", &I) << " in function "
               << ore::NV("Caller", I.getFunction())
               << " matched callsite with frame count "
               << ore::NV("Frames", InlinedCallStack.size())
               << " and stack ids";
        for (uint64_t StackId : InlinedCallStack)
          Remark << " " << ore::NV("StackId", StackId);
        ORE.emit(Remark);

        // If this is a direct call, we're done; indirect calls keep scanning
        // entries to collect more callee GUIDs.
        if (CalledFunction)
          break;
        CallsiteMDAdded = true;
      }

      assert(!CalledFunction && "Didn't expect direct call");

      // Collect Callee GUIDs from all matching CallSiteEntries.
      CalleeGuids.insert(CallSiteEntry.CalleeGuids.begin(),
                         CallSiteEntry.CalleeGuids.end());
    }
  }
  // Try to attach indirect call metadata if possible.
  addVPMetadata(M, I, CalleeGuids.getArrayRef());
}
632
633static void
634readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
635 const TargetLibraryInfo &TLI,
636 std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
637 std::set<std::vector<uint64_t>> &MatchedCallSites,
638 DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
639 OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) {
640 auto &Ctx = M.getContext();
641 // Previously we used getIRPGOFuncName() here. If F is local linkage,
642 // getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But
643 // llvm-profdata uses FuncName in dwarf to create GUID which doesn't
644 // contain FileName's prefix. It caused local linkage function can't
645 // find MemProfRecord. So we use getName() now.
646 // 'unique-internal-linkage-names' can make MemProf work better for local
647 // linkage function.
648 auto FuncName = F.getName();
649 auto FuncGUID = Function::getGUIDAssumingExternalLinkage(GlobalName: FuncName);
650 if (PrintFunctionGuids)
651 errs() << "MemProf: Function GUID " << FuncGUID << " is " << FuncName
652 << "\n";
653 std::optional<memprof::MemProfRecord> MemProfRec;
654 auto Err = MemProfReader->getMemProfRecord(FuncNameHash: FuncGUID).moveInto(Value&: MemProfRec);
655 if (Err) {
656 handleAllErrors(E: std::move(Err), Handlers: [&](const InstrProfError &IPE) {
657 auto Err = IPE.get();
658 bool SkipWarning = false;
659 LLVM_DEBUG(dbgs() << "Error in reading profile for Func " << FuncName
660 << ": ");
661 if (Err == instrprof_error::unknown_function) {
662 NumOfMemProfMissing++;
663 SkipWarning = !PGOWarnMissing;
664 LLVM_DEBUG(dbgs() << "unknown function");
665 } else if (Err == instrprof_error::hash_mismatch) {
666 NumOfMemProfMismatch++;
667 SkipWarning =
668 NoPGOWarnMismatch ||
669 (NoPGOWarnMismatchComdatWeak &&
670 (F.hasComdat() ||
671 F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
672 LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
673 }
674
675 if (SkipWarning)
676 return;
677
678 std::string Msg = (IPE.message() + Twine(" ") + F.getName().str() +
679 Twine(" Hash = ") + std::to_string(val: FuncGUID))
680 .str();
681
682 Ctx.diagnose(
683 DI: DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
684 });
685 return;
686 }
687
688 NumOfMemProfFunc++;
689
690 // If requested, undrfit MemProfRecord so that the source locations in it
691 // match those in the IR.
692 if (SalvageStaleProfile)
693 undriftMemProfRecord(UndriftMaps, MemProfRec&: *MemProfRec);
694
695 // Detect if there are non-zero column numbers in the profile. If not,
696 // treat all column numbers as 0 when matching (i.e. ignore any non-zero
697 // columns in the IR). The profiled binary might have been built with
698 // column numbers disabled, for example.
699 bool ProfileHasColumns = false;
700
701 // Build maps of the location hash to all profile data with that leaf location
702 // (allocation info and the callsites).
703 std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
704
705 // For the callsites we need to record slices of the frame array (see comments
706 // below where the map entries are added) along with their CalleeGuids.
707 std::map<uint64_t, std::vector<CallSiteEntry>> LocHashToCallSites;
708 for (auto &AI : MemProfRec->AllocSites) {
709 NumOfMemProfAllocContextProfiles++;
710 // Associate the allocation info with the leaf frame. The later matching
711 // code will match any inlined call sequences in the IR with a longer prefix
712 // of call stack frames.
713 uint64_t StackId = computeStackId(Frame: AI.CallStack[0]);
714 LocHashToAllocInfo[StackId].insert(x: &AI);
715 ProfileHasColumns |= AI.CallStack[0].Column;
716 }
717 for (auto &CS : MemProfRec->CallSites) {
718 NumOfMemProfCallSiteProfiles++;
719 // Need to record all frames from leaf up to and including this function,
720 // as any of these may or may not have been inlined at this point.
721 unsigned Idx = 0;
722 for (auto &StackFrame : CS.Frames) {
723 uint64_t StackId = computeStackId(Frame: StackFrame);
724 ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames).drop_front(N: Idx++);
725 // The callee guids for the slice containing all frames (due to the
726 // increment above Idx is now 1) comes from the CalleeGuids recorded in
727 // the CallSite. For the slices not containing the leaf-most frame, the
728 // callee guid is simply the function GUID of the prior frame.
729 LocHashToCallSites[StackId].push_back(
730 x: {.Frames: FrameSlice, .CalleeGuids: (Idx == 1 ? CS.CalleeGuids
731 : ArrayRef<GlobalValue::GUID>(
732 CS.Frames[Idx - 2].Function))});
733
734 ProfileHasColumns |= StackFrame.Column;
735 // Once we find this function, we can stop recording.
736 if (StackFrame.Function == FuncGUID)
737 break;
738 }
739 assert(Idx <= CS.Frames.size() && CS.Frames[Idx - 1].Function == FuncGUID);
740 }
741
742 auto GetOffset = [](const DILocation *DIL) {
743 return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
744 0xffff;
745 };
746
747 // Now walk the instructions, looking up the associated profile data using
748 // debug locations.
749 for (auto &BB : F) {
750 for (auto &I : BB) {
751 if (I.isDebugOrPseudoInst())
752 continue;
753 // We are only interested in calls (allocation or interior call stack
754 // context calls).
755 auto *CI = dyn_cast<CallBase>(Val: &I);
756 if (!CI)
757 continue;
758 auto *CalledFunction = CI->getCalledFunction();
759 if (CalledFunction && CalledFunction->isIntrinsic())
760 continue;
761 // List of call stack ids computed from the location hashes on debug
762 // locations (leaf to inlined at root).
763 SmallVector<uint64_t, 8> InlinedCallStack;
764 // Was the leaf location found in one of the profile maps?
765 bool LeafFound = false;
766 // If leaf was found in a map, iterators pointing to its location in both
767 // of the maps. It might exist in neither, one, or both (the latter case
768 // can happen because we don't currently have discriminators to
769 // distinguish the case when a single line/col maps to both an allocation
770 // and another callsite).
771 auto AllocInfoIter = LocHashToAllocInfo.end();
772 auto CallSitesIter = LocHashToCallSites.end();
773 for (const DILocation *DIL = I.getDebugLoc(); DIL != nullptr;
774 DIL = DIL->getInlinedAt()) {
775 // Use C++ linkage name if possible. Need to compile with
776 // -fdebug-info-for-profiling to get linkage name.
777 StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
778 if (Name.empty())
779 Name = DIL->getScope()->getSubprogram()->getName();
780 auto CalleeGUID = Function::getGUIDAssumingExternalLinkage(GlobalName: Name);
781 auto StackId = computeStackId(Function: CalleeGUID, LineOffset: GetOffset(DIL),
782 Column: ProfileHasColumns ? DIL->getColumn() : 0);
783 // Check if we have found the profile's leaf frame. If yes, collect
784 // the rest of the call's inlined context starting here. If not, see if
785 // we find a match further up the inlined context (in case the profile
786 // was missing debug frames at the leaf).
787 if (!LeafFound) {
788 AllocInfoIter = LocHashToAllocInfo.find(x: StackId);
789 CallSitesIter = LocHashToCallSites.find(x: StackId);
790 if (AllocInfoIter != LocHashToAllocInfo.end() ||
791 CallSitesIter != LocHashToCallSites.end())
792 LeafFound = true;
793 }
794 if (LeafFound)
795 InlinedCallStack.push_back(Elt: StackId);
796 }
797 // If leaf not in either of the maps, skip inst.
798 if (!LeafFound)
799 continue;
800
801 // First add !memprof metadata from allocation info, if we found the
802 // instruction's leaf location in that map, and if the rest of the
803 // instruction's locations match the prefix Frame locations on an
804 // allocation context with the same leaf.
805 if (AllocInfoIter != LocHashToAllocInfo.end() &&
806 // Only consider allocations which support hinting.
807 isAllocationWithHotColdVariant(Callee: CI->getCalledFunction(), TLI))
808 handleAllocSite(I, CI, InlinedCallStack, Ctx, ORE, MaxColdSize,
809 AllocInfoSet: AllocInfoIter->second, FullStackIdToAllocMatchInfo);
810 else if (CallSitesIter != LocHashToCallSites.end())
811 // Otherwise, add callsite metadata. If we reach here then we found the
812 // instruction's leaf location in the callsites map and not the
813 // allocation map.
814 handleCallSite(I, CalledFunction, InlinedCallStack,
815 CallSiteEntries: CallSitesIter->second, M, MatchedCallSites, ORE);
816 }
817 }
818}
819
820MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile,
821 IntrusiveRefCntPtr<vfs::FileSystem> FS)
822 : MemoryProfileFileName(MemoryProfileFile), FS(FS) {
823 if (!FS)
824 this->FS = vfs::getRealFileSystem();
825}
826
827PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
828 // Return immediately if the module doesn't contain any function or global
829 // variables.
830 if (M.empty() && M.globals().empty())
831 return PreservedAnalyses::all();
832
833 LLVM_DEBUG(dbgs() << "Read in memory profile:\n");
834 auto &Ctx = M.getContext();
835 auto ReaderOrErr = IndexedInstrProfReader::create(Path: MemoryProfileFileName, FS&: *FS);
836 if (Error E = ReaderOrErr.takeError()) {
837 handleAllErrors(E: std::move(E), Handlers: [&](const ErrorInfoBase &EI) {
838 Ctx.diagnose(
839 DI: DiagnosticInfoPGOProfile(MemoryProfileFileName.data(), EI.message()));
840 });
841 return PreservedAnalyses::all();
842 }
843
844 std::unique_ptr<IndexedInstrProfReader> MemProfReader =
845 std::move(ReaderOrErr.get());
846 if (!MemProfReader) {
847 Ctx.diagnose(DI: DiagnosticInfoPGOProfile(
848 MemoryProfileFileName.data(), StringRef("Cannot get MemProfReader")));
849 return PreservedAnalyses::all();
850 }
851
852 if (!MemProfReader->hasMemoryProfile()) {
853 Ctx.diagnose(DI: DiagnosticInfoPGOProfile(MemoryProfileFileName.data(),
854 "Not a memory profile"));
855 return PreservedAnalyses::all();
856 }
857
858 const bool Changed =
859 annotateGlobalVariables(M, DataAccessProf: MemProfReader->getDataAccessProfileData());
860
861 // If the module doesn't contain any function, return after we process all
862 // global variables.
863 if (M.empty())
864 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
865
866 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
867
868 TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: *M.begin());
869 DenseMap<uint64_t, LocToLocMap> UndriftMaps;
870 if (SalvageStaleProfile)
871 UndriftMaps = computeUndriftMap(M, MemProfReader: MemProfReader.get(), TLI);
872
873 // Map from the stack hash of each matched allocation context in the function
874 // profiles to match info such as the total profiled size (bytes), allocation
875 // type, number of frames matched to the allocation itself, and the full array
876 // of call stack ids.
877 std::map<uint64_t, AllocMatchInfo> FullStackIdToAllocMatchInfo;
878
879 // Set of the matched call sites, each expressed as a sequence of an inline
880 // call stack.
881 std::set<std::vector<uint64_t>> MatchedCallSites;
882
883 uint64_t MaxColdSize = 0;
884 if (auto *MemProfSum = MemProfReader->getMemProfSummary())
885 MaxColdSize = MemProfSum->getMaxColdTotalSize();
886
887 for (auto &F : M) {
888 if (F.isDeclaration())
889 continue;
890
891 const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
892 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
893 readMemprof(M, F, MemProfReader: MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
894 MatchedCallSites, UndriftMaps, ORE, MaxColdSize);
895 }
896
897 if (ClPrintMemProfMatchInfo) {
898 for (const auto &[Id, Info] : FullStackIdToAllocMatchInfo) {
899 for (auto Frames : Info.MatchedFramesSet) {
900 // TODO: To reduce verbosity, should we change the existing message
901 // so that we emit a list of matched frame counts in a single message
902 // about the context (instead of one message per frame count?
903 errs() << "MemProf " << getAllocTypeAttributeString(Type: Info.AllocType)
904 << " context with id " << Id << " has total profiled size "
905 << Info.TotalSize << " is matched with " << Frames << " frames";
906 if (PrintMatchedAllocStack) {
907 errs() << " and call stack";
908 for (auto &F : Info.CallStack)
909 errs() << " " << computeStackId(Frame: F);
910 }
911 errs() << "\n";
912 }
913 }
914
915 for (const auto &CallStack : MatchedCallSites) {
916 errs() << "MemProf callsite match for inline call stack";
917 for (uint64_t StackId : CallStack)
918 errs() << " " << StackId;
919 errs() << "\n";
920 }
921 }
922
923 return PreservedAnalyses::none();
924}
925
926bool MemProfUsePass::annotateGlobalVariables(
927 Module &M, const memprof::DataAccessProfData *DataAccessProf) {
928 if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
929 return false;
930
931 if (!DataAccessProf) {
932 M.addModuleFlag(Behavior: Module::Warning, Key: "EnableDataAccessProf", Val: 0U);
933 // FIXME: Add a diagnostic message without failing the compilation when
934 // data access profile payload is not available.
935 return false;
936 }
937 M.addModuleFlag(Behavior: Module::Warning, Key: "EnableDataAccessProf", Val: 1U);
938
939 bool Changed = false;
940 // Iterate all global variables in the module and annotate them based on
941 // data access profiles. Note it's up to the linker to decide how to map input
942 // sections to output sections, and one conservative practice is to map
943 // unlikely-prefixed ones to unlikely output section, and map the rest
944 // (hot-prefixed or prefix-less) to the canonical output section.
945 for (GlobalVariable &GVar : M.globals()) {
946 assert(!GVar.getSectionPrefix().has_value() &&
947 "GVar shouldn't have section prefix yet");
948 auto Kind = llvm::memprof::getAnnotationKind(GV: GVar);
949 if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) {
950 HandleUnsupportedAnnotationKinds(GVar, Kind);
951 continue;
952 }
953
954 StringRef Name = GVar.getName();
955 SymbolHandleRef Handle = SymbolHandleRef(Name);
956 // Skip string literals as their mangled names don't stay stable across
957 // binary releases.
958 if (!AnnotateStringLiteralSectionPrefix)
959 if (Name.starts_with(Prefix: ".str"))
960 continue;
961
962 if (Name.starts_with(Prefix: ".str")) {
963 std::optional<uint64_t> Hash = getStringContentHash(GVar);
964 if (!Hash) {
965 LLVM_DEBUG(dbgs() << "Cannot compute content hash for string literal "
966 << Name << "\n");
967 continue;
968 }
969 Handle = SymbolHandleRef(Hash.value());
970 }
971
972 // DataAccessProfRecord's get* methods will canonicalize the name under the
973 // hood before looking it up, so optimizer doesn't need to do it.
974 std::optional<DataAccessProfRecord> Record =
975 DataAccessProf->getProfileRecord(SymID: Handle);
976 // Annotate a global variable as hot if it has non-zero sampled count, and
977 // annotate it as cold if it's seen in the profiled binary
978 // file but doesn't have any access sample.
979 // For logging, optimization remark emitter requires a llvm::Function, but
980 // it's not well defined how to associate a global variable with a function.
981 // So we just print out the static data section prefix in LLVM_DEBUG.
982 if (Record && Record->AccessCount > 0) {
983 ++NumOfMemProfHotGlobalVars;
984 Changed |= GVar.setSectionPrefix("hot");
985 LLVM_DEBUG(dbgs() << "Global variable " << Name
986 << " is annotated as hot\n");
987 } else if (DataAccessProf->isKnownColdSymbol(SymID: Handle)) {
988 ++NumOfMemProfColdGlobalVars;
989 Changed |= GVar.setSectionPrefix("unlikely");
990 Changed = true;
991 LLVM_DEBUG(dbgs() << "Global variable " << Name
992 << " is annotated as unlikely\n");
993 } else {
994 ++NumOfMemProfUnknownGlobalVars;
995 LLVM_DEBUG(dbgs() << "Global variable " << Name << " is not annotated\n");
996 }
997 }
998
999 return Changed;
1000}
1001