//===- MemProfUse.cpp - memory allocation profile use pass --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the MemProfUsePass which reads memory profiling data
// and uses it to add metadata to instructions to guide optimization.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Instrumentation/MemProfUse.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/StaticDataProfileInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/ProfileData/DataAccessProf.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/ProfileData/MemProfCommon.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/HashBuilder.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Transforms/Utils/LongestCommonSequence.h"
#include <map>
#include <set>

using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof"

namespace llvm {
extern cl::opt<bool> PGOWarnMissing;
extern cl::opt<bool> NoPGOWarnMismatch;
extern cl::opt<bool> NoPGOWarnMismatchComdatWeak;
} // namespace llvm

// By default disable matching of allocation profiles onto operator new that
// already explicitly pass a hot/cold hint, since we don't currently
// override these hints anyway.
static cl::opt<bool> ClMemProfMatchHotColdNew(
    "memprof-match-hot-cold-new",
    cl::desc(
        "Match allocation profiles onto existing hot/cold operator new calls"),
    cl::Hidden, cl::init(false));

static cl::opt<bool>
    ClPrintMemProfMatchInfo("memprof-print-match-info",
                            cl::desc("Print matching stats for each allocation "
                                     "context in this module's profiles"),
                            cl::Hidden, cl::init(false));

static cl::opt<bool> PrintMatchedAllocStack(
    "memprof-print-matched-alloc-stack",
    cl::desc("Print full stack context for matched "
             "allocations with -memprof-print-match-info."),
    cl::Hidden, cl::init(false));

static cl::opt<bool>
    PrintFunctionGuids("memprof-print-function-guids",
                       cl::desc("Print function GUIDs computed for matching"),
                       cl::Hidden, cl::init(false));

static cl::opt<bool>
    SalvageStaleProfile("memprof-salvage-stale-profile",
                        cl::desc("Salvage stale MemProf profile"),
                        cl::init(false), cl::Hidden);

static cl::opt<bool> ClMemProfAttachCalleeGuids(
    "memprof-attach-calleeguids",
    cl::desc(
        "Attach calleeguids as value profile metadata for indirect calls."),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> MinMatchedColdBytePercent(
    "memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
    cl::desc("Min percent of cold bytes matched to hint allocation cold"));

static cl::opt<bool> AnnotateStaticDataSectionPrefix(
    "memprof-annotate-static-data-prefix", cl::init(false), cl::Hidden,
    cl::desc("If true, annotate the static data section prefix"));

// FIXME: This option is added for incremental rollout purposes. Once rollout
// completes, string literal partitioning should be implied by
// AnnotateStaticDataSectionPrefix above and this option should be cleaned up.
static cl::opt<bool> AnnotateStringLiteralSectionPrefix(
    "memprof-annotate-string-literal-section-prefix", cl::init(false),
    cl::Hidden,
    cl::desc("If true, annotate the string literal data section prefix"));

// Matching statistics
STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
STATISTIC(NumOfMemProfMismatch,
          "Number of functions having mismatched memory profile hash.");
STATISTIC(NumOfMemProfFunc,
          "Number of functions having valid memory profile.");
STATISTIC(NumOfMemProfAllocContextProfiles,
          "Number of alloc contexts in memory profile.");
STATISTIC(NumOfMemProfCallSiteProfiles,
          "Number of callsites in memory profile.");
STATISTIC(NumOfMemProfMatchedAllocContexts,
          "Number of matched memory profile alloc contexts.");
STATISTIC(NumOfMemProfMatchedAllocs,
          "Number of matched memory profile allocs.");
STATISTIC(NumOfMemProfMatchedCallSites,
          "Number of matched memory profile callsites.");
STATISTIC(NumOfMemProfHotGlobalVars,
          "Number of global vars annotated with 'hot' section prefix.");
STATISTIC(NumOfMemProfColdGlobalVars,
          "Number of global vars annotated with 'unlikely' section prefix.");
STATISTIC(NumOfMemProfUnknownGlobalVars,
          "Number of global vars with unknown hotness (no section prefix).");
STATISTIC(NumOfMemProfExplicitSectionGlobalVars,
          "Number of global vars with user-specified section (not annotated).");

static void addCallsiteMetadata(Instruction &I,
                                ArrayRef<uint64_t> InlinedCallStack,
                                LLVMContext &Ctx) {
  I.setMetadata(LLVMContext::MD_callsite,
                buildCallstackMetadata(InlinedCallStack, Ctx));
}

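// Hash a frame's (function GUID, line offset, column) tuple into a stable
// 64-bit stack frame id. Both IR debug locations and profile Frames are
// hashed through this helper, so ids from the two sources can be compared
// directly during matching.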
static uint64_t computeStackId(GlobalValue::GUID Function, uint32_t LineOffset,
                               uint32_t Column) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  HashBuilder.add(Function, LineOffset, Column);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  uint64_t Id;
  std::memcpy(&Id, Hash.data(), sizeof(Hash));
  return Id;
}

static uint64_t computeStackId(const memprof::Frame &Frame) {
  return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
}

static AllocationType getAllocType(const AllocationInfo *AllocInfo) {
  return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
                      AllocInfo->Info.getAllocCount(),
                      AllocInfo->Info.getTotalLifetime());
}

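// Add this allocation context's call stack to the trie, along with the
// allocation type deduced from the profile counters and, when context size
// reporting is enabled, its full stack id and total profiled size. Returns
// the deduced allocation type.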
static AllocationType addCallStack(CallStackTrie &AllocTrie,
                                   const AllocationInfo *AllocInfo,
                                   uint64_t FullStackId) {
  SmallVector<uint64_t> StackIds;
  for (const auto &StackFrame : AllocInfo->CallStack)
    StackIds.push_back(computeStackId(StackFrame));
  auto AllocType = getAllocType(AllocInfo);
  std::vector<ContextTotalSize> ContextSizeInfo;
  if (recordContextSizeInfoForAnalysis()) {
    auto TotalSize = AllocInfo->Info.getTotalSize();
    assert(TotalSize);
    assert(FullStackId != 0);
    ContextSizeInfo.push_back({FullStackId, TotalSize});
  }
  AllocTrie.addCallStack(AllocType, StackIds, std::move(ContextSizeInfo));
  return AllocType;
}

// Return true if InlinedCallStack, computed from a call instruction's debug
// info, is a prefix of ProfileCallStack, a list of Frames from profile data
// (either the allocation data or a callsite).
static bool
stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack,
                                   ArrayRef<uint64_t> InlinedCallStack) {
  return ProfileCallStack.size() >= InlinedCallStack.size() &&
         llvm::equal(ProfileCallStack.take_front(InlinedCallStack.size()),
                     InlinedCallStack, [](const Frame &F, uint64_t StackId) {
                       return computeStackId(F) == StackId;
                     });
}

static bool isAllocationWithHotColdVariant(const Function *Callee,
                                           const TargetLibraryInfo &TLI) {
  if (!Callee)
    return false;
  LibFunc Func;
  if (!TLI.getLibFunc(*Callee, Func))
    return false;
  switch (Func) {
  case LibFunc_Znwm:
  case LibFunc_ZnwmRKSt9nothrow_t:
  case LibFunc_ZnwmSt11align_val_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
  case LibFunc_Znam:
  case LibFunc_ZnamRKSt9nothrow_t:
  case LibFunc_ZnamSt11align_val_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
  case LibFunc_size_returning_new:
  case LibFunc_size_returning_new_aligned:
    return true;
  case LibFunc_Znwm12__hot_cold_t:
  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_Znam12__hot_cold_t:
  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_size_returning_new_hot_cold:
  case LibFunc_size_returning_new_aligned_hot_cold:
    return ClMemProfMatchHotColdNew;
  default:
    return false;
  }
}

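// Record (via debug output and, where applicable, statistics) why the given
// global variable is skipped for section prefix annotation.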
static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
                                             AnnotationKind Kind) {
  assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK &&
         "Should not handle AnnotationOK here");
  SmallString<32> Reason;
  switch (Kind) {
  case llvm::memprof::AnnotationKind::ExplicitSection:
    ++NumOfMemProfExplicitSectionGlobalVars;
    Reason.append("explicit section name");
    break;
  case llvm::memprof::AnnotationKind::DeclForLinker:
    Reason.append("linker declaration");
    break;
  case llvm::memprof::AnnotationKind::ReservedName:
    Reason.append("name starts with `llvm.`");
    break;
  default:
    llvm_unreachable("Unexpected annotation kind");
  }
  LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
                    << Reason << ".\n");
}

// Computes the LLVM version of the MD5 hash for the content of a string
// literal.
static std::optional<uint64_t>
getStringContentHash(const GlobalVariable &GVar) {
  auto *Initializer = GVar.getInitializer();
  if (!Initializer)
    return std::nullopt;
  if (auto *C = dyn_cast<ConstantDataSequential>(Initializer))
    if (C->isString()) {
      // Note the hash computed for the literal would include the null byte.
      return llvm::MD5Hash(C->getAsString());
    }
  return std::nullopt;
}

// Structure for tracking info about matched allocation contexts for use with
// -memprof-print-match-info and -memprof-print-matched-alloc-stack.
struct AllocMatchInfo {
  // Total size in bytes of the matched context.
  uint64_t TotalSize = 0;
  // Matched allocation's type.
  AllocationType AllocType = AllocationType::None;
  // Number of frames matched to the allocation itself (values will be >1 in
  // cases where the allocation was already inlined). Use std::set because
  // there can be multiple inlined instances, each possibly with a different
  // inline depth, and we want to iterate in sorted order when printing.
  std::set<unsigned> MatchedFramesSet;
  // The full call stack of the allocation, recorded when requested via
  // -memprof-print-matched-alloc-stack.
  std::vector<Frame> CallStack;

  // The caller is responsible for inserting the matched frames and the call
  // stack when appropriate.
  AllocMatchInfo(uint64_t TotalSize, AllocationType AllocType)
      : TotalSize(TotalSize), AllocType(AllocType) {}
};

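// Extract from the IR, for each caller GUID, the sorted and deduplicated list
// of (source location, callee GUID) pairs for all direct calls, walking
// inlined debug frames so that calls inlined into a caller are attributed to
// it. Within an inline stack leading to a matched heap allocation call,
// callees not present in the profile are recorded with GUID 0 (see the
// comments on IsAlloc below).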
DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>>
memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI,
                            function_ref<bool(uint64_t)> IsPresentInProfile) {
  DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> Calls;

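  // Compute the line offset of a debug location from the start of its
  // enclosing subprogram, masked to 16 bits (presumably to stay consistent
  // with the 16-bit line offsets used in profile line locations).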
  auto GetOffset = [](const DILocation *DIL) {
    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
           0xffff;
  };

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
          continue;

        auto *CB = dyn_cast<CallBase>(&I);
        auto *CalledFunction = CB->getCalledFunction();
        // Disregard indirect calls and intrinsics.
        if (!CalledFunction || CalledFunction->isIntrinsic())
          continue;

        StringRef CalleeName = CalledFunction->getName();
        // True if we are calling a heap allocation function that supports
        // hot/cold variants.
        bool IsAlloc = isAllocationWithHotColdVariant(CalledFunction, TLI);
        // True for the first iteration below, indicating that we are looking
        // at a leaf node.
        bool IsLeaf = true;
        for (const DILocation *DIL = I.getDebugLoc(); DIL;
             DIL = DIL->getInlinedAt()) {
          StringRef CallerName = DIL->getSubprogramLinkageName();
          assert(!CallerName.empty() &&
                 "Be sure to enable -fdebug-info-for-profiling");
          uint64_t CallerGUID = memprof::getGUID(CallerName);
          uint64_t CalleeGUID = memprof::getGUID(CalleeName);
          // Pretend that we are calling a function with GUID == 0 if we are
          // in the inline stack leading to a heap allocation function.
          if (IsAlloc) {
            if (IsLeaf) {
              // For leaf nodes, set CalleeGUID to 0 without consulting
              // IsPresentInProfile.
              CalleeGUID = 0;
            } else if (!IsPresentInProfile(CalleeGUID)) {
              // In addition to the leaf case above, continue to set CalleeGUID
              // to 0 as long as we don't see CalleeGUID in the profile.
              CalleeGUID = 0;
            } else {
              // Once we encounter a callee that exists in the profile, stop
              // setting CalleeGUID to 0.
              IsAlloc = false;
            }
          }

          LineLocation Loc = {GetOffset(DIL), DIL->getColumn()};
          Calls[CallerGUID].emplace_back(Loc, CalleeGUID);
          CalleeName = CallerName;
          IsLeaf = false;
        }
      }
    }
  }

  // Sort each call list by source location and remove duplicate entries.
  for (auto &[CallerGUID, CallList] : Calls) {
    llvm::sort(CallList);
    CallList.erase(llvm::unique(CallList), CallList.end());
  }

  return Calls;
}

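// For each caller GUID present in both the profile and the IR, compute a map
// from profile-time source locations to current IR locations by running a
// longest common subsequence matching over the two sorted anchor lists.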
DenseMap<uint64_t, LocToLocMap>
memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
                           const TargetLibraryInfo &TLI) {
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;

  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile =
      MemProfReader->getMemProfCallerCalleePairs();
  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR =
      extractCallsFromIR(M, TLI, [&](uint64_t GUID) {
        return CallsFromProfile.contains(GUID);
      });

  // Compute an undrift map for each CallerGUID.
  for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) {
    auto It = CallsFromProfile.find(CallerGUID);
    if (It == CallsFromProfile.end())
      continue;
    const auto &ProfileAnchors = It->second;

    LocToLocMap Matchings;
    longestCommonSequence<LineLocation, GlobalValue::GUID>(
        ProfileAnchors, IRAnchors, std::equal_to<GlobalValue::GUID>(),
        [&](LineLocation A, LineLocation B) { Matchings.try_emplace(A, B); });
    [[maybe_unused]] bool Inserted =
        UndriftMaps.try_emplace(CallerGUID, std::move(Matchings)).second;

    // The insertion must succeed because we visit each GUID exactly once.
    assert(Inserted);
  }

  return UndriftMaps;
}

// Given a MemProfRecord, undrift all the source locations present in the
// record in place.
static void
undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
                     memprof::MemProfRecord &MemProfRec) {
  // Undrift a call stack in place.
  auto UndriftCallStack = [&](std::vector<Frame> &CallStack) {
    for (auto &F : CallStack) {
      auto I = UndriftMaps.find(F.Function);
      if (I == UndriftMaps.end())
        continue;
      auto J = I->second.find(LineLocation(F.LineOffset, F.Column));
      if (J == I->second.end())
        continue;
      auto &NewLoc = J->second;
      F.LineOffset = NewLoc.LineOffset;
      F.Column = NewLoc.Column;
    }
  };

  for (auto &AS : MemProfRec.AllocSites)
    UndriftCallStack(AS.CallStack);

  for (auto &CS : MemProfRec.CallSites)
    UndriftCallStack(CS.Frames);
}

// Helper function to process CalleeGuids and create value profile metadata.
static void addVPMetadata(Module &M, Instruction &I,
                          ArrayRef<GlobalValue::GUID> CalleeGuids) {
  if (!ClMemProfAttachCalleeGuids || CalleeGuids.empty())
    return;

  // Prepare the vector of value data, initializing from any existing
  // value-profile metadata present on the instruction so that we merge the
  // new CalleeGuids into the existing entries.
  SmallVector<InstrProfValueData> VDs;
  uint64_t TotalCount = 0;

  if (I.getMetadata(LLVMContext::MD_prof)) {
    // Read all existing entries so we can merge them. Use a large
    // MaxNumValueData to retrieve all existing entries.
    VDs = getValueProfDataFromInst(I, IPVK_IndirectCallTarget,
                                   /*MaxNumValueData=*/UINT32_MAX, TotalCount);
  }

  // Save the original size for use later in detecting whether any were added.
  const size_t OriginalSize = VDs.size();

  // Initialize the set of existing guids with the original list.
  DenseSet<uint64_t> ExistingValues(
      llvm::from_range,
      llvm::map_range(
          VDs, [](const InstrProfValueData &Entry) { return Entry.Value; }));

  // Merge CalleeGuids into the list of existing VDs, appending any that are
  // not already included.
  VDs.reserve(OriginalSize + CalleeGuids.size());
  for (auto G : CalleeGuids) {
    if (!ExistingValues.insert(G).second)
      continue;
    InstrProfValueData NewEntry;
    NewEntry.Value = G;
    // For MemProf, we don't have actual call counts, so we assign
    // a weight of 1 to each potential target.
    // TODO: Consider making this weight configurable or increasing it to
    // improve effectiveness for ICP.
    NewEntry.Count = 1;
    TotalCount += NewEntry.Count;
    VDs.push_back(NewEntry);
  }

  // Update the VP metadata only if we added any new callee GUIDs to the list.
  assert(VDs.size() >= OriginalSize);
  if (VDs.size() == OriginalSize)
    return;

  // First clear the existing !prof.
  I.setMetadata(LLVMContext::MD_prof, nullptr);

  // No need to sort the updated VDs as all appended entries have the same
  // count of 1, which is no larger than any existing entries. The incoming
  // list of CalleeGuids should already be deterministic for a given profile.
  annotateValueSite(M, I, VDs, TotalCount, IPVK_IndirectCallTarget, VDs.size());
}

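// Match an allocation call against the profiled allocation contexts that
// share its leaf stack frame, accumulate the matching contexts in a
// CallStackTrie, and attach the resulting !memprof metadata (or a single
// hot/cold attribute when all matched contexts behave identically).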
static void handleAllocSite(
    Instruction &I, CallBase *CI, ArrayRef<uint64_t> InlinedCallStack,
    LLVMContext &Ctx, OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize,
    const std::set<const AllocationInfo *> &AllocInfoSet,
    std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo) {
  // TODO: Remove this once the profile creation logic deduplicates contexts
  // that are the same other than the IsInlineFrame bool. Until then, keep the
  // largest.
  DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo;
  for (auto *AllocInfo : AllocInfoSet) {
    auto FullStackId = computeFullStackId(AllocInfo->CallStack);
    auto [It, Inserted] =
        UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo});
    // If we inserted a new entry, we are done.
    if (Inserted)
      continue;
    // Keep the larger one, or the noncold one if they are the same size.
    auto CurSize = It->second->Info.getTotalSize();
    auto NewSize = AllocInfo->Info.getTotalSize();
    if ((CurSize > NewSize) ||
        (CurSize == NewSize &&
         getAllocType(AllocInfo) != AllocationType::NotCold))
      continue;
    It->second = AllocInfo;
  }
  // We may match this instruction's location list to multiple MIB
  // contexts. Add them to a Trie specialized for trimming the contexts to
  // the minimal needed to disambiguate contexts with unique behavior.
  CallStackTrie AllocTrie(&ORE, MaxColdSize);
  uint64_t TotalSize = 0;
  uint64_t TotalColdSize = 0;
  for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) {
    // Check the full inlined call stack against this one.
    // If we found and thus matched all frames on the call, include
    // this MIB.
    if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                           InlinedCallStack)) {
      NumOfMemProfMatchedAllocContexts++;
      auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
      TotalSize += AllocInfo->Info.getTotalSize();
      if (AllocType == AllocationType::Cold)
        TotalColdSize += AllocInfo->Info.getTotalSize();
      // Record information about the allocation if match info printing
      // was requested.
      if (ClPrintMemProfMatchInfo) {
        assert(FullStackId != 0);
        auto [Iter, Inserted] = FullStackIdToAllocMatchInfo.try_emplace(
            FullStackId,
            AllocMatchInfo(AllocInfo->Info.getTotalSize(), AllocType));
        // Always insert the new matched frame count, since it may differ.
        Iter->second.MatchedFramesSet.insert(InlinedCallStack.size());
        if (Inserted && PrintMatchedAllocStack)
          Iter->second.CallStack.insert(Iter->second.CallStack.begin(),
                                        AllocInfo->CallStack.begin(),
                                        AllocInfo->CallStack.end());
      }
      ORE.emit(
          OptimizationRemark(DEBUG_TYPE, "MemProfUse", CI)
          << ore::NV("AllocationCall", CI) << " in function "
          << ore::NV("Caller", CI->getFunction())
          << " matched alloc context with alloc type "
          << ore::NV("Attribute", getAllocTypeAttributeString(AllocType))
          << " total size " << ore::NV("Size", AllocInfo->Info.getTotalSize())
          << " full context id " << ore::NV("Context", FullStackId)
          << " frame count " << ore::NV("Frames", InlinedCallStack.size()));
    }
  }
  // If the threshold for the percent of cold bytes is less than 100%,
  // and not all bytes are cold, see if we should still hint this
  // allocation as cold without context sensitivity.
  if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
      TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
    AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold, "dominant");
    return;
  }

  // We might not have matched any to the full inlined call stack.
  // But if we did, create and attach metadata, or a function attribute if
  // all contexts have identical profiled behavior.
  if (!AllocTrie.empty()) {
    NumOfMemProfMatchedAllocs++;
    // MemprofMDAttached will be false if a function attribute was
    // attached instead.
    bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
    assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
    if (MemprofMDAttached) {
      // Add callsite metadata for the instruction's location list so that
      // it is simpler later on to identify which part of the MIB contexts
      // are from this particular instruction (including during inlining,
      // when the callsite metadata will be updated appropriately).
      // FIXME: can this be changed to strip out the matching stack
      // context ids from the MIB contexts and not add any callsite
      // metadata here to save space?
      addCallsiteMetadata(I, InlinedCallStack, Ctx);
    }
  }
}

// Helper struct for maintaining refs to callsite data. As an alternative we
// could store a pointer to the CallSiteInfo struct, but we also need the frame
// index. Using ArrayRefs instead makes it a little easier to read.
struct CallSiteEntry {
  // Subset of frames for the corresponding CallSiteInfo.
  ArrayRef<Frame> Frames;
  // Potential targets for indirect calls.
  ArrayRef<GlobalValue::GUID> CalleeGuids;
};

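// Attach !callsite metadata to a call whose inlined frames match one of the
// profiled call site entries with the same leaf location, and, for indirect
// calls, accumulate the callee GUIDs from all matching entries so they can
// be attached as value profile metadata.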
static void handleCallSite(Instruction &I, const Function *CalledFunction,
                           ArrayRef<uint64_t> InlinedCallStack,
                           const std::vector<CallSiteEntry> &CallSiteEntries,
                           Module &M,
                           std::set<std::vector<uint64_t>> &MatchedCallSites,
                           OptimizationRemarkEmitter &ORE) {
  auto &Ctx = M.getContext();
  // Set of Callee GUIDs to attach to indirect calls. We accumulate all of them
  // to support cases where the instruction's inlined frames match multiple
  // call site entries, which can happen if the profile was collected from a
  // binary where this instruction was eventually inlined into multiple
  // callers.
  SetVector<GlobalValue::GUID> CalleeGuids;
  bool CallsiteMDAdded = false;
  for (const auto &CallSiteEntry : CallSiteEntries) {
    // If we found and thus matched all frames on the call, create and
    // attach call stack metadata.
    if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
                                           InlinedCallStack)) {
      NumOfMemProfMatchedCallSites++;
      // Only need to find one with a matching call stack and add a single
      // callsite metadata.
      if (!CallsiteMDAdded) {
        addCallsiteMetadata(I, InlinedCallStack, Ctx);

        // Accumulate call site matching information upon request.
        if (ClPrintMemProfMatchInfo) {
          std::vector<uint64_t> CallStack;
          append_range(CallStack, InlinedCallStack);
          MatchedCallSites.insert(std::move(CallStack));
        }
        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemProfUse", &I)
                 << ore::NV("CallSite", &I) << " in function "
                 << ore::NV("Caller", I.getFunction())
                 << " matched callsite with frame count "
                 << ore::NV("Frames", InlinedCallStack.size()));

        // If this is a direct call, we're done.
        if (CalledFunction)
          break;
        CallsiteMDAdded = true;
      }

      assert(!CalledFunction && "Didn't expect direct call");

      // Collect Callee GUIDs from all matching CallSiteEntries.
      CalleeGuids.insert(CallSiteEntry.CalleeGuids.begin(),
                         CallSiteEntry.CalleeGuids.end());
    }
  }
  // Try to attach indirect call metadata if possible.
  addVPMetadata(M, I, CalleeGuids.getArrayRef());
}

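// Look up the memory profile record for function F and use it to annotate
// F's allocation calls with !memprof metadata and its interior call sites
// with !callsite metadata, matching profiled call stacks against the inlined
// debug locations in the IR.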
static void
readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
            const TargetLibraryInfo &TLI,
            std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
            std::set<std::vector<uint64_t>> &MatchedCallSites,
            DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
            OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) {
  auto &Ctx = M.getContext();
  // Previously we used getIRPGOFuncName() here. If F has local linkage,
  // getIRPGOFuncName() returns FuncName with a 'FileName;' prefix. But
  // llvm-profdata uses the FuncName in dwarf to create the GUID, which
  // doesn't contain the FileName prefix, so functions with local linkage
  // couldn't find their MemProfRecord. So we use getName() now.
  // 'unique-internal-linkage-names' can make MemProf work better for
  // functions with local linkage.
  auto FuncName = F.getName();
  auto FuncGUID = Function::getGUIDAssumingExternalLinkage(FuncName);
  if (PrintFunctionGuids)
    errs() << "MemProf: Function GUID " << FuncGUID << " is " << FuncName
           << "\n";
  std::optional<memprof::MemProfRecord> MemProfRec;
  auto Err = MemProfReader->getMemProfRecord(FuncGUID).moveInto(MemProfRec);
  if (Err) {
    handleAllErrors(std::move(Err), [&](const InstrProfError &IPE) {
      auto Err = IPE.get();
      bool SkipWarning = false;
      LLVM_DEBUG(dbgs() << "Error in reading profile for Func " << FuncName
                        << ": ");
      if (Err == instrprof_error::unknown_function) {
        NumOfMemProfMissing++;
        SkipWarning = !PGOWarnMissing;
        LLVM_DEBUG(dbgs() << "unknown function");
      } else if (Err == instrprof_error::hash_mismatch) {
        NumOfMemProfMismatch++;
        SkipWarning =
            NoPGOWarnMismatch ||
            (NoPGOWarnMismatchComdatWeak &&
             (F.hasComdat() ||
              F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
        LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
      }

      if (SkipWarning)
        return;

      std::string Msg = (IPE.message() + Twine(" ") + F.getName().str() +
                         Twine(" Hash = ") + std::to_string(FuncGUID))
                            .str();

      Ctx.diagnose(
          DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
    });
    return;
  }

  NumOfMemProfFunc++;

  // If requested, undrift MemProfRecord so that the source locations in it
  // match those in the IR.
  if (SalvageStaleProfile)
    undriftMemProfRecord(UndriftMaps, *MemProfRec);

  // Detect if there are non-zero column numbers in the profile. If not,
  // treat all column numbers as 0 when matching (i.e. ignore any non-zero
  // columns in the IR). The profiled binary might have been built with
  // column numbers disabled, for example.
  bool ProfileHasColumns = false;

  // Build maps from the location hash to all profile data with that leaf
  // location (allocation info and the callsites).
  std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;

  // For the callsites we need to record slices of the frame array (see
  // comments below where the map entries are added) along with their
  // CalleeGuids.
  std::map<uint64_t, std::vector<CallSiteEntry>> LocHashToCallSites;
  for (auto &AI : MemProfRec->AllocSites) {
    NumOfMemProfAllocContextProfiles++;
    // Associate the allocation info with the leaf frame. The later matching
    // code will match any inlined call sequences in the IR with a longer
    // prefix of call stack frames.
    uint64_t StackId = computeStackId(AI.CallStack[0]);
    LocHashToAllocInfo[StackId].insert(&AI);
    ProfileHasColumns |= AI.CallStack[0].Column;
  }
  for (auto &CS : MemProfRec->CallSites) {
    NumOfMemProfCallSiteProfiles++;
    // Need to record all frames from leaf up to and including this function,
    // as any of these may or may not have been inlined at this point.
    unsigned Idx = 0;
    for (auto &StackFrame : CS.Frames) {
      uint64_t StackId = computeStackId(StackFrame);
      ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames).drop_front(Idx++);
      // The callee guids for the slice containing all frames (Idx is now 1
      // due to the increment above) come from the CalleeGuids recorded in
      // the CallSite. For the slices not containing the leaf-most frame, the
      // callee guid is simply the function GUID of the prior frame.
      LocHashToCallSites[StackId].push_back(
          {FrameSlice, (Idx == 1 ? CS.CalleeGuids
                                 : ArrayRef<GlobalValue::GUID>(
                                       CS.Frames[Idx - 2].Function))});

      ProfileHasColumns |= StackFrame.Column;
      // Once we find this function, we can stop recording.
      if (StackFrame.Function == FuncGUID)
        break;
    }
    assert(Idx <= CS.Frames.size() && CS.Frames[Idx - 1].Function == FuncGUID);
  }

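  // Same line offset computation as in extractCallsFromIR above.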
  auto GetOffset = [](const DILocation *DIL) {
    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
           0xffff;
  };

  // Now walk the instructions, looking up the associated profile data using
  // debug locations.
  for (auto &BB : F) {
    for (auto &I : BB) {
      if (I.isDebugOrPseudoInst())
        continue;
      // We are only interested in calls (allocation or interior call stack
      // context calls).
      auto *CI = dyn_cast<CallBase>(&I);
      if (!CI)
        continue;
      auto *CalledFunction = CI->getCalledFunction();
      if (CalledFunction && CalledFunction->isIntrinsic())
        continue;
      // List of call stack ids computed from the location hashes on debug
      // locations (leaf to inlined at root).
      SmallVector<uint64_t, 8> InlinedCallStack;
      // Was the leaf location found in one of the profile maps?
      bool LeafFound = false;
      // If the leaf was found in a map, iterators pointing to its location in
      // both of the maps. It might exist in neither, one, or both (the latter
      // case can happen because we don't currently have discriminators to
      // distinguish the case when a single line/col maps to both an
      // allocation and another callsite).
      auto AllocInfoIter = LocHashToAllocInfo.end();
      auto CallSitesIter = LocHashToCallSites.end();
      for (const DILocation *DIL = I.getDebugLoc(); DIL != nullptr;
           DIL = DIL->getInlinedAt()) {
        // Use the C++ linkage name if possible. Need to compile with
        // -fdebug-info-for-profiling to get the linkage name.
        StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
        if (Name.empty())
          Name = DIL->getScope()->getSubprogram()->getName();
        auto CalleeGUID = Function::getGUIDAssumingExternalLinkage(Name);
        auto StackId = computeStackId(CalleeGUID, GetOffset(DIL),
                                      ProfileHasColumns ? DIL->getColumn() : 0);
        // Check if we have found the profile's leaf frame. If yes, collect
        // the rest of the call's inlined context starting here. If not, see if
        // we find a match further up the inlined context (in case the profile
        // was missing debug frames at the leaf).
        if (!LeafFound) {
          AllocInfoIter = LocHashToAllocInfo.find(StackId);
          CallSitesIter = LocHashToCallSites.find(StackId);
          if (AllocInfoIter != LocHashToAllocInfo.end() ||
              CallSitesIter != LocHashToCallSites.end())
            LeafFound = true;
        }
        if (LeafFound)
          InlinedCallStack.push_back(StackId);
      }
      // If the leaf is not in either of the maps, skip this instruction.
      if (!LeafFound)
        continue;

      // First add !memprof metadata from the allocation info, if we found the
      // instruction's leaf location in that map, and if the rest of the
      // instruction's locations match the prefix Frame locations on an
      // allocation context with the same leaf.
      if (AllocInfoIter != LocHashToAllocInfo.end() &&
          // Only consider allocations which support hinting.
          isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI))
        handleAllocSite(I, CI, InlinedCallStack, Ctx, ORE, MaxColdSize,
                        AllocInfoIter->second, FullStackIdToAllocMatchInfo);
      else if (CallSitesIter != LocHashToCallSites.end())
        // Otherwise, add callsite metadata. If we reach here then we found the
        // instruction's leaf location in the callsites map and not the
        // allocation map.
        handleCallSite(I, CalledFunction, InlinedCallStack,
                       CallSitesIter->second, M, MatchedCallSites, ORE);
    }
  }
}

MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile,
                               IntrusiveRefCntPtr<vfs::FileSystem> FS)
    : MemoryProfileFileName(MemoryProfileFile), FS(FS) {
  if (!FS)
    this->FS = vfs::getRealFileSystem();
}

PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
  // Return immediately if the module doesn't contain any functions or global
  // variables.
  if (M.empty() && M.globals().empty())
    return PreservedAnalyses::all();

  LLVM_DEBUG(dbgs() << "Read in memory profile:\n");
  auto &Ctx = M.getContext();
  auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS);
  if (Error E = ReaderOrErr.takeError()) {
    handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
      Ctx.diagnose(
          DiagnosticInfoPGOProfile(MemoryProfileFileName.data(), EI.message()));
    });
    return PreservedAnalyses::all();
  }

  std::unique_ptr<IndexedInstrProfReader> MemProfReader =
      std::move(ReaderOrErr.get());
  if (!MemProfReader) {
    Ctx.diagnose(DiagnosticInfoPGOProfile(
        MemoryProfileFileName.data(), StringRef("Cannot get MemProfReader")));
    return PreservedAnalyses::all();
  }

  if (!MemProfReader->hasMemoryProfile()) {
    Ctx.diagnose(DiagnosticInfoPGOProfile(MemoryProfileFileName.data(),
                                          "Not a memory profile"));
    return PreservedAnalyses::all();
  }

  const bool Changed =
      annotateGlobalVariables(M, MemProfReader->getDataAccessProfileData());

  // If the module doesn't contain any functions, return after we process all
  // global variables.
  if (M.empty())
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();


  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin());
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;
  if (SalvageStaleProfile)
    UndriftMaps = computeUndriftMap(M, MemProfReader.get(), TLI);

  // Map from the stack hash of each matched allocation context in the
  // function profiles to match info such as the total profiled size (bytes),
  // allocation type, number of frames matched to the allocation itself, and
  // the full array of call stack ids.
  std::map<uint64_t, AllocMatchInfo> FullStackIdToAllocMatchInfo;

  // Set of the matched call sites, each expressed as a sequence of an inline
  // call stack.
  std::set<std::vector<uint64_t>> MatchedCallSites;

  uint64_t MaxColdSize = 0;
  if (auto *MemProfSum = MemProfReader->getMemProfSummary())
    MaxColdSize = MemProfSum->getMaxColdTotalSize();

  for (auto &F : M) {
    if (F.isDeclaration())
      continue;

    const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
    readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
                MatchedCallSites, UndriftMaps, ORE, MaxColdSize);
  }

  if (ClPrintMemProfMatchInfo) {
    for (const auto &[Id, Info] : FullStackIdToAllocMatchInfo) {
      for (auto Frames : Info.MatchedFramesSet) {
        // TODO: To reduce verbosity, should we change the existing message
        // so that we emit a list of matched frame counts in a single message
        // about the context (instead of one message per frame count)?
        errs() << "MemProf " << getAllocTypeAttributeString(Info.AllocType)
               << " context with id " << Id << " has total profiled size "
               << Info.TotalSize << " is matched with " << Frames << " frames";
        if (PrintMatchedAllocStack) {
          errs() << " and call stack";
          for (auto &F : Info.CallStack)
            errs() << " " << computeStackId(F);
        }
        errs() << "\n";
      }
    }

    for (const auto &CallStack : MatchedCallSites) {
      errs() << "MemProf callsite match for inline call stack";
      for (uint64_t StackId : CallStack)
        errs() << " " << StackId;
      errs() << "\n";
    }
  }

  return PreservedAnalyses::none();
}

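// Annotate global variables with "hot"/"unlikely" section prefixes based on
// the module's data access profile. Returns true if any variable's section
// prefix was changed.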
bool MemProfUsePass::annotateGlobalVariables(
    Module &M, const memprof::DataAccessProfData *DataAccessProf) {
  if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
    return false;

  if (!DataAccessProf) {
    M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 0U);
    // FIXME: Add a diagnostic message without failing the compilation when
    // the data access profile payload is not available.
    return false;
  }
  M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 1U);

  bool Changed = false;
  // Iterate over all global variables in the module and annotate them based
  // on data access profiles. Note it's up to the linker to decide how to map
  // input sections to output sections, and one conservative practice is to
  // map unlikely-prefixed ones to an unlikely output section, and map the
  // rest (hot-prefixed or prefix-less) to the canonical output section.
  for (GlobalVariable &GVar : M.globals()) {
    assert(!GVar.getSectionPrefix().has_value() &&
           "GVar shouldn't have section prefix yet");
    auto Kind = llvm::memprof::getAnnotationKind(GVar);
    if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) {
      HandleUnsupportedAnnotationKinds(GVar, Kind);
      continue;
    }

    StringRef Name = GVar.getName();
    SymbolHandleRef Handle = SymbolHandleRef(Name);
    // Skip string literals as their mangled names don't stay stable across
    // binary releases.
    if (!AnnotateStringLiteralSectionPrefix)
      if (Name.starts_with(".str"))
        continue;

    if (Name.starts_with(".str")) {
      std::optional<uint64_t> Hash = getStringContentHash(GVar);
      if (!Hash) {
        LLVM_DEBUG(dbgs() << "Cannot compute content hash for string literal "
                          << Name << "\n");
        continue;
      }
      Handle = SymbolHandleRef(Hash.value());
    }

    // DataAccessProfRecord's get* methods will canonicalize the name under
    // the hood before looking it up, so the optimizer doesn't need to do it.
    std::optional<DataAccessProfRecord> Record =
        DataAccessProf->getProfileRecord(Handle);
    // Annotate a global variable as hot if it has a non-zero sampled count,
    // and annotate it as cold if it's seen in the profiled binary file but
    // doesn't have any access sample.
    // For logging, the optimization remark emitter requires an
    // llvm::Function, but it's not well defined how to associate a global
    // variable with a function. So we just print out the static data section
    // prefix in LLVM_DEBUG.
    if (Record && Record->AccessCount > 0) {
      ++NumOfMemProfHotGlobalVars;
      Changed |= GVar.setSectionPrefix("hot");
      LLVM_DEBUG(dbgs() << "Global variable " << Name
                        << " is annotated as hot\n");
    } else if (DataAccessProf->isKnownColdSymbol(Handle)) {
      ++NumOfMemProfColdGlobalVars;
      Changed |= GVar.setSectionPrefix("unlikely");
      LLVM_DEBUG(dbgs() << "Global variable " << Name
                        << " is annotated as unlikely\n");
    } else {
      ++NumOfMemProfUnknownGlobalVars;
      LLVM_DEBUG(dbgs() << "Global variable " << Name
                        << " is not annotated\n");
    }
  }

  return Changed;
}