//===- MemProfUse.cpp - memory allocation profile use pass ------*- C++ -*-===//
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the MemProfUsePass which reads memory profiling data |
10 | // and uses it to add metadata to instructions to guide optimization. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/Transforms/Instrumentation/MemProfUse.h" |
15 | #include "llvm/ADT/SmallVector.h" |
16 | #include "llvm/ADT/Statistic.h" |
17 | #include "llvm/ADT/StringRef.h" |
18 | #include "llvm/Analysis/MemoryProfileInfo.h" |
19 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
20 | #include "llvm/Analysis/TargetLibraryInfo.h" |
21 | #include "llvm/IR/DiagnosticInfo.h" |
22 | #include "llvm/IR/Function.h" |
23 | #include "llvm/IR/IntrinsicInst.h" |
24 | #include "llvm/IR/Module.h" |
25 | #include "llvm/ProfileData/InstrProf.h" |
26 | #include "llvm/ProfileData/InstrProfReader.h" |
27 | #include "llvm/ProfileData/MemProfCommon.h" |
28 | #include "llvm/Support/BLAKE3.h" |
29 | #include "llvm/Support/CommandLine.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include "llvm/Support/HashBuilder.h" |
32 | #include "llvm/Support/VirtualFileSystem.h" |
33 | #include "llvm/Transforms/Utils/LongestCommonSequence.h" |
#include <map>
#include <set>
#include <unordered_set>
36 | |
37 | using namespace llvm; |
38 | using namespace llvm::memprof; |
39 | |
40 | #define DEBUG_TYPE "memprof" |
41 | |
42 | namespace llvm { |
43 | extern cl::opt<bool> PGOWarnMissing; |
44 | extern cl::opt<bool> NoPGOWarnMismatch; |
45 | extern cl::opt<bool> NoPGOWarnMismatchComdatWeak; |
46 | } // namespace llvm |
47 | |
48 | // By default disable matching of allocation profiles onto operator new that |
49 | // already explicitly pass a hot/cold hint, since we don't currently |
50 | // override these hints anyway. |
static cl::opt<bool> ClMemProfMatchHotColdNew(
    "memprof-match-hot-cold-new",
    cl::desc(
        "Match allocation profiles onto existing hot/cold operator new calls"),
    cl::Hidden, cl::init(false));
56 | |
static cl::opt<bool>
    ClPrintMemProfMatchInfo("memprof-print-match-info",
                            cl::desc("Print matching stats for each allocation "
                                     "context in this module's profiles"),
                            cl::Hidden, cl::init(false));
62 | |
static cl::opt<bool>
    SalvageStaleProfile("memprof-salvage-stale-profile",
                        cl::desc("Salvage stale MemProf profile"),
                        cl::init(false), cl::Hidden);
67 | |
static cl::opt<bool> ClMemProfAttachCalleeGuids(
    "memprof-attach-calleeguids",
    cl::desc(
        "Attach calleeguids as value profile metadata for indirect calls."),
    cl::init(true), cl::Hidden);
73 | |
static cl::opt<unsigned> MinMatchedColdBytePercent(
    "memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
    cl::desc("Min percent of cold bytes matched to hint allocation cold"));
77 | |
78 | // Matching statistics |
STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
STATISTIC(NumOfMemProfMismatch,
          "Number of functions having mismatched memory profile hash.");
STATISTIC(NumOfMemProfFunc, "Number of functions having valid memory profile.");
STATISTIC(NumOfMemProfAllocContextProfiles,
          "Number of alloc contexts in memory profile.");
STATISTIC(NumOfMemProfCallSiteProfiles,
          "Number of callsites in memory profile.");
STATISTIC(NumOfMemProfMatchedAllocContexts,
          "Number of matched memory profile alloc contexts.");
STATISTIC(NumOfMemProfMatchedAllocs,
          "Number of matched memory profile allocs.");
STATISTIC(NumOfMemProfMatchedCallSites,
          "Number of matched memory profile callsites.");
93 | |
94 | static void addCallsiteMetadata(Instruction &I, |
95 | ArrayRef<uint64_t> InlinedCallStack, |
96 | LLVMContext &Ctx) { |
  I.setMetadata(LLVMContext::MD_callsite,
                buildCallstackMetadata(InlinedCallStack, Ctx));
99 | } |
100 | |
101 | static uint64_t computeStackId(GlobalValue::GUID Function, uint32_t LineOffset, |
102 | uint32_t Column) { |
103 | llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little> |
104 | HashBuilder; |
  HashBuilder.add(Function, LineOffset, Column);
106 | llvm::BLAKE3Result<8> Hash = HashBuilder.final(); |
107 | uint64_t Id; |
  std::memcpy(&Id, Hash.data(), sizeof(Hash));
109 | return Id; |
110 | } |
111 | |
112 | static uint64_t computeStackId(const memprof::Frame &Frame) { |
  return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
114 | } |
115 | |
116 | static AllocationType addCallStack(CallStackTrie &AllocTrie, |
117 | const AllocationInfo *AllocInfo, |
118 | uint64_t FullStackId) { |
119 | SmallVector<uint64_t> StackIds; |
120 | for (const auto &StackFrame : AllocInfo->CallStack) |
    StackIds.push_back(computeStackId(StackFrame));
  auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
                                AllocInfo->Info.getAllocCount(),
                                AllocInfo->Info.getTotalLifetime());
125 | std::vector<ContextTotalSize> ContextSizeInfo; |
126 | if (recordContextSizeInfoForAnalysis()) { |
127 | auto TotalSize = AllocInfo->Info.getTotalSize(); |
128 | assert(TotalSize); |
129 | assert(FullStackId != 0); |
    ContextSizeInfo.push_back({FullStackId, TotalSize});
131 | } |
  AllocTrie.addCallStack(AllocType, StackIds, std::move(ContextSizeInfo));
133 | return AllocType; |
134 | } |
135 | |
136 | // Return true if InlinedCallStack, computed from a call instruction's debug |
137 | // info, is a prefix of ProfileCallStack, a list of Frames from profile data |
138 | // (either the allocation data or a callsite). |
139 | static bool |
140 | stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack, |
141 | ArrayRef<uint64_t> InlinedCallStack) { |
142 | return ProfileCallStack.size() >= InlinedCallStack.size() && |
         llvm::equal(ProfileCallStack.take_front(InlinedCallStack.size()),
                     InlinedCallStack, [](const Frame &F, uint64_t StackId) {
                       return computeStackId(F) == StackId;
146 | }); |
147 | } |
148 | |
149 | static bool isAllocationWithHotColdVariant(const Function *Callee, |
150 | const TargetLibraryInfo &TLI) { |
151 | if (!Callee) |
152 | return false; |
153 | LibFunc Func; |
  if (!TLI.getLibFunc(*Callee, Func))
155 | return false; |
156 | switch (Func) { |
157 | case LibFunc_Znwm: |
158 | case LibFunc_ZnwmRKSt9nothrow_t: |
159 | case LibFunc_ZnwmSt11align_val_t: |
160 | case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: |
161 | case LibFunc_Znam: |
162 | case LibFunc_ZnamRKSt9nothrow_t: |
163 | case LibFunc_ZnamSt11align_val_t: |
164 | case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: |
165 | case LibFunc_size_returning_new: |
166 | case LibFunc_size_returning_new_aligned: |
167 | return true; |
168 | case LibFunc_Znwm12__hot_cold_t: |
169 | case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t: |
170 | case LibFunc_ZnwmSt11align_val_t12__hot_cold_t: |
171 | case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t: |
172 | case LibFunc_Znam12__hot_cold_t: |
173 | case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t: |
174 | case LibFunc_ZnamSt11align_val_t12__hot_cold_t: |
175 | case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t: |
176 | case LibFunc_size_returning_new_hot_cold: |
177 | case LibFunc_size_returning_new_aligned_hot_cold: |
178 | return ClMemProfMatchHotColdNew; |
179 | default: |
180 | return false; |
181 | } |
182 | } |
183 | |
184 | struct AllocMatchInfo { |
185 | uint64_t TotalSize = 0; |
186 | AllocationType AllocType = AllocationType::None; |
187 | }; |
188 | |
189 | DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> |
memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI,
                            function_ref<bool(uint64_t)> IsPresentInProfile) {
192 | DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> Calls; |
193 | |
194 | auto GetOffset = [](const DILocation *DIL) { |
195 | return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) & |
196 | 0xffff; |
197 | }; |
198 | |
199 | for (Function &F : M) { |
200 | if (F.isDeclaration()) |
201 | continue; |
202 | |
203 | for (auto &BB : F) { |
204 | for (auto &I : BB) { |
        if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
          continue;

        auto *CB = dyn_cast<CallBase>(&I);
209 | auto *CalledFunction = CB->getCalledFunction(); |
210 | // Disregard indirect calls and intrinsics. |
211 | if (!CalledFunction || CalledFunction->isIntrinsic()) |
212 | continue; |
213 | |
214 | StringRef CalleeName = CalledFunction->getName(); |
215 | // True if we are calling a heap allocation function that supports |
216 | // hot/cold variants. |
        bool IsAlloc = isAllocationWithHotColdVariant(CalledFunction, TLI);
218 | // True for the first iteration below, indicating that we are looking at |
219 | // a leaf node. |
220 | bool IsLeaf = true; |
221 | for (const DILocation *DIL = I.getDebugLoc(); DIL; |
222 | DIL = DIL->getInlinedAt()) { |
223 | StringRef CallerName = DIL->getSubprogramLinkageName(); |
          assert(!CallerName.empty() &&
                 "Be sure to enable -fdebug-info-for-profiling");
          uint64_t CallerGUID = memprof::getGUID(CallerName);
          uint64_t CalleeGUID = memprof::getGUID(CalleeName);
228 | // Pretend that we are calling a function with GUID == 0 if we are |
229 | // in the inline stack leading to a heap allocation function. |
230 | if (IsAlloc) { |
231 | if (IsLeaf) { |
232 | // For leaf nodes, set CalleeGUID to 0 without consulting |
233 | // IsPresentInProfile. |
234 | CalleeGUID = 0; |
235 | } else if (!IsPresentInProfile(CalleeGUID)) { |
236 | // In addition to the leaf case above, continue to set CalleeGUID |
237 | // to 0 as long as we don't see CalleeGUID in the profile. |
238 | CalleeGUID = 0; |
239 | } else { |
240 | // Once we encounter a callee that exists in the profile, stop |
241 | // setting CalleeGUID to 0. |
242 | IsAlloc = false; |
243 | } |
244 | } |
245 | |
246 | LineLocation Loc = {GetOffset(DIL), DIL->getColumn()}; |
          Calls[CallerGUID].emplace_back(Loc, CalleeGUID);
248 | CalleeName = CallerName; |
249 | IsLeaf = false; |
250 | } |
251 | } |
252 | } |
253 | } |
254 | |
  // Sort each call list by source location and remove duplicates.
  for (auto &[CallerGUID, CallList] : Calls) {
    llvm::sort(CallList);
    CallList.erase(llvm::unique(CallList), CallList.end());
259 | } |
260 | |
261 | return Calls; |
262 | } |
263 | |
264 | DenseMap<uint64_t, LocToLocMap> |
265 | memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader, |
266 | const TargetLibraryInfo &TLI) { |
267 | DenseMap<uint64_t, LocToLocMap> UndriftMaps; |
268 | |
269 | DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile = |
270 | MemProfReader->getMemProfCallerCalleePairs(); |
271 | DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR = |
      extractCallsFromIR(M, TLI, [&](uint64_t GUID) {
        return CallsFromProfile.contains(GUID);
274 | }); |
275 | |
276 | // Compute an undrift map for each CallerGUID. |
277 | for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) { |
    auto It = CallsFromProfile.find(CallerGUID);
279 | if (It == CallsFromProfile.end()) |
280 | continue; |
281 | const auto &ProfileAnchors = It->second; |
282 | |
283 | LocToLocMap Matchings; |
284 | longestCommonSequence<LineLocation, GlobalValue::GUID>( |
        ProfileAnchors, IRAnchors, std::equal_to<GlobalValue::GUID>(),
        [&](LineLocation A, LineLocation B) { Matchings.try_emplace(A, B); });
    [[maybe_unused]] bool Inserted =
        UndriftMaps.try_emplace(CallerGUID, std::move(Matchings)).second;
289 | |
290 | // The insertion must succeed because we visit each GUID exactly once. |
291 | assert(Inserted); |
292 | } |
293 | |
294 | return UndriftMaps; |
295 | } |
296 | |
297 | // Given a MemProfRecord, undrift all the source locations present in the |
298 | // record in place. |
299 | static void |
300 | undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps, |
301 | memprof::MemProfRecord &MemProfRec) { |
302 | // Undrift a call stack in place. |
303 | auto UndriftCallStack = [&](std::vector<Frame> &CallStack) { |
304 | for (auto &F : CallStack) { |
      auto I = UndriftMaps.find(F.Function);
      if (I == UndriftMaps.end())
        continue;
      auto J = I->second.find(LineLocation(F.LineOffset, F.Column));
309 | if (J == I->second.end()) |
310 | continue; |
311 | auto &NewLoc = J->second; |
312 | F.LineOffset = NewLoc.LineOffset; |
313 | F.Column = NewLoc.Column; |
314 | } |
315 | }; |
316 | |
317 | for (auto &AS : MemProfRec.AllocSites) |
318 | UndriftCallStack(AS.CallStack); |
319 | |
320 | for (auto &CS : MemProfRec.CallSites) |
321 | UndriftCallStack(CS.Frames); |
322 | } |
323 | |
324 | // Helper function to process CalleeGuids and create value profile metadata |
325 | static void addVPMetadata(Module &M, Instruction &I, |
326 | ArrayRef<GlobalValue::GUID> CalleeGuids) { |
327 | if (!ClMemProfAttachCalleeGuids || CalleeGuids.empty()) |
328 | return; |
329 | |
  if (I.getMetadata(LLVMContext::MD_prof)) {
    uint64_t Unused;
    // TODO: When merging is implemented, increase this to a typical ICP value
    // (e.g., 3-6). For now we only need to check whether existing data
    // exists, so 1 is sufficient.
    auto ExistingVD = getValueProfDataFromInst(I, IPVK_IndirectCallTarget,
                                               /*MaxNumValueData=*/1, Unused);
337 | // We don't know how to merge value profile data yet. |
338 | if (!ExistingVD.empty()) { |
339 | return; |
340 | } |
341 | } |
342 | |
343 | SmallVector<InstrProfValueData, 4> VDs; |
344 | uint64_t TotalCount = 0; |
345 | |
346 | for (const GlobalValue::GUID CalleeGUID : CalleeGuids) { |
347 | InstrProfValueData VD; |
348 | VD.Value = CalleeGUID; |
349 | // For MemProf, we don't have actual call counts, so we assign |
350 | // a weight of 1 to each potential target. |
351 | // TODO: Consider making this weight configurable or increasing it to |
352 | // improve effectiveness for ICP. |
353 | VD.Count = 1; |
    VDs.push_back(VD);
355 | TotalCount += VD.Count; |
356 | } |
357 | |
358 | if (!VDs.empty()) { |
    annotateValueSite(M, I, VDs, TotalCount, IPVK_IndirectCallTarget,
                      VDs.size());
361 | } |
362 | } |
363 | |
364 | static void readMemprof(Module &M, Function &F, |
365 | IndexedInstrProfReader *MemProfReader, |
366 | const TargetLibraryInfo &TLI, |
367 | std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo> |
368 | &FullStackIdToAllocMatchInfo, |
369 | std::set<std::vector<uint64_t>> &MatchedCallSites, |
370 | DenseMap<uint64_t, LocToLocMap> &UndriftMaps, |
371 | OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) { |
372 | auto &Ctx = M.getContext(); |
  // Previously we used getIRPGOFuncName() here. For local linkage functions,
  // getIRPGOFuncName() returns the function name prefixed with "FileName;",
  // but llvm-profdata builds GUIDs from the DWARF function name, which lacks
  // that prefix, so local linkage functions could not find their
  // MemProfRecord. We therefore use getName() now.
  // 'unique-internal-linkage-names' can make MemProf work better for local
  // linkage functions.
380 | auto FuncName = F.getName(); |
  auto FuncGUID = Function::getGUIDAssumingExternalLinkage(FuncName);
  std::optional<memprof::MemProfRecord> MemProfRec;
  auto Err = MemProfReader->getMemProfRecord(FuncGUID).moveInto(MemProfRec);
  if (Err) {
    handleAllErrors(std::move(Err), [&](const InstrProfError &IPE) {
386 | auto Err = IPE.get(); |
387 | bool SkipWarning = false; |
388 | LLVM_DEBUG(dbgs() << "Error in reading profile for Func " << FuncName |
389 | << ": " ); |
390 | if (Err == instrprof_error::unknown_function) { |
391 | NumOfMemProfMissing++; |
392 | SkipWarning = !PGOWarnMissing; |
393 | LLVM_DEBUG(dbgs() << "unknown function" ); |
394 | } else if (Err == instrprof_error::hash_mismatch) { |
395 | NumOfMemProfMismatch++; |
396 | SkipWarning = |
397 | NoPGOWarnMismatch || |
398 | (NoPGOWarnMismatchComdatWeak && |
399 | (F.hasComdat() || |
400 | F.getLinkage() == GlobalValue::AvailableExternallyLinkage)); |
401 | LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")" ); |
402 | } |
403 | |
404 | if (SkipWarning) |
405 | return; |
406 | |
      std::string Msg = (IPE.message() + Twine(" ") + F.getName().str() +
                         Twine(" Hash = ") + std::to_string(FuncGUID))
                            .str();
410 | |
      Ctx.diagnose(
          DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
413 | }); |
414 | return; |
415 | } |
416 | |
417 | NumOfMemProfFunc++; |
418 | |
  // If requested, undrift MemProfRecord so that the source locations in it
  // match those in the IR.
  if (SalvageStaleProfile)
    undriftMemProfRecord(UndriftMaps, *MemProfRec);
423 | |
424 | // Detect if there are non-zero column numbers in the profile. If not, |
425 | // treat all column numbers as 0 when matching (i.e. ignore any non-zero |
426 | // columns in the IR). The profiled binary might have been built with |
427 | // column numbers disabled, for example. |
428 | bool ProfileHasColumns = false; |
429 | |
430 | // Build maps of the location hash to all profile data with that leaf location |
431 | // (allocation info and the callsites). |
432 | std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo; |
433 | |
434 | // Helper struct for maintaining refs to callsite data. As an alternative we |
435 | // could store a pointer to the CallSiteInfo struct but we also need the frame |
436 | // index. Using ArrayRefs instead makes it a little easier to read. |
437 | struct CallSiteEntry { |
438 | // Subset of frames for the corresponding CallSiteInfo. |
439 | ArrayRef<Frame> Frames; |
440 | // Potential targets for indirect calls. |
441 | ArrayRef<GlobalValue::GUID> CalleeGuids; |
442 | |
    // Compare only the Frames member; CalleeGuids do not participate in
    // equality. Use pointer-based equality instead of ArrayRef's operator==,
    // which does element-wise comparison: we want to know whether this is the
    // same slice of the underlying array, not just equivalent content.
447 | bool operator==(const CallSiteEntry &Other) const { |
448 | return Frames.data() == Other.Frames.data() && |
449 | Frames.size() == Other.Frames.size(); |
450 | } |
451 | }; |
452 | |
453 | struct CallSiteEntryHash { |
454 | size_t operator()(const CallSiteEntry &Entry) const { |
      return computeFullStackId(Entry.Frames);
456 | } |
457 | }; |
458 | |
459 | // For the callsites we need to record slices of the frame array (see comments |
460 | // below where the map entries are added) along with their CalleeGuids. |
461 | std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>> |
462 | LocHashToCallSites; |
463 | for (auto &AI : MemProfRec->AllocSites) { |
464 | NumOfMemProfAllocContextProfiles++; |
465 | // Associate the allocation info with the leaf frame. The later matching |
466 | // code will match any inlined call sequences in the IR with a longer prefix |
467 | // of call stack frames. |
    uint64_t StackId = computeStackId(AI.CallStack[0]);
    LocHashToAllocInfo[StackId].insert(&AI);
    ProfileHasColumns |= AI.CallStack[0].Column;
471 | } |
472 | for (auto &CS : MemProfRec->CallSites) { |
473 | NumOfMemProfCallSiteProfiles++; |
474 | // Need to record all frames from leaf up to and including this function, |
475 | // as any of these may or may not have been inlined at this point. |
476 | unsigned Idx = 0; |
477 | for (auto &StackFrame : CS.Frames) { |
      uint64_t StackId = computeStackId(StackFrame);
      ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames).drop_front(Idx++);
      ArrayRef<GlobalValue::GUID> CalleeGuids(CS.CalleeGuids);
      LocHashToCallSites[StackId].insert({FrameSlice, CalleeGuids});
482 | |
483 | ProfileHasColumns |= StackFrame.Column; |
484 | // Once we find this function, we can stop recording. |
485 | if (StackFrame.Function == FuncGUID) |
486 | break; |
487 | } |
488 | assert(Idx <= CS.Frames.size() && CS.Frames[Idx - 1].Function == FuncGUID); |
489 | } |
490 | |
491 | auto GetOffset = [](const DILocation *DIL) { |
492 | return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) & |
493 | 0xffff; |
494 | }; |
495 | |
496 | // Now walk the instructions, looking up the associated profile data using |
497 | // debug locations. |
498 | for (auto &BB : F) { |
499 | for (auto &I : BB) { |
500 | if (I.isDebugOrPseudoInst()) |
501 | continue; |
502 | // We are only interested in calls (allocation or interior call stack |
503 | // context calls). |
      auto *CI = dyn_cast<CallBase>(&I);
505 | if (!CI) |
506 | continue; |
507 | auto *CalledFunction = CI->getCalledFunction(); |
508 | if (CalledFunction && CalledFunction->isIntrinsic()) |
509 | continue; |
510 | // List of call stack ids computed from the location hashes on debug |
511 | // locations (leaf to inlined at root). |
512 | SmallVector<uint64_t, 8> InlinedCallStack; |
513 | // Was the leaf location found in one of the profile maps? |
514 | bool LeafFound = false; |
515 | // If leaf was found in a map, iterators pointing to its location in both |
516 | // of the maps. It might exist in neither, one, or both (the latter case |
517 | // can happen because we don't currently have discriminators to |
518 | // distinguish the case when a single line/col maps to both an allocation |
519 | // and another callsite). |
520 | auto AllocInfoIter = LocHashToAllocInfo.end(); |
521 | auto CallSitesIter = LocHashToCallSites.end(); |
522 | for (const DILocation *DIL = I.getDebugLoc(); DIL != nullptr; |
523 | DIL = DIL->getInlinedAt()) { |
524 | // Use C++ linkage name if possible. Need to compile with |
525 | // -fdebug-info-for-profiling to get linkage name. |
526 | StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName(); |
527 | if (Name.empty()) |
528 | Name = DIL->getScope()->getSubprogram()->getName(); |
        auto CalleeGUID = Function::getGUIDAssumingExternalLinkage(Name);
        auto StackId = computeStackId(CalleeGUID, GetOffset(DIL),
                                      ProfileHasColumns ? DIL->getColumn() : 0);
532 | // Check if we have found the profile's leaf frame. If yes, collect |
533 | // the rest of the call's inlined context starting here. If not, see if |
534 | // we find a match further up the inlined context (in case the profile |
535 | // was missing debug frames at the leaf). |
536 | if (!LeafFound) { |
          AllocInfoIter = LocHashToAllocInfo.find(StackId);
          CallSitesIter = LocHashToCallSites.find(StackId);
539 | if (AllocInfoIter != LocHashToAllocInfo.end() || |
540 | CallSitesIter != LocHashToCallSites.end()) |
541 | LeafFound = true; |
542 | } |
543 | if (LeafFound) |
          InlinedCallStack.push_back(StackId);
545 | } |
546 | // If leaf not in either of the maps, skip inst. |
547 | if (!LeafFound) |
548 | continue; |
549 | |
550 | // First add !memprof metadata from allocation info, if we found the |
551 | // instruction's leaf location in that map, and if the rest of the |
552 | // instruction's locations match the prefix Frame locations on an |
553 | // allocation context with the same leaf. |
554 | if (AllocInfoIter != LocHashToAllocInfo.end() && |
555 | // Only consider allocations which support hinting. |
          isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI)) {
557 | // We may match this instruction's location list to multiple MIB |
558 | // contexts. Add them to a Trie specialized for trimming the contexts to |
559 | // the minimal needed to disambiguate contexts with unique behavior. |
560 | CallStackTrie AllocTrie(&ORE, MaxColdSize); |
561 | uint64_t TotalSize = 0; |
562 | uint64_t TotalColdSize = 0; |
563 | for (auto *AllocInfo : AllocInfoIter->second) { |
564 | // Check the full inlined call stack against this one. |
565 | // If we found and thus matched all frames on the call, include |
566 | // this MIB. |
          if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                                 InlinedCallStack)) {
569 | NumOfMemProfMatchedAllocContexts++; |
570 | uint64_t FullStackId = 0; |
571 | if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) |
              FullStackId = computeFullStackId(AllocInfo->CallStack);
573 | auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); |
574 | TotalSize += AllocInfo->Info.getTotalSize(); |
575 | if (AllocType == AllocationType::Cold) |
576 | TotalColdSize += AllocInfo->Info.getTotalSize(); |
577 | // Record information about the allocation if match info printing |
578 | // was requested. |
579 | if (ClPrintMemProfMatchInfo) { |
580 | assert(FullStackId != 0); |
              FullStackIdToAllocMatchInfo[std::make_pair(
                  FullStackId, InlinedCallStack.size())] = {
                  AllocInfo->Info.getTotalSize(), AllocType};
584 | } |
585 | } |
586 | } |
587 | // If the threshold for the percent of cold bytes is less than 100%, |
588 | // and not all bytes are cold, see if we should still hint this |
589 | // allocation as cold without context sensitivity. |
590 | if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 && |
591 | TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) { |
          AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold,
                                                "dominant");
594 | continue; |
595 | } |
596 | |
597 | // We might not have matched any to the full inlined call stack. |
598 | // But if we did, create and attach metadata, or a function attribute if |
599 | // all contexts have identical profiled behavior. |
600 | if (!AllocTrie.empty()) { |
601 | NumOfMemProfMatchedAllocs++; |
602 | // MemprofMDAttached will be false if a function attribute was |
603 | // attached. |
604 | bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI); |
605 | assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof)); |
606 | if (MemprofMDAttached) { |
607 | // Add callsite metadata for the instruction's location list so that |
608 | // it simpler later on to identify which part of the MIB contexts |
609 | // are from this particular instruction (including during inlining, |
610 | // when the callsite metadata will be updated appropriately). |
611 | // FIXME: can this be changed to strip out the matching stack |
612 | // context ids from the MIB contexts and not add any callsite |
613 | // metadata here to save space? |
614 | addCallsiteMetadata(I, InlinedCallStack, Ctx); |
615 | } |
616 | } |
617 | continue; |
618 | } |
619 | |
620 | if (CallSitesIter == LocHashToCallSites.end()) |
621 | continue; |
622 | |
623 | // Otherwise, add callsite metadata. If we reach here then we found the |
624 | // instruction's leaf location in the callsites map and not the allocation |
625 | // map. |
626 | for (const auto &CallSiteEntry : CallSitesIter->second) { |
627 | // If we found and thus matched all frames on the call, create and |
628 | // attach call stack metadata. |
        if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
                                               InlinedCallStack)) {
631 | NumOfMemProfMatchedCallSites++; |
632 | addCallsiteMetadata(I, InlinedCallStack, Ctx); |
633 | |
634 | // Try to attach indirect call metadata if possible. |
635 | if (!CalledFunction) |
            addVPMetadata(M, I, CallSiteEntry.CalleeGuids);
637 | |
638 | // Only need to find one with a matching call stack and add a single |
639 | // callsite metadata. |
640 | |
641 | // Accumulate call site matching information upon request. |
642 | if (ClPrintMemProfMatchInfo) { |
643 | std::vector<uint64_t> CallStack; |
            append_range(CallStack, InlinedCallStack);
            MatchedCallSites.insert(std::move(CallStack));
646 | } |
647 | break; |
648 | } |
649 | } |
650 | } |
651 | } |
652 | } |
653 | |
654 | MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile, |
655 | IntrusiveRefCntPtr<vfs::FileSystem> FS) |
656 | : MemoryProfileFileName(MemoryProfileFile), FS(FS) { |
657 | if (!FS) |
658 | this->FS = vfs::getRealFileSystem(); |
659 | } |
660 | |
661 | PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { |
662 | // Return immediately if the module doesn't contain any function. |
663 | if (M.empty()) |
664 | return PreservedAnalyses::all(); |
665 | |
666 | LLVM_DEBUG(dbgs() << "Read in memory profile:" ); |
667 | auto &Ctx = M.getContext(); |
  auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS);
  if (Error E = ReaderOrErr.takeError()) {
    handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
      Ctx.diagnose(
          DiagnosticInfoPGOProfile(MemoryProfileFileName.data(), EI.message()));
673 | }); |
674 | return PreservedAnalyses::all(); |
675 | } |
676 | |
677 | std::unique_ptr<IndexedInstrProfReader> MemProfReader = |
678 | std::move(ReaderOrErr.get()); |
679 | if (!MemProfReader) { |
    Ctx.diagnose(DiagnosticInfoPGOProfile(
        MemoryProfileFileName.data(), StringRef("Cannot get MemProfReader")));
682 | return PreservedAnalyses::all(); |
683 | } |
684 | |
685 | if (!MemProfReader->hasMemoryProfile()) { |
    Ctx.diagnose(DiagnosticInfoPGOProfile(MemoryProfileFileName.data(),
                                          "Not a memory profile"));
688 | return PreservedAnalyses::all(); |
689 | } |
690 | |
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin());
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;
  if (SalvageStaleProfile)
    UndriftMaps = computeUndriftMap(M, MemProfReader.get(), TLI);
697 | |
698 | // Map from the stack hash and matched frame count of each allocation context |
699 | // in the function profiles to the total profiled size (bytes) and allocation |
700 | // type. |
701 | std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo> |
702 | FullStackIdToAllocMatchInfo; |
703 | |
704 | // Set of the matched call sites, each expressed as a sequence of an inline |
705 | // call stack. |
706 | std::set<std::vector<uint64_t>> MatchedCallSites; |
707 | |
708 | uint64_t MaxColdSize = 0; |
709 | if (auto *MemProfSum = MemProfReader->getMemProfSummary()) |
710 | MaxColdSize = MemProfSum->getMaxColdTotalSize(); |
711 | |
712 | for (auto &F : M) { |
713 | if (F.isDeclaration()) |
714 | continue; |
715 | |
    const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
    readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
                MatchedCallSites, UndriftMaps, ORE, MaxColdSize);
720 | } |
721 | |
722 | if (ClPrintMemProfMatchInfo) { |
723 | for (const auto &[IdLengthPair, Info] : FullStackIdToAllocMatchInfo) { |
724 | auto [Id, Length] = IdLengthPair; |
725 | errs() << "MemProf " << getAllocTypeAttributeString(Type: Info.AllocType) |
726 | << " context with id " << Id << " has total profiled size " |
727 | << Info.TotalSize << " is matched with " << Length << " frames\n" ; |
728 | } |
729 | |
730 | for (const auto &CallStack : MatchedCallSites) { |
731 | errs() << "MemProf callsite match for inline call stack" ; |
732 | for (uint64_t StackId : CallStack) |
733 | errs() << " " << StackId; |
734 | errs() << "\n" ; |
735 | } |
736 | } |
737 | |
738 | return PreservedAnalyses::none(); |
739 | } |
740 | |