//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringExtras.h"
33#include "llvm/Analysis/MemoryProfileInfo.h"
34#include "llvm/Analysis/ModuleSummaryAnalysis.h"
35#include "llvm/Analysis/OptimizationRemarkEmitter.h"
36#include "llvm/Bitcode/BitcodeReader.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/Module.h"
39#include "llvm/IR/ModuleSummaryIndex.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/GraphWriter.h"
43#include "llvm/Support/InterleavedRange.h"
44#include "llvm/Support/SHA1.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Transforms/IPO.h"
47#include "llvm/Transforms/Utils/CallPromotionUtils.h"
48#include "llvm/Transforms/Utils/Cloning.h"
49#include "llvm/Transforms/Utils/Instrumentation.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114STATISTIC(AliaseesPrevailingInDiffModuleFromAlias,
115 "Number of aliasees prevailing in a different module than its alias");
116
117static cl::opt<std::string> DotFilePathPrefix(
118 "memprof-dot-file-path-prefix", cl::init(Val: ""), cl::Hidden,
119 cl::value_desc("filename"),
120 cl::desc("Specify the path prefix of the MemProf dot files."));
121
122static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(Val: false),
123 cl::Hidden,
124 cl::desc("Export graph to dot files."));
125
126// TODO: Remove this option once new handling is validated more widely.
127static cl::opt<bool> DoMergeIteration(
128 "memprof-merge-iteration", cl::init(Val: true), cl::Hidden,
129 cl::desc("Iteratively apply merging on a node to catch new callers"));
130
131// How much of the graph to export to dot.
132enum DotScope {
133 All, // The full CCG graph.
134 Alloc, // Only contexts for the specified allocation.
135 Context, // Only the specified context.
136};
137
138static cl::opt<DotScope> DotGraphScope(
139 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
140 cl::Hidden, cl::init(Val: DotScope::All),
141 cl::values(
142 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
143 clEnumValN(DotScope::Alloc, "alloc",
144 "Export only nodes with contexts feeding given "
145 "-memprof-dot-alloc-id"),
146 clEnumValN(DotScope::Context, "context",
147 "Export only nodes with given -memprof-dot-context-id")));
148
149static cl::opt<unsigned>
150 AllocIdForDot("memprof-dot-alloc-id", cl::init(Val: 0), cl::Hidden,
151 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
152 "or to highlight if -memprof-dot-scope=all"));
153
154static cl::opt<unsigned> ContextIdForDot(
155 "memprof-dot-context-id", cl::init(Val: 0), cl::Hidden,
156 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
157 "highlight otherwise"));
158
159static cl::opt<bool>
160 DumpCCG("memprof-dump-ccg", cl::init(Val: false), cl::Hidden,
161 cl::desc("Dump CallingContextGraph to stdout after each stage."));
162
163static cl::opt<bool>
164 VerifyCCG("memprof-verify-ccg", cl::init(Val: false), cl::Hidden,
165 cl::desc("Perform verification checks on CallingContextGraph."));
166
167static cl::opt<bool>
168 VerifyNodes("memprof-verify-nodes", cl::init(Val: false), cl::Hidden,
169 cl::desc("Perform frequent verification checks on nodes."));
170
171static cl::opt<std::string> MemProfImportSummary(
172 "memprof-import-summary",
173 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
174 cl::Hidden);
175
176static cl::opt<unsigned>
177 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(Val: 5),
178 cl::Hidden,
179 cl::desc("Max depth to recursively search for missing "
180 "frames through tail calls."));
181
182// Optionally enable cloning of callsites involved with recursive cycles
183static cl::opt<bool> AllowRecursiveCallsites(
184 "memprof-allow-recursive-callsites", cl::init(Val: true), cl::Hidden,
185 cl::desc("Allow cloning of callsites involved in recursive cycles"));
186
187static cl::opt<bool> CloneRecursiveContexts(
188 "memprof-clone-recursive-contexts", cl::init(Val: true), cl::Hidden,
189 cl::desc("Allow cloning of contexts through recursive cycles"));
190
191// Generally this is needed for correct assignment of allocation clones to
192// function clones, however, allow it to be disabled for debugging while the
193// functionality is new and being tested more widely.
194static cl::opt<bool>
195 MergeClones("memprof-merge-clones", cl::init(Val: true), cl::Hidden,
196 cl::desc("Merge clones before assigning functions"));
197
198// When disabled, try to detect and prevent cloning of recursive contexts.
199// This is only necessary until we support cloning through recursive cycles.
200// Leave on by default for now, as disabling requires a little bit of compile
201// time overhead and doesn't affect correctness, it will just inflate the cold
202// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
203static cl::opt<bool> AllowRecursiveContexts(
204 "memprof-allow-recursive-contexts", cl::init(Val: true), cl::Hidden,
205 cl::desc("Allow cloning of contexts having recursive cycles"));
206
207// Set the minimum absolute count threshold for allowing inlining of indirect
208// calls promoted during cloning.
209static cl::opt<unsigned> MemProfICPNoInlineThreshold(
210 "memprof-icp-noinline-threshold", cl::init(Val: 2), cl::Hidden,
211 cl::desc("Minimum absolute count for promoted target to be inlinable"));
212
213namespace llvm {
214cl::opt<bool> EnableMemProfContextDisambiguation(
215 "enable-memprof-context-disambiguation", cl::Hidden,
216 cl::desc("Enable MemProf context disambiguation"));
217
218// Indicate we are linking with an allocator that supports hot/cold operator
219// new interfaces.
220cl::opt<bool> SupportsHotColdNew(
221 "supports-hot-cold-new", cl::init(Val: false), cl::Hidden,
222 cl::desc("Linking with hot/cold operator new interfaces"));
223
224static cl::opt<bool> MemProfRequireDefinitionForPromotion(
225 "memprof-require-definition-for-promotion", cl::init(Val: false), cl::Hidden,
226 cl::desc(
227 "Require target function definition when promoting indirect calls"));
228
229extern cl::opt<bool> MemProfReportHintedSizes;
230extern cl::opt<unsigned> MinClonedColdBytePercent;
231
232cl::opt<unsigned> MemProfTopNImportant(
233 "memprof-top-n-important", cl::init(Val: 10), cl::Hidden,
234 cl::desc("Number of largest cold contexts to consider important"));
235
236cl::opt<bool> MemProfFixupImportant(
237 "memprof-fixup-important", cl::init(Val: true), cl::Hidden,
238 cl::desc("Enables edge fixup for important contexts"));
239
240extern cl::opt<unsigned> MaxSummaryIndirectEdges;
241
242} // namespace llvm
243
244namespace {
245
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
260template <typename DerivedCCG, typename FuncTy, typename CallTy>
261class CallsiteContextGraph {
262public:
263 CallsiteContextGraph() = default;
264 CallsiteContextGraph(const CallsiteContextGraph &) = default;
265 CallsiteContextGraph(CallsiteContextGraph &&) = default;
266
267 /// Main entry point to perform analysis and transformations on graph.
268 bool process(function_ref<void(StringRef, StringRef, const Twine &)>
269 EmitRemark = nullptr,
270 bool AllowExtraAnalysis = false);
271
272 /// Perform cloning on the graph necessary to uniquely identify the allocation
273 /// behavior of an allocation based on its context.
274 void identifyClones();
275
276 /// Assign callsite clones to functions, cloning functions as needed to
277 /// accommodate the combinations of their callsite clones reached by callers.
278 /// For regular LTO this clones functions and callsites in the IR, but for
279 /// ThinLTO the cloning decisions are noted in the summaries and later applied
280 /// in applyImport.
281 bool assignFunctions();
282
283 void dump() const;
284 void print(raw_ostream &OS) const;
285 void printTotalSizes(raw_ostream &OS,
286 function_ref<void(StringRef, StringRef, const Twine &)>
287 EmitRemark = nullptr) const;
288
289 friend raw_ostream &operator<<(raw_ostream &OS,
290 const CallsiteContextGraph &CCG) {
291 CCG.print(OS);
292 return OS;
293 }
294
295 friend struct GraphTraits<
296 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
297 friend struct DOTGraphTraits<
298 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
299
300 void exportToDot(std::string Label) const;
301
302 /// Represents a function clone via FuncTy pointer and clone number pair.
303 struct FuncInfo final
304 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
305 using Base = std::pair<FuncTy *, unsigned>;
306 FuncInfo(const Base &B) : Base(B) {}
307 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
308 explicit operator bool() const { return this->first != nullptr; }
309 FuncTy *func() const { return this->first; }
310 unsigned cloneNo() const { return this->second; }
311 };
312
313 /// Represents a callsite clone via CallTy and clone number pair.
314 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
315 using Base = std::pair<CallTy, unsigned>;
316 CallInfo(const Base &B) : Base(B) {}
317 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
318 : Base(Call, CloneNo) {}
319 explicit operator bool() const { return (bool)this->first; }
320 CallTy call() const { return this->first; }
321 unsigned cloneNo() const { return this->second; }
322 void setCloneNo(unsigned N) { this->second = N; }
323 void print(raw_ostream &OS) const {
324 if (!operator bool()) {
325 assert(!cloneNo());
326 OS << "null Call";
327 return;
328 }
329 call()->print(OS);
330 OS << "\t(clone " << cloneNo() << ")";
331 }
332 void dump() const {
333 print(OS&: dbgs());
334 dbgs() << "\n";
335 }
336 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
337 Call.print(OS);
338 return OS;
339 }
340 };
341
342 struct ContextEdge;
343
344 /// Node in the Callsite Context Graph
345 struct ContextNode {
346 // Assigned to nodes as they are created, useful for debugging.
347 unsigned NodeId = 0;
348
349 // Keep this for now since in the IR case where we have an Instruction* it
350 // is not as immediately discoverable. Used for printing richer information
351 // when dumping graph.
352 bool IsAllocation;
353
354 // Keeps track of when the Call was reset to null because there was
355 // recursion.
356 bool Recursive = false;
357
358 // This will be formed by ORing together the AllocationType enum values
359 // for contexts including this node.
360 uint8_t AllocTypes = 0;
361
362 // The corresponding allocation or interior call. This is the primary call
363 // for which we have created this node.
364 CallInfo Call;
365
366 // List of other calls that can be treated the same as the primary call
367 // through cloning. I.e. located in the same function and have the same
368 // (possibly pruned) stack ids. They will be updated the same way as the
369 // primary call when assigning to function clones.
370 SmallVector<CallInfo, 0> MatchingCalls;
371
372 // For alloc nodes this is a unique id assigned when constructed, and for
373 // callsite stack nodes it is the original stack id when the node is
374 // constructed from the memprof MIB metadata on the alloc nodes. Note that
375 // this is only used when matching callsite metadata onto the stack nodes
376 // created when processing the allocation memprof MIBs, and for labeling
377 // nodes in the dot graph. Therefore we don't bother to assign a value for
378 // clones.
379 uint64_t OrigStackOrAllocId = 0;
380
381 // Edges to all callees in the profiled call stacks.
382 // TODO: Should this be a map (from Callee node) for more efficient lookup?
383 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
384
385 // Edges to all callers in the profiled call stacks.
386 // TODO: Should this be a map (from Caller node) for more efficient lookup?
387 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
388
389 // Returns true if we need to look at the callee edges for determining the
390 // node context ids and allocation type.
391 bool useCallerEdgesForContextInfo() const {
392 // Typically if the callee edges are empty either the caller edges are
393 // also empty, or this is an allocation (leaf node). However, if we are
394 // allowing recursive callsites and contexts this will be violated for
395 // incompletely cloned recursive cycles.
396 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
397 (AllowRecursiveCallsites && AllowRecursiveContexts));
398 // When cloning for a recursive context, during cloning we might be in the
399 // midst of cloning for a recurrence and have moved context ids off of a
400 // caller edge onto the clone but not yet off of the incoming caller
401 // (back) edge. If we don't look at those we miss the fact that this node
402 // still has context ids of interest.
403 return IsAllocation || CloneRecursiveContexts;
404 }
405
406 // Compute the context ids for this node from the union of its edge context
407 // ids.
408 DenseSet<uint32_t> getContextIds() const {
409 unsigned Count = 0;
410 // Compute the number of ids for reserve below. In general we only need to
411 // look at one set of edges, typically the callee edges, since other than
412 // allocations and in some cases during recursion cloning, all the context
413 // ids on the callers should also flow out via callee edges.
414 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
415 Count += Edge->getContextIds().size();
416 DenseSet<uint32_t> ContextIds;
417 ContextIds.reserve(Size: Count);
418 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
419 CalleeEdges, useCallerEdgesForContextInfo()
420 ? CallerEdges
421 : std::vector<std::shared_ptr<ContextEdge>>());
422 for (const auto &Edge : Edges)
423 ContextIds.insert_range(Edge->getContextIds());
424 return ContextIds;
425 }
426
427 // Compute the allocation type for this node from the OR of its edge
428 // allocation types.
429 uint8_t computeAllocType() const {
430 uint8_t BothTypes =
431 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
432 uint8_t AllocType = (uint8_t)AllocationType::None;
433 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
434 CalleeEdges, useCallerEdgesForContextInfo()
435 ? CallerEdges
436 : std::vector<std::shared_ptr<ContextEdge>>());
437 for (const auto &Edge : Edges) {
438 AllocType |= Edge->AllocTypes;
439 // Bail early if alloc type reached both, no further refinement.
440 if (AllocType == BothTypes)
441 return AllocType;
442 }
443 return AllocType;
444 }
445
446 // The context ids set for this node is empty if its edge context ids are
447 // also all empty.
448 bool emptyContextIds() const {
449 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
450 CalleeEdges, useCallerEdgesForContextInfo()
451 ? CallerEdges
452 : std::vector<std::shared_ptr<ContextEdge>>());
453 for (const auto &Edge : Edges) {
454 if (!Edge->getContextIds().empty())
455 return false;
456 }
457 return true;
458 }
459
460 // List of clones of this ContextNode, initially empty.
461 std::vector<ContextNode *> Clones;
462
463 // If a clone, points to the original uncloned node.
464 ContextNode *CloneOf = nullptr;
465
466 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
467
468 ContextNode(bool IsAllocation, CallInfo C)
469 : IsAllocation(IsAllocation), Call(C) {}
470
471 void addClone(ContextNode *Clone) {
472 if (CloneOf) {
473 CloneOf->Clones.push_back(Clone);
474 Clone->CloneOf = CloneOf;
475 } else {
476 Clones.push_back(Clone);
477 assert(!Clone->CloneOf);
478 Clone->CloneOf = this;
479 }
480 }
481
482 ContextNode *getOrigNode() {
483 if (!CloneOf)
484 return this;
485 return CloneOf;
486 }
487
488 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
489 unsigned int ContextId);
490
491 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
492 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
493 void eraseCalleeEdge(const ContextEdge *Edge);
494 void eraseCallerEdge(const ContextEdge *Edge);
495
496 void setCall(CallInfo C) { Call = std::move(C); }
497
498 bool hasCall() const { return (bool)Call.call(); }
499
500 void printCall(raw_ostream &OS) const { Call.print(OS); }
501
502 // True if this node was effectively removed from the graph, in which case
503 // it should have an allocation type of None and empty context ids.
504 bool isRemoved() const {
505 // Typically if the callee edges are empty either the caller edges are
506 // also empty, or this is an allocation (leaf node). However, if we are
507 // allowing recursive callsites and contexts this will be violated for
508 // incompletely cloned recursive cycles.
509 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
510 (AllocTypes == (uint8_t)AllocationType::None) ==
511 emptyContextIds());
512 return AllocTypes == (uint8_t)AllocationType::None;
513 }
514
515 void dump() const;
516 void print(raw_ostream &OS) const;
517
518 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
519 Node.print(OS);
520 return OS;
521 }
522 };
523
524 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
525 /// callee.
526 struct ContextEdge {
527 ContextNode *Callee;
528 ContextNode *Caller;
529
530 // This will be formed by ORing together the AllocationType enum values
531 // for contexts including this edge.
532 uint8_t AllocTypes = 0;
533
534 // Set just before initiating cloning when cloning of recursive contexts is
535 // enabled. Used to defer cloning of backedges until we have done cloning of
536 // the callee node for non-backedge caller edges. This exposes cloning
537 // opportunities through the backedge of the cycle.
538 // TODO: Note that this is not updated during cloning, and it is unclear
539 // whether that would be needed.
540 bool IsBackedge = false;
541
542 // The set of IDs for contexts including this edge.
543 DenseSet<uint32_t> ContextIds;
544
545 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
546 DenseSet<uint32_t> ContextIds)
547 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
548 ContextIds(std::move(ContextIds)) {}
549
550 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
551
552 // Helper to clear the fields of this edge when we are removing it from the
553 // graph.
554 inline void clear() {
555 ContextIds.clear();
556 AllocTypes = (uint8_t)AllocationType::None;
557 Caller = nullptr;
558 Callee = nullptr;
559 }
560
561 // Check if edge was removed from the graph. This is useful while iterating
562 // over a copy of edge lists when performing operations that mutate the
563 // graph in ways that might remove one of the edges.
564 inline bool isRemoved() const {
565 if (Callee || Caller)
566 return false;
567 // Any edges that have been removed from the graph but are still in a
568 // shared_ptr somewhere should have all fields null'ed out by clear()
569 // above.
570 assert(AllocTypes == (uint8_t)AllocationType::None);
571 assert(ContextIds.empty());
572 return true;
573 }
574
575 void dump() const;
576 void print(raw_ostream &OS) const;
577
578 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
579 Edge.print(OS);
580 return OS;
581 }
582 };
583
584 /// Helpers to remove edges that have allocation type None (due to not
585 /// carrying any context ids) after transformations.
586 void removeNoneTypeCalleeEdges(ContextNode *Node);
587 void removeNoneTypeCallerEdges(ContextNode *Node);
588 void
589 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
590 DenseSet<const ContextNode *> &Visited);
591
592protected:
593 /// Get a list of nodes corresponding to the stack ids in the given callsite
594 /// context.
595 template <class NodeT, class IteratorT>
596 std::vector<uint64_t>
597 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
598
599 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
600 /// metadata (or summary).
601 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
602
603 /// Adds nodes for the given MIB stack ids.
604 template <class NodeT, class IteratorT>
605 void addStackNodesForMIB(
606 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
607 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
608 ArrayRef<ContextTotalSize> ContextSizeInfo,
609 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
610
611 /// Matches all callsite metadata (or summary) to the nodes created for
612 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
613 /// inlining performed on those callsite instructions.
614 void updateStackNodes();
615
616 /// Optionally fixup edges for the N largest cold contexts to better enable
617 /// cloning. This is particularly helpful if the context includes recursion
618 /// as well as inlining, resulting in a single stack node for multiple stack
619 /// ids in the context. With recursion it is particularly difficult to get the
620 /// edge updates correct as in the general case we have lost the original
621 /// stack id ordering for the context. Do more expensive fixup for the largest
622 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
623 void fixupImportantContexts();
624
625 /// Update graph to conservatively handle any callsite stack nodes that target
626 /// multiple different callee target functions.
627 void handleCallsitesWithMultipleTargets();
628
629 /// Mark backedges via the standard DFS based backedge algorithm.
630 void markBackedges();
631
632 /// Merge clones generated during cloning for different allocations but that
633 /// are called by the same caller node, to ensure proper function assignment.
634 void mergeClones();
635
636 // Try to partition calls on the given node (already placed into the AllCalls
637 // array) by callee function, creating new copies of Node as needed to hold
638 // calls with different callees, and moving the callee edges appropriately.
639 // Returns true if partitioning was successful.
640 bool partitionCallsByCallee(
641 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
642 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
643
644 /// Save lists of calls with MemProf metadata in each function, for faster
645 /// iteration.
646 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
647
648 /// Map from callsite node to the enclosing caller function.
649 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
650
651 // When exporting to dot, and an allocation id is specified, contains the
652 // context ids on that allocation.
653 DenseSet<uint32_t> DotAllocContextIds;
654
655private:
656 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
657
658 // Structure to keep track of information for each call as we are matching
659 // non-allocation callsites onto context nodes created from the allocation
660 // call metadata / summary contexts.
661 struct CallContextInfo {
662 // The callsite we're trying to match.
663 CallTy Call;
664 // The callsites stack ids that have a context node in the graph.
665 std::vector<uint64_t> StackIds;
666 // The function containing this callsite.
667 const FuncTy *Func;
668 // Initially empty, if needed this will be updated to contain the context
669 // ids for use in a new context node created for this callsite.
670 DenseSet<uint32_t> ContextIds;
671 };
672
673 /// Helper to remove edge from graph, updating edge iterator if it is provided
674 /// (in which case CalleeIter indicates which edge list is being iterated).
675 /// This will also perform the necessary clearing of the ContextEdge members
676 /// to enable later checking if the edge has been removed (since we may have
677 /// other copies of the shared_ptr in existence, and in fact rely on this to
678 /// enable removal while iterating over a copy of a node's edge list).
679 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
680 bool CalleeIter = true);
681
682 /// Assigns the given Node to calls at or inlined into the location with
683 /// the Node's stack id, after post order traversing and processing its
684 /// caller nodes. Uses the call information recorded in the given
685 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
686 /// as needed. Called by updateStackNodes which sets up the given
687 /// StackIdToMatchingCalls map.
688 void assignStackNodesPostOrder(
689 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
690 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
691 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
692 const DenseSet<uint32_t> &ImportantContextIds);
693
694 /// Duplicates the given set of context ids, updating the provided
695 /// map from each original id with the newly generated context ids,
696 /// and returning the new duplicated id set.
697 DenseSet<uint32_t> duplicateContextIds(
698 const DenseSet<uint32_t> &StackSequenceContextIds,
699 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
700
701 /// Propagates all duplicated context ids across the graph.
702 void propagateDuplicateContextIds(
703 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
704
705 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
706 /// else to its callers. Also updates OrigNode's edges to remove any context
707 /// ids moved to the newly created edge.
708 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
709 bool TowardsCallee,
710 DenseSet<uint32_t> RemainingContextIds);
711
712 /// Get the stack id corresponding to the given Id or Index (for IR this will
713 /// return itself, for a summary index this will return the id recorded in the
714 /// index for that stack id index value).
715 uint64_t getStackId(uint64_t IdOrIndex) const {
716 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
717 }
718
719 /// Returns true if the given call targets the callee of the given edge, or if
720 /// we were able to identify the call chain through intermediate tail calls.
721 /// In the latter case new context nodes are added to the graph for the
722 /// identified tail calls, and their synthesized nodes are added to
723 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
724 /// the updated edges and to prepare it for an increment in the caller.
725 bool
726 calleesMatch(CallTy Call, EdgeIter &EI,
727 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
728
729 // Return the callee function of the given call, or nullptr if it can't be
730 // determined
731 const FuncTy *getCalleeFunc(CallTy Call) {
732 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
733 }
734
735 /// Returns true if the given call targets the given function, or if we were
736 /// able to identify the call chain through intermediate tail calls (in which
737 /// case FoundCalleeChain will be populated).
738 bool calleeMatchesFunc(
739 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
740 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
741 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
742 Call, Func, CallerFunc, FoundCalleeChain);
743 }
744
745 /// Returns true if both call instructions have the same callee.
746 bool sameCallee(CallTy Call1, CallTy Call2) {
747 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
748 }
749
750 /// Get a list of nodes corresponding to the stack ids in the given
751 /// callsite's context.
752 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
753 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
754 Call);
755 }
756
757 /// Get the last stack id in the context for callsite.
758 uint64_t getLastStackId(CallTy Call) {
759 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
760 }
761
762 /// Update the allocation call to record type of allocated memory.
763 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
764 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
765 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
766 }
767
768 /// Get the AllocationType assigned to the given allocation instruction clone.
769 AllocationType getAllocationCallType(const CallInfo &Call) const {
770 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
771 }
772
773 /// Update non-allocation call to invoke (possibly cloned) function
774 /// CalleeFunc.
775 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
776 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
777 }
778
779 /// Clone the given function for the given callsite, recording mapping of all
780 /// of the functions tracked calls to their new versions in the CallMap.
781 /// Assigns new clones to clone number CloneNo.
782 FuncInfo cloneFunctionForCallsite(
783 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
784 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
785 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
786 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
787 }
788
789 /// Gets a label to use in the dot graph for the given call clone in the given
790 /// function.
791 std::string getLabel(const FuncTy *Func, const CallTy Call,
792 unsigned CloneNo) const {
793 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
794 }
795
796 // Create and return a new ContextNode.
797 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
798 CallInfo C = CallInfo()) {
799 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
800 auto *NewNode = NodeOwner.back().get();
801 if (F)
802 NodeToCallingFunc[NewNode] = F;
803 NewNode->NodeId = NodeOwner.size();
804 return NewNode;
805 }
806
807 /// Helpers to find the node corresponding to the given call or stackid.
808 ContextNode *getNodeForInst(const CallInfo &C);
809 ContextNode *getNodeForAlloc(const CallInfo &C);
810 ContextNode *getNodeForStackId(uint64_t StackId);
811
812 /// Computes the alloc type corresponding to the given context ids, by
813 /// unioning their recorded alloc types.
814 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
815
816 /// Returns the allocation type of the intersection of the contexts of two
817 /// nodes (based on their provided context id sets), optimized for the case
818 /// when Node1Ids is smaller than Node2Ids.
819 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
820 const DenseSet<uint32_t> &Node2Ids) const;
821
822 /// Returns the allocation type of the intersection of the contexts of two
823 /// nodes (based on their provided context id sets).
824 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
825 const DenseSet<uint32_t> &Node2Ids) const;
826
827 /// Create a clone of Edge's callee and move Edge to that new callee node,
828 /// performing the necessary context id and allocation type updates.
829 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
830 /// moved to an edge to the new callee.
831 ContextNode *
832 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
833 DenseSet<uint32_t> ContextIdsToMove = {});
834
835 /// Change the callee of Edge to existing callee clone NewCallee, performing
836 /// the necessary context id and allocation type updates.
837 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
838 /// moved to an edge to the new callee.
839 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
840 ContextNode *NewCallee,
841 bool NewClone = false,
842 DenseSet<uint32_t> ContextIdsToMove = {});
843
844 /// Change the caller of the edge at the given callee edge iterator to be
845 /// NewCaller, performing the necessary context id and allocation type
846 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
847 /// a simplified version of it as we always move the given edge and all of its
848 /// context ids.
849 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
850 ContextNode *NewCaller);
851
852 /// Recursive helper for marking backedges via DFS.
853 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
854 DenseSet<const ContextNode *> &CurrentStack);
855
856 /// Recursive helper for merging clones.
857 void
858 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
859 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
860 /// Main worker for merging callee clones for a given node.
861 void mergeNodeCalleeClones(
862 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
863 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
864 /// Helper to find other callers of the given set of callee edges that can
865 /// share the same callee merge node.
866 void findOtherCallersToShareMerge(
867 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
868 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
869 DenseSet<ContextNode *> &OtherCallersToShareMerge);
870
871 /// Recursively perform cloning on the graph for the given Node and its
872 /// callers, in order to uniquely identify the allocation behavior of an
873 /// allocation given its context. The context ids of the allocation being
874 /// processed are given in AllocContextIds.
875 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
876 const DenseSet<uint32_t> &AllocContextIds);
877
878 /// Map from each context ID to the AllocationType assigned to that context.
879 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
880
881 /// Map from each contextID to the profiled full contexts and their total
882 /// sizes (there may be more than one due to context trimming),
883 /// optionally populated when requested (via MemProfReportHintedSizes or
884 /// MinClonedColdBytePercent).
885 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
886
887 /// Identifies the context node created for a stack id when adding the MIB
888 /// contexts to the graph. This is used to locate the context nodes when
889 /// trying to assign the corresponding callsites with those stack ids to these
890 /// nodes.
891 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
892
893 /// Saves information for the contexts identified as important (the largest
894 /// cold contexts up to MemProfTopNImportant).
  /// Saves information for the contexts identified as important (the largest
  /// cold contexts up to MemProfTopNImportant).
  struct ImportantContextInfo {
    // The original list of leaf first stack ids corresponding to this context.
    std::vector<uint64_t> StackIds;
    // Max length of stack ids corresponding to a single stack ContextNode for
    // this context (i.e. the max length of a key in StackIdsToNode below).
    // Kept up to date by recordStackNode so later analysis can size its
    // sliding windows without rescanning the map.
    unsigned MaxLength = 0;
    // Mapping of slices of the stack ids to the corresponding ContextNode
    // (there can be multiple stack ids due to inlining). Populated when
    // updating stack nodes while matching them to the IR or summary.
    // std::map (ordered) so iteration over slices is deterministic.
    std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
  };
906
907 // Map of important full context ids to information about each.
908 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
909
910 // For each important context id found in Node (if any), records the list of
911 // stack ids that corresponded to the given callsite Node. There can be more
912 // than one in the case of inlining.
913 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
914 // We pass in the Node's context ids to avoid the
915 // overhead of computing them as the caller already has
916 // them in some cases.
917 const DenseSet<uint32_t> &NodeContextIds,
918 const DenseSet<uint32_t> &ImportantContextIds) {
919 if (!MemProfTopNImportant) {
920 assert(ImportantContextIds.empty());
921 return;
922 }
923 DenseSet<uint32_t> Ids =
924 set_intersection(S1: NodeContextIds, S2: ImportantContextIds);
925 if (Ids.empty())
926 return;
927 auto Size = StackIds.size();
928 for (auto Id : Ids) {
929 auto &Entry = ImportantContextIdInfo[Id];
930 Entry.StackIdsToNode[StackIds] = Node;
931 // Keep track of the max to simplify later analysis.
932 if (Size > Entry.MaxLength)
933 Entry.MaxLength = Size;
934 }
935 }
936
937 /// Maps to track the calls to their corresponding nodes in the graph.
938 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
939 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
940
941 /// Owner of all ContextNode unique_ptrs.
942 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
943
944 /// Perform sanity checks on graph when requested.
945 void check() const;
946
947 /// Keeps track of the last unique context id assigned.
948 unsigned int LastContextId = 0;
949};
950
/// Convenience alias templates so free functions below can name the nested
/// graph types without spelling out the full CallsiteContextGraph
/// instantiation each time.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
963
964/// CRTP derived class for graphs built from IR (regular LTO).
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  // The CRTP base invokes the private implementations below.
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  // Implementations of the CRTP interface declared in the base class; see
  // the corresponding base-class member documentation for contracts.
  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           DenseMap<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  // The module this graph was built from (not owned).
  const Module &Mod;
  // Callback used to obtain an ORE for emitting remarks on a given function.
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
1004
1005/// Represents a call in the summary index graph, which can either be an
1006/// allocation or an interior callsite node in an allocation's context.
1007/// Holds a pointer to the corresponding data structure in the index.
1008struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1009 IndexCall() : PointerUnion() {}
1010 IndexCall(std::nullptr_t) : IndexCall() {}
1011 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1012 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1013 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1014
1015 IndexCall *operator->() { return this; }
1016
1017 void print(raw_ostream &OS) const {
1018 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1019 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Val&: Base)) {
1020 OS << *AI;
1021 } else {
1022 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Val&: Base);
1023 assert(CI);
1024 OS << *CI;
1025 }
1026 }
1027};
1028} // namespace
1029
namespace llvm {
// Teach the llvm::cast/dyn_cast machinery to see through IndexCall to its
// underlying PointerUnion representation, so casts on IndexCall values work
// as if they were performed on the PointerUnion directly.
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm
1040
1041namespace {
1042/// CRTP derived class for graphs built from summary index (ThinLTO).
1043class IndexCallsiteContextGraph
1044 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1045 IndexCall> {
1046public:
1047 IndexCallsiteContextGraph(
1048 ModuleSummaryIndex &Index,
1049 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1050 isPrevailing);
1051
1052 ~IndexCallsiteContextGraph() {
1053 // Now that we are done with the graph it is safe to add the new
1054 // CallsiteInfo structs to the function summary vectors. The graph nodes
1055 // point into locations within these vectors, so we don't want to add them
1056 // any earlier.
1057 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1058 auto *FS = I.first;
1059 for (auto &Callsite : I.second)
1060 FS->addCallsite(Callsite: std::move(*Callsite.second));
1061 }
1062 }
1063
1064private:
1065 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1066 IndexCall>;
1067
1068 uint64_t getStackId(uint64_t IdOrIndex) const;
1069 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1070 bool calleeMatchesFunc(
1071 IndexCall &Call, const FunctionSummary *Func,
1072 const FunctionSummary *CallerFunc,
1073 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1074 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1075 bool findProfiledCalleeThroughTailCalls(
1076 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1077 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1078 bool &FoundMultipleCalleeChains);
1079 uint64_t getLastStackId(IndexCall &Call);
1080 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1081 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1082 AllocationType getAllocationCallType(const CallInfo &Call) const;
1083 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1084 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1085 IndexCall>::FuncInfo
1086 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1087 DenseMap<CallInfo, CallInfo> &CallMap,
1088 std::vector<CallInfo> &CallsWithMetadataInFunc,
1089 unsigned CloneNo);
1090 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1091 unsigned CloneNo) const;
1092 DenseSet<GlobalValue::GUID> findAliaseeGUIDsPrevailingInDifferentModule();
1093
1094 // Saves mapping from function summaries containing memprof records back to
1095 // its VI, for use in checking and debugging.
1096 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1097
1098 const ModuleSummaryIndex &Index;
1099 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1100 isPrevailing;
1101
1102 // Saves/owns the callsite info structures synthesized for missing tail call
1103 // frames that we discover while building the graph.
1104 // It maps from the summary of the function making the tail call, to a map
1105 // of callee ValueInfo to corresponding synthesized callsite info.
1106 std::unordered_map<FunctionSummary *,
1107 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1108 FunctionCalleesToSynthesizedCallsiteInfos;
1109};
1110} // namespace
1111
// DenseMapInfo specializations so CallInfo (represented as a pair of a call
// and a clone number) and IndexCall (a PointerUnion wrapper) can be used as
// DenseMap/DenseSet keys; each simply delegates to the DenseMapInfo of its
// underlying representation.
template <>
struct llvm::DenseMapInfo<CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct llvm::DenseMapInfo<CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct llvm::DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1123
1124namespace {
1125
1126// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1127// type we should actually use on the corresponding allocation.
1128// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1129// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1130// from NotCold.
1131AllocationType allocTypeToUse(uint8_t AllocTypes) {
1132 assert(AllocTypes != (uint8_t)AllocationType::None);
1133 if (AllocTypes ==
1134 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1135 return AllocationType::NotCold;
1136 else
1137 return (AllocationType)AllocTypes;
1138}
1139
1140// Helper to check if the alloc types for all edges recorded in the
1141// InAllocTypes vector match the alloc types for all edges in the Edges
1142// vector.
1143template <typename DerivedCCG, typename FuncTy, typename CallTy>
1144bool allocTypesMatch(
1145 const std::vector<uint8_t> &InAllocTypes,
1146 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1147 &Edges) {
1148 // This should be called only when the InAllocTypes vector was computed for
1149 // this set of Edges. Make sure the sizes are the same.
1150 assert(InAllocTypes.size() == Edges.size());
1151 return std::equal(
1152 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1153 [](const uint8_t &l,
1154 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1155 // Can share if one of the edges is None type - don't
1156 // care about the type along that edge as it doesn't
1157 // exist for those context ids.
1158 if (l == (uint8_t)AllocationType::None ||
1159 r->AllocTypes == (uint8_t)AllocationType::None)
1160 return true;
1161 return allocTypeToUse(AllocTypes: l) == allocTypeToUse(r->AllocTypes);
1162 });
1163}
1164
1165// Helper to check if the alloc types for all edges recorded in the
1166// InAllocTypes vector match the alloc types for callee edges in the given
1167// clone. Because the InAllocTypes were computed from the original node's callee
1168// edges, and other cloning could have happened after this clone was created, we
1169// need to find the matching clone callee edge, which may or may not exist.
1170template <typename DerivedCCG, typename FuncTy, typename CallTy>
1171bool allocTypesMatchClone(
1172 const std::vector<uint8_t> &InAllocTypes,
1173 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1174 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1175 assert(Node);
1176 // InAllocTypes should have been computed for the original node's callee
1177 // edges.
1178 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1179 // First create a map of the clone callee edge callees to the edge alloc type.
1180 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1181 EdgeCalleeMap;
1182 for (const auto &E : Clone->CalleeEdges) {
1183 assert(!EdgeCalleeMap.contains(E->Callee));
1184 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1185 }
1186 // Next, walk the original node's callees, and look for the corresponding
1187 // clone edge to that callee.
1188 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1189 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1190 // Not found is ok, we will simply add an edge if we use this clone.
1191 if (Iter == EdgeCalleeMap.end())
1192 continue;
1193 // Can share if one of the edges is None type - don't
1194 // care about the type along that edge as it doesn't
1195 // exist for those context ids.
1196 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1197 Iter->second == (uint8_t)AllocationType::None)
1198 continue;
1199 if (allocTypeToUse(Iter->second) != allocTypeToUse(AllocTypes: InAllocTypes[I]))
1200 return false;
1201 }
1202 return true;
1203}
1204
1205} // end anonymous namespace
1206
1207template <typename DerivedCCG, typename FuncTy, typename CallTy>
1208typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1209CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1210 const CallInfo &C) {
1211 ContextNode *Node = getNodeForAlloc(C);
1212 if (Node)
1213 return Node;
1214
1215 return NonAllocationCallToContextNodeMap.lookup(C);
1216}
1217
1218template <typename DerivedCCG, typename FuncTy, typename CallTy>
1219typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1220CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1221 const CallInfo &C) {
1222 return AllocationCallToContextNodeMap.lookup(C);
1223}
1224
1225template <typename DerivedCCG, typename FuncTy, typename CallTy>
1226typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1227CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1228 uint64_t StackId) {
1229 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1230 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1231 return StackEntryNode->second;
1232 return nullptr;
1233}
1234
1235template <typename DerivedCCG, typename FuncTy, typename CallTy>
1236void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1237 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1238 unsigned int ContextId) {
1239 for (auto &Edge : CallerEdges) {
1240 if (Edge->Caller == Caller) {
1241 Edge->AllocTypes |= (uint8_t)AllocType;
1242 Edge->getContextIds().insert(ContextId);
1243 return;
1244 }
1245 }
1246 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1247 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1248 CallerEdges.push_back(Edge);
1249 Caller->CalleeEdges.push_back(Edge);
1250}
1251
/// Remove Edge from the graph, unhooking it from both its Caller and Callee.
/// If EI is provided, the erase from that side's edge list goes through the
/// iterator (so the caller's loop iterator stays valid); CalleeIter selects
/// whether EI iterates the Caller's CalleeEdges (true) or the Callee's
/// CallerEdges (false).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  // Snapshot the edge-list sizes so the asserts below can verify that exactly
  // one entry was removed from each side (these are compiled away with the
  // asserts in NDEBUG builds).
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    // EI walks Caller->CalleeEdges: erase via the iterator there and advance
    // it; the other side is erased by pointer search.
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    // EI walks Callee->CallerEdges: mirror image of the case above.
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
1286
1287template <typename DerivedCCG, typename FuncTy, typename CallTy>
1288void CallsiteContextGraph<
1289 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1290 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1291 auto Edge = *EI;
1292 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1293 assert(Edge->ContextIds.empty());
1294 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);
1295 } else
1296 ++EI;
1297 }
1298}
1299
1300template <typename DerivedCCG, typename FuncTy, typename CallTy>
1301void CallsiteContextGraph<
1302 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1303 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1304 auto Edge = *EI;
1305 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1306 assert(Edge->ContextIds.empty());
1307 Edge->Caller->eraseCalleeEdge(Edge.get());
1308 EI = Node->CallerEdges.erase(EI);
1309 } else
1310 ++EI;
1311 }
1312}
1313
1314template <typename DerivedCCG, typename FuncTy, typename CallTy>
1315typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1316CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1317 findEdgeFromCallee(const ContextNode *Callee) {
1318 for (const auto &Edge : CalleeEdges)
1319 if (Edge->Callee == Callee)
1320 return Edge.get();
1321 return nullptr;
1322}
1323
1324template <typename DerivedCCG, typename FuncTy, typename CallTy>
1325typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1326CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1327 findEdgeFromCaller(const ContextNode *Caller) {
1328 for (const auto &Edge : CallerEdges)
1329 if (Edge->Caller == Caller)
1330 return Edge.get();
1331 return nullptr;
1332}
1333
1334template <typename DerivedCCG, typename FuncTy, typename CallTy>
1335void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1336 eraseCalleeEdge(const ContextEdge *Edge) {
1337 auto EI = llvm::find_if(
1338 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1339 return CalleeEdge.get() == Edge;
1340 });
1341 assert(EI != CalleeEdges.end());
1342 CalleeEdges.erase(EI);
1343}
1344
1345template <typename DerivedCCG, typename FuncTy, typename CallTy>
1346void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1347 eraseCallerEdge(const ContextEdge *Edge) {
1348 auto EI = llvm::find_if(
1349 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1350 return CallerEdge.get() == Edge;
1351 });
1352 assert(EI != CallerEdges.end());
1353 CallerEdges.erase(EI);
1354}
1355
1356template <typename DerivedCCG, typename FuncTy, typename CallTy>
1357uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1358 DenseSet<uint32_t> &ContextIds) const {
1359 uint8_t BothTypes =
1360 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1361 uint8_t AllocType = (uint8_t)AllocationType::None;
1362 for (auto Id : ContextIds) {
1363 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1364 // Bail early if alloc type reached both, no further refinement.
1365 if (AllocType == BothTypes)
1366 return AllocType;
1367 }
1368 return AllocType;
1369}
1370
1371template <typename DerivedCCG, typename FuncTy, typename CallTy>
1372uint8_t
1373CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1374 const DenseSet<uint32_t> &Node1Ids,
1375 const DenseSet<uint32_t> &Node2Ids) const {
1376 uint8_t BothTypes =
1377 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1378 uint8_t AllocType = (uint8_t)AllocationType::None;
1379 for (auto Id : Node1Ids) {
1380 if (!Node2Ids.count(V: Id))
1381 continue;
1382 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1383 // Bail early if alloc type reached both, no further refinement.
1384 if (AllocType == BothTypes)
1385 return AllocType;
1386 }
1387 return AllocType;
1388}
1389
1390template <typename DerivedCCG, typename FuncTy, typename CallTy>
1391uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1392 const DenseSet<uint32_t> &Node1Ids,
1393 const DenseSet<uint32_t> &Node2Ids) const {
1394 if (Node1Ids.size() < Node2Ids.size())
1395 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1396 else
1397 return intersectAllocTypesImpl(Node1Ids: Node2Ids, Node2Ids: Node1Ids);
1398}
1399
1400template <typename DerivedCCG, typename FuncTy, typename CallTy>
1401typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1402CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1403 CallInfo Call, const FuncTy *F) {
1404 assert(!getNodeForAlloc(Call));
1405 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, C: Call);
1406 AllocationCallToContextNodeMap[Call] = AllocNode;
1407 // Use LastContextId as a uniq id for MIB allocation nodes.
1408 AllocNode->OrigStackOrAllocId = LastContextId;
1409 // Alloc type should be updated as we add in the MIBs. We should assert
1410 // afterwards that it is not still None.
1411 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1412
1413 return AllocNode;
1414}
1415
1416static std::string getAllocTypeString(uint8_t AllocTypes) {
1417 if (!AllocTypes)
1418 return "None";
1419 std::string Str;
1420 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1421 Str += "NotCold";
1422 if (AllocTypes & (uint8_t)AllocationType::Cold)
1423 Str += "Cold";
1424 return Str;
1425}
1426
1427template <typename DerivedCCG, typename FuncTy, typename CallTy>
1428template <class NodeT, class IteratorT>
1429void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1430 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1431 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1432 ArrayRef<ContextTotalSize> ContextSizeInfo,
1433 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1434 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1435 // is done.
1436 if (AllocType == AllocationType::Hot)
1437 AllocType = AllocationType::NotCold;
1438
1439 ContextIdToAllocationType[++LastContextId] = AllocType;
1440
1441 bool IsImportant = false;
1442 if (!ContextSizeInfo.empty()) {
1443 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1444 // If this is a cold allocation, and we are collecting non-zero largest
1445 // contexts, see if this is a candidate.
1446 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1447 uint64_t TotalCold = 0;
1448 for (auto &CSI : ContextSizeInfo)
1449 TotalCold += CSI.TotalSize;
1450 // Record this context if either we haven't found the first top-n largest
1451 // yet, or if it is larger than the smallest already recorded.
1452 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1453 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1454 // sorted in ascending size of its key which is the size.
1455 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1456 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1457 // Remove old one and its associated entries.
1458 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1459 TotalSizeToContextIdTopNCold.erase(
1460 position: TotalSizeToContextIdTopNCold.begin());
1461 assert(ImportantContextIdInfo.count(IdToRemove));
1462 ImportantContextIdInfo.erase(IdToRemove);
1463 }
1464 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1465 IsImportant = true;
1466 }
1467 }
1468 Entry.insert(position: Entry.begin(), first: ContextSizeInfo.begin(), last: ContextSizeInfo.end());
1469 }
1470
1471 // Update alloc type and context ids for this MIB.
1472 AllocNode->AllocTypes |= (uint8_t)AllocType;
1473
1474 // Now add or update nodes for each stack id in alloc's context.
1475 // Later when processing the stack ids on non-alloc callsites we will adjust
1476 // for any inlining in the context.
1477 ContextNode *PrevNode = AllocNode;
1478 // Look for recursion (direct recursion should have been collapsed by
1479 // module summary analysis, here we should just be detecting mutual
1480 // recursion). Mark these nodes so we don't try to clone.
1481 SmallSet<uint64_t, 8> StackIdSet;
1482 // Skip any on the allocation call (inlining).
1483 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1484 ContextIter != StackContext.end(); ++ContextIter) {
1485 auto StackId = getStackId(IdOrIndex: *ContextIter);
1486 if (IsImportant)
1487 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1488 ContextNode *StackNode = getNodeForStackId(StackId);
1489 if (!StackNode) {
1490 StackNode = createNewNode(/*IsAllocation=*/false);
1491 StackEntryIdToContextNodeMap[StackId] = StackNode;
1492 StackNode->OrigStackOrAllocId = StackId;
1493 }
1494 // Marking a node recursive will prevent its cloning completely, even for
1495 // non-recursive contexts flowing through it.
1496 if (!AllowRecursiveCallsites) {
1497 auto Ins = StackIdSet.insert(StackId);
1498 if (!Ins.second)
1499 StackNode->Recursive = true;
1500 }
1501 StackNode->AllocTypes |= (uint8_t)AllocType;
1502 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1503 PrevNode = StackNode;
1504 }
1505}
1506
1507template <typename DerivedCCG, typename FuncTy, typename CallTy>
1508DenseSet<uint32_t>
1509CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1510 const DenseSet<uint32_t> &StackSequenceContextIds,
1511 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1512 DenseSet<uint32_t> NewContextIds;
1513 for (auto OldId : StackSequenceContextIds) {
1514 NewContextIds.insert(V: ++LastContextId);
1515 OldToNewContextIds[OldId].insert(V: LastContextId);
1516 assert(ContextIdToAllocationType.count(OldId));
1517 // The new context has the same allocation type and size info as original.
1518 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1519 auto CSI = ContextIdToContextSizeInfos.find(Val: OldId);
1520 if (CSI != ContextIdToContextSizeInfos.end())
1521 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1522 if (DotAllocContextIds.contains(V: OldId))
1523 DotAllocContextIds.insert(V: LastContextId);
1524 }
1525 return NewContextIds;
1526}
1527
1528template <typename DerivedCCG, typename FuncTy, typename CallTy>
1529void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1530 propagateDuplicateContextIds(
1531 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1532 // Build a set of duplicated context ids corresponding to the input id set.
1533 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1534 DenseSet<uint32_t> NewIds;
1535 for (auto Id : ContextIds)
1536 if (auto NewId = OldToNewContextIds.find(Val: Id);
1537 NewId != OldToNewContextIds.end())
1538 NewIds.insert_range(R: NewId->second);
1539 return NewIds;
1540 };
1541
1542 // Recursively update context ids sets along caller edges.
1543 auto UpdateCallers = [&](ContextNode *Node,
1544 DenseSet<const ContextEdge *> &Visited,
1545 auto &&UpdateCallers) -> void {
1546 for (const auto &Edge : Node->CallerEdges) {
1547 auto Inserted = Visited.insert(Edge.get());
1548 if (!Inserted.second)
1549 continue;
1550 ContextNode *NextNode = Edge->Caller;
1551 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1552 // Only need to recursively iterate to NextNode via this caller edge if
1553 // it resulted in any added ids to NextNode.
1554 if (!NewIdsToAdd.empty()) {
1555 Edge->getContextIds().insert_range(NewIdsToAdd);
1556 UpdateCallers(NextNode, Visited, UpdateCallers);
1557 }
1558 }
1559 };
1560
1561 DenseSet<const ContextEdge *> Visited;
1562 for (auto &Entry : AllocationCallToContextNodeMap) {
1563 auto *Node = Entry.second;
1564 UpdateCallers(Node, Visited, UpdateCallers);
1565 }
1566}
1567
1568template <typename DerivedCCG, typename FuncTy, typename CallTy>
1569void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1570 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1571 // This must be passed by value to make a copy since it will be adjusted
1572 // as ids are moved.
1573 DenseSet<uint32_t> RemainingContextIds) {
1574 auto &OrigEdges =
1575 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1576 DenseSet<uint32_t> RecursiveContextIds;
1577 DenseSet<uint32_t> AllCallerContextIds;
1578 if (AllowRecursiveCallsites) {
1579 // Identify which context ids are recursive which is needed to properly
1580 // update the RemainingContextIds set. The relevant recursive context ids
1581 // are those that are in multiple edges.
1582 for (auto &CE : OrigEdges) {
1583 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
1584 for (auto Id : CE->getContextIds())
1585 if (!AllCallerContextIds.insert(Id).second)
1586 RecursiveContextIds.insert(Id);
1587 }
1588 }
1589 // Increment iterator in loop so that we can remove edges as needed.
1590 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1591 auto Edge = *EI;
1592 DenseSet<uint32_t> NewEdgeContextIds;
1593 DenseSet<uint32_t> NotFoundContextIds;
1594 // Remove any matching context ids from Edge, return set that were found and
1595 // removed, these are the new edge's context ids. Also update the remaining
1596 // (not found ids).
1597 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1598 NotFoundContextIds);
1599 // Update the remaining context ids set for the later edges. This is a
1600 // compile time optimization.
1601 if (RecursiveContextIds.empty()) {
1602 // No recursive ids, so all of the previously remaining context ids that
1603 // were not seen on this edge are the new remaining set.
1604 RemainingContextIds.swap(RHS&: NotFoundContextIds);
1605 } else {
1606 // Keep the recursive ids in the remaining set as we expect to see those
1607 // on another edge. We can remove the non-recursive remaining ids that
1608 // were seen on this edge, however. We already have the set of remaining
1609 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1610 // non-recursive and only remove those. Note that despite the higher
1611 // overhead of updating the remaining context ids set when recursion
1612 // handling is enabled, it was found to be at worst performance neutral
1613 // and in one case a clear win.
1614 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1615 set_difference(S1: NewEdgeContextIds, S2: RecursiveContextIds);
1616 set_subtract(S1&: RemainingContextIds, S2: NonRecursiveRemainingCurEdgeIds);
1617 }
1618 // If no matching context ids for this edge, skip it.
1619 if (NewEdgeContextIds.empty()) {
1620 ++EI;
1621 continue;
1622 }
1623 if (TowardsCallee) {
1624 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1625 auto NewEdge = std::make_shared<ContextEdge>(
1626 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1627 NewNode->CalleeEdges.push_back(NewEdge);
1628 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1629 } else {
1630 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1631 auto NewEdge = std::make_shared<ContextEdge>(
1632 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1633 NewNode->CallerEdges.push_back(NewEdge);
1634 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1635 }
1636 // Remove old edge if context ids empty.
1637 if (Edge->getContextIds().empty()) {
1638 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, CalleeIter: TowardsCallee);
1639 continue;
1640 }
1641 ++EI;
1642 }
1643}
1644
1645template <typename DerivedCCG, typename FuncTy, typename CallTy>
1646static void checkEdge(
1647 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1648 // Confirm that alloc type is not None and that we have at least one context
1649 // id.
1650 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1651 assert(!Edge->ContextIds.empty());
1652}
1653
// Verify the invariants of a single context graph node: its context ids must
// agree with the unions of its caller and callee edge context ids (modulo
// the documented recursion exceptions), and it must not have duplicate
// callee edges to the same node. If CheckEdges is set, each edge is also
// validated via checkEdge. Removed nodes are skipped. Only meaningful in
// assert-enabled builds.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    // Seed the union with the first caller edge's ids, then fold in the rest.
    DenseSet<uint32_t> CallerEdgeContextIds(
        Node->CallerEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer. If we are allowing recursive callsites and
    // contexts this will be violated for incompletely cloned recursive cycles,
    // so skip the checking in that case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, NodeContextIds));
  }
  if (Node->CalleeEdges.size()) {
    // Same union computation for callee edges; here the node's ids must match
    // the union exactly (contexts cannot start below the node).
    DenseSet<uint32_t> CalleeEdgeContextIds(
        Node->CalleeEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->getContextIds());
    }
    // If we are allowing recursive callsites and contexts this will be violated
    // for incompletely cloned recursive cycles, so skip the checking in that
    // case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CalleeEdgeContextIds);
  }
  // FIXME: Since this checking is only invoked under an option, we should
  // change the error checking from using assert to something that will trigger
  // an error on a release build.
#ifndef NDEBUG
  // Make sure we don't end up with duplicate edges between the same caller and
  // callee.
  DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
  for (const auto &E : Node->CalleeEdges)
    NodeSet.insert(E->Callee);
  assert(NodeSet.size() == Node->CalleeEdges.size());
#endif
}
1707
1708template <typename DerivedCCG, typename FuncTy, typename CallTy>
1709void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1710 assignStackNodesPostOrder(ContextNode *Node,
1711 DenseSet<const ContextNode *> &Visited,
1712 DenseMap<uint64_t, std::vector<CallContextInfo>>
1713 &StackIdToMatchingCalls,
1714 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1715 const DenseSet<uint32_t> &ImportantContextIds) {
1716 auto Inserted = Visited.insert(Node);
1717 if (!Inserted.second)
1718 return;
1719 // Post order traversal. Iterate over a copy since we may add nodes and
1720 // therefore new callers during the recursive call, invalidating any
1721 // iterator over the original edge vector. We don't need to process these
1722 // new nodes as they were already processed on creation.
1723 auto CallerEdges = Node->CallerEdges;
1724 for (auto &Edge : CallerEdges) {
1725 // Skip any that have been removed during the recursion.
1726 if (Edge->isRemoved()) {
1727 assert(!is_contained(Node->CallerEdges, Edge));
1728 continue;
1729 }
1730 assignStackNodesPostOrder(Node: Edge->Caller, Visited, StackIdToMatchingCalls,
1731 CallToMatchingCall, ImportantContextIds);
1732 }
1733
1734 // If this node's stack id is in the map, update the graph to contain new
1735 // nodes representing any inlining at interior callsites. Note we move the
1736 // associated context ids over to the new nodes.
1737
1738 // Ignore this node if it is for an allocation or we didn't record any
1739 // stack id lists ending at it.
1740 if (Node->IsAllocation ||
1741 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1742 return;
1743
1744 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1745 // Handle the simple case first. A single call with a single stack id.
1746 // In this case there is no need to create any new context nodes, simply
1747 // assign the context node for stack id to this Call.
1748 if (Calls.size() == 1) {
1749 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1750 if (Ids.size() == 1) {
1751 assert(SavedContextIds.empty());
1752 // It should be this Node
1753 assert(Node == getNodeForStackId(Ids[0]));
1754 if (Node->Recursive)
1755 return;
1756 Node->setCall(Call);
1757 NonAllocationCallToContextNodeMap[Call] = Node;
1758 NodeToCallingFunc[Node] = Func;
1759 recordStackNode(StackIds&: Ids, Node, NodeContextIds: Node->getContextIds(), ImportantContextIds);
1760 return;
1761 }
1762 }
1763
1764#ifndef NDEBUG
1765 // Find the node for the last stack id, which should be the same
1766 // across all calls recorded for this id, and is this node's id.
1767 uint64_t LastId = Node->OrigStackOrAllocId;
1768 ContextNode *LastNode = getNodeForStackId(LastId);
1769 // We should only have kept stack ids that had nodes.
1770 assert(LastNode);
1771 assert(LastNode == Node);
1772#else
1773 ContextNode *LastNode = Node;
1774#endif
1775
1776 // Compute the last node's context ids once, as it is shared by all calls in
1777 // this entry.
1778 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1779
1780 [[maybe_unused]] bool PrevIterCreatedNode = false;
1781 bool CreatedNode = false;
1782 for (unsigned I = 0; I < Calls.size();
1783 I++, PrevIterCreatedNode = CreatedNode) {
1784 CreatedNode = false;
1785 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1786 // Skip any for which we didn't assign any ids, these don't get a node in
1787 // the graph.
1788 if (SavedContextIds.empty()) {
1789 // If this call has a matching call (located in the same function and
1790 // having the same stack ids), simply add it to the context node created
1791 // for its matching call earlier. These can be treated the same through
1792 // cloning and get updated at the same time.
1793 if (!CallToMatchingCall.contains(Call))
1794 continue;
1795 auto MatchingCall = CallToMatchingCall[Call];
1796 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1797 // This should only happen if we had a prior iteration, and it didn't
1798 // create a node because of the below recomputation of context ids
1799 // finding none remaining and continuing early.
1800 assert(I > 0 && !PrevIterCreatedNode);
1801 continue;
1802 }
1803 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1804 Call);
1805 continue;
1806 }
1807
1808 assert(LastId == Ids.back());
1809
1810 // Recompute the context ids for this stack id sequence (the
1811 // intersection of the context ids of the corresponding nodes).
1812 // Start with the ids we saved in the map for this call, which could be
1813 // duplicated context ids. We have to recompute as we might have overlap
1814 // overlap between the saved context ids for different last nodes, and
1815 // removed them already during the post order traversal.
1816 set_intersect(SavedContextIds, LastNodeContextIds);
1817 ContextNode *PrevNode = LastNode;
1818 bool Skip = false;
1819 // Iterate backwards through the stack Ids, starting after the last Id
1820 // in the list, which was handled once outside for all Calls.
1821 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1822 auto Id = *IdIter;
1823 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1824 // We should only have kept stack ids that had nodes and weren't
1825 // recursive.
1826 assert(CurNode);
1827 assert(!CurNode->Recursive);
1828
1829 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1830 if (!Edge) {
1831 Skip = true;
1832 break;
1833 }
1834 PrevNode = CurNode;
1835
1836 // Update the context ids, which is the intersection of the ids along
1837 // all edges in the sequence.
1838 set_intersect(SavedContextIds, Edge->getContextIds());
1839
1840 // If we now have no context ids for clone, skip this call.
1841 if (SavedContextIds.empty()) {
1842 Skip = true;
1843 break;
1844 }
1845 }
1846 if (Skip)
1847 continue;
1848
1849 // Create new context node.
1850 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: Call);
1851 NonAllocationCallToContextNodeMap[Call] = NewNode;
1852 CreatedNode = true;
1853 NewNode->AllocTypes = computeAllocType(ContextIds&: SavedContextIds);
1854
1855 ContextNode *FirstNode = getNodeForStackId(StackId: Ids[0]);
1856 assert(FirstNode);
1857
1858 // Connect to callees of innermost stack frame in inlined call chain.
1859 // This updates context ids for FirstNode's callee's to reflect those
1860 // moved to NewNode.
1861 connectNewNode(NewNode, OrigNode: FirstNode, /*TowardsCallee=*/true, RemainingContextIds: SavedContextIds);
1862
1863 // Connect to callers of outermost stack frame in inlined call chain.
1864 // This updates context ids for FirstNode's caller's to reflect those
1865 // moved to NewNode.
1866 connectNewNode(NewNode, OrigNode: LastNode, /*TowardsCallee=*/false, RemainingContextIds: SavedContextIds);
1867
1868 // Now we need to remove context ids from edges/nodes between First and
1869 // Last Node.
1870 PrevNode = nullptr;
1871 for (auto Id : Ids) {
1872 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1873 // We should only have kept stack ids that had nodes.
1874 assert(CurNode);
1875
1876 // Remove the context ids moved to NewNode from CurNode, and the
1877 // edge from the prior node.
1878 if (PrevNode) {
1879 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1880 // If the sequence contained recursion, we might have already removed
1881 // some edges during the connectNewNode calls above.
1882 if (!PrevEdge) {
1883 PrevNode = CurNode;
1884 continue;
1885 }
1886 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1887 if (PrevEdge->getContextIds().empty())
1888 removeEdgeFromGraph(Edge: PrevEdge);
1889 }
1890 // Since we update the edges from leaf to tail, only look at the callee
1891 // edges. This isn't an alloc node, so if there are no callee edges, the
1892 // alloc type is None.
1893 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1894 ? (uint8_t)AllocationType::None
1895 : CurNode->computeAllocType();
1896 PrevNode = CurNode;
1897 }
1898
1899 recordStackNode(StackIds&: Ids, Node: NewNode, NodeContextIds: SavedContextIds, ImportantContextIds);
1900
1901 if (VerifyNodes) {
1902 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1903 for (auto Id : Ids) {
1904 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1905 // We should only have kept stack ids that had nodes.
1906 assert(CurNode);
1907 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1908 }
1909 }
1910 }
1911}
1912
1913template <typename DerivedCCG, typename FuncTy, typename CallTy>
1914void CallsiteContextGraph<DerivedCCG, FuncTy,
1915 CallTy>::fixupImportantContexts() {
1916 if (ImportantContextIdInfo.empty())
1917 return;
1918
1919 // Update statistics as we are done building this map at this point.
1920 NumImportantContextIds = ImportantContextIdInfo.size();
1921
1922 if (!MemProfFixupImportant)
1923 return;
1924
1925 if (ExportToDot)
1926 exportToDot(Label: "beforestackfixup");
1927
1928 // For each context we identified as important, walk through the saved context
1929 // stack ids in order from leaf upwards, and make sure all edges are correct.
1930 // These can be difficult to get right when updating the graph while mapping
1931 // nodes onto summary or IR, especially when there is recursion. In
1932 // particular, when we have created new nodes to reflect inlining, it is
1933 // sometimes impossible to know exactly how to update the edges in the face of
1934 // recursion, as we have lost the original ordering of the stack ids in the
1935 // contexts.
1936 // TODO: Consider only doing this if we detect the context has recursive
1937 // cycles.
1938 //
1939 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1940 // and let's say A was inlined into B, C, and D. The original graph will have
1941 // multiple recursive cycles through A. When we match the original context
1942 // nodes onto the IR or summary, we will merge {A B} into one context node,
1943 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1944 // above, we should end up with a non-cyclic set of edges like:
1945 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1946 // original ordering, we won't get the edges correct initially (it's
1947 // impossible without the original ordering). Here we do the fixup (add and
1948 // removing edges where necessary) for this context. In the
1949 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1950 // and map entries for {A B}, {A C}, {A D}, and {E}.
1951 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1952 if (Info.StackIdsToNode.empty())
1953 continue;
1954 bool Changed = false;
1955 ContextNode *PrevNode = nullptr;
1956 ContextNode *CurNode = nullptr;
1957 DenseSet<const ContextEdge *> VisitedEdges;
1958 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1959 // Try to identify what callsite ContextNode maps to which slice of the
1960 // context's ordered stack ids.
1961 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1962 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1963 // see if we recorded a context node for that sequence.
1964 auto Len = Info.MaxLength;
1965 auto LenToEnd = AllStackIds.size() - I;
1966 if (Len > LenToEnd)
1967 Len = LenToEnd;
1968 CurNode = nullptr;
1969 // Try to find a recorded context node starting with the longest length
1970 // recorded, and on down until we check for just a single stack node.
1971 for (; Len > 0; Len--) {
1972 // Get the slice of the original stack id sequence to check.
1973 auto CheckStackIds = AllStackIds.slice(I, Len);
1974 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1975 if (EntryIt == Info.StackIdsToNode.end())
1976 continue;
1977 CurNode = EntryIt->second;
1978 // Skip forward so we don't try to look for the ones we just matched.
1979 // We increment by Len - 1, because the outer for loop will increment I.
1980 I += Len - 1;
1981 break;
1982 }
1983 // Give up if we couldn't find a node. Since we need to clone from the
1984 // leaf allocation upwards, no sense in doing anymore fixup further up
1985 // the context if we couldn't match part of the original stack context
1986 // onto a callsite node.
1987 if (!CurNode)
1988 break;
1989 // No edges to fix up until we have a pair of nodes that should be
1990 // adjacent in the graph.
1991 if (!PrevNode)
1992 continue;
1993 // See if we already have a call edge from CurNode to PrevNode.
1994 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1995 if (CurEdge) {
1996 // We already have an edge. Make sure it contains this context id.
1997 if (CurEdge->getContextIds().insert(CurContextId).second) {
1998 NumFixupEdgeIdsInserted++;
1999 Changed = true;
2000 }
2001 } else {
2002 // No edge exists - add one.
2003 NumFixupEdgesAdded++;
2004 DenseSet<uint32_t> ContextIds({CurContextId});
2005 auto AllocType = computeAllocType(ContextIds);
2006 auto NewEdge = std::make_shared<ContextEdge>(
2007 PrevNode, CurNode, AllocType, std::move(ContextIds));
2008 PrevNode->CallerEdges.push_back(NewEdge);
2009 CurNode->CalleeEdges.push_back(NewEdge);
2010 // Save the new edge for the below handling.
2011 CurEdge = NewEdge.get();
2012 Changed = true;
2013 }
2014 VisitedEdges.insert(CurEdge);
2015 // Now remove this context id from any other caller edges calling
2016 // PrevNode.
2017 for (auto &Edge : PrevNode->CallerEdges) {
2018 // Skip the edge updating/created above and edges we have already
2019 // visited (due to recursion).
2020 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2021 Edge->getContextIds().erase(CurContextId);
2022 }
2023 }
2024 if (Changed)
2025 NumFixedContexts++;
2026 }
2027}
2028
2029template <typename DerivedCCG, typename FuncTy, typename CallTy>
2030void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2031 // Map of stack id to all calls with that as the last (outermost caller)
2032 // callsite id that has a context node (some might not due to pruning
2033 // performed during matching of the allocation profile contexts).
2034 // The CallContextInfo contains the Call and a list of its stack ids with
2035 // ContextNodes, the function containing Call, and the set of context ids
2036 // the analysis will eventually identify for use in any new node created
2037 // for that callsite.
2038 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2039 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2040 for (auto &Call : CallsWithMetadata) {
2041 // Ignore allocations, already handled.
2042 if (AllocationCallToContextNodeMap.count(Call))
2043 continue;
2044 auto StackIdsWithContextNodes =
2045 getStackIdsWithContextNodesForCall(Call: Call.call());
2046 // If there were no nodes created for MIBs on allocs (maybe this was in
2047 // the unambiguous part of the MIB stack that was pruned), ignore.
2048 if (StackIdsWithContextNodes.empty())
2049 continue;
2050 // Otherwise, record this Call along with the list of ids for the last
2051 // (outermost caller) stack id with a node.
2052 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2053 {Call.call(), StackIdsWithContextNodes, Func, {}});
2054 }
2055 }
2056
2057 // First make a pass through all stack ids that correspond to a call,
2058 // as identified in the above loop. Compute the context ids corresponding to
2059 // each of these calls when they correspond to multiple stack ids due to
2060 // due to inlining. Perform any duplication of context ids required when
2061 // there is more than one call with the same stack ids. Their (possibly newly
2062 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2063 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2064 // Save a map from each call to any that are found to match it. I.e. located
2065 // in the same function and have the same (possibly pruned) stack ids. We use
2066 // this to avoid creating extra graph nodes as they can be treated the same.
2067 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2068 for (auto &It : StackIdToMatchingCalls) {
2069 auto &Calls = It.getSecond();
2070 // Skip single calls with a single stack id. These don't need a new node.
2071 if (Calls.size() == 1) {
2072 auto &Ids = Calls[0].StackIds;
2073 if (Ids.size() == 1)
2074 continue;
2075 }
2076 // In order to do the best and maximal matching of inlined calls to context
2077 // node sequences we will sort the vectors of stack ids in descending order
2078 // of length, and within each length, lexicographically by stack id. The
2079 // latter is so that we can specially handle calls that have identical stack
2080 // id sequences (either due to cloning or artificially because of the MIB
2081 // context pruning). Those with the same Ids are then sorted by function to
2082 // facilitate efficiently mapping them to the same context node.
2083 // Because the functions are pointers, to ensure a stable sort first assign
2084 // each function pointer to its first index in the Calls array, and then use
2085 // that to sort by.
2086 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2087 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2088 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2089 llvm::stable_sort(
2090 Calls,
2091 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2092 return A.StackIds.size() > B.StackIds.size() ||
2093 (A.StackIds.size() == B.StackIds.size() &&
2094 (A.StackIds < B.StackIds ||
2095 (A.StackIds == B.StackIds &&
2096 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2097 });
2098
2099 // Find the node for the last stack id, which should be the same
2100 // across all calls recorded for this id, and is the id for this
2101 // entry in the StackIdToMatchingCalls map.
2102 uint64_t LastId = It.getFirst();
2103 ContextNode *LastNode = getNodeForStackId(StackId: LastId);
2104 // We should only have kept stack ids that had nodes.
2105 assert(LastNode);
2106
2107 if (LastNode->Recursive)
2108 continue;
2109
2110 // Initialize the context ids with the last node's. We will subsequently
2111 // refine the context ids by computing the intersection along all edges.
2112 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2113 assert(!LastNodeContextIds.empty());
2114
2115#ifndef NDEBUG
2116 // Save the set of functions seen for a particular set of the same stack
2117 // ids. This is used to ensure that they have been correctly sorted to be
2118 // adjacent in the Calls list, since we rely on that to efficiently place
2119 // all such matching calls onto the same context node.
2120 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2121#endif
2122
2123 for (unsigned I = 0; I < Calls.size(); I++) {
2124 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2125 assert(SavedContextIds.empty());
2126 assert(LastId == Ids.back());
2127
2128#ifndef NDEBUG
2129 // If this call has a different set of ids than the last one, clear the
2130 // set used to ensure they are sorted properly.
2131 if (I > 0 && Ids != Calls[I - 1].StackIds)
2132 MatchingIdsFuncSet.clear();
2133#endif
2134
2135 // First compute the context ids for this stack id sequence (the
2136 // intersection of the context ids of the corresponding nodes).
2137 // Start with the remaining saved ids for the last node.
2138 assert(!LastNodeContextIds.empty());
2139 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2140
2141 ContextNode *PrevNode = LastNode;
2142 ContextNode *CurNode = LastNode;
2143 bool Skip = false;
2144
2145 // Iterate backwards through the stack Ids, starting after the last Id
2146 // in the list, which was handled once outside for all Calls.
2147 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2148 auto Id = *IdIter;
2149 CurNode = getNodeForStackId(StackId: Id);
2150 // We should only have kept stack ids that had nodes.
2151 assert(CurNode);
2152
2153 if (CurNode->Recursive) {
2154 Skip = true;
2155 break;
2156 }
2157
2158 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2159 // If there is no edge then the nodes belong to different MIB contexts,
2160 // and we should skip this inlined context sequence. For example, this
2161 // particular inlined context may include stack ids A->B, and we may
2162 // indeed have nodes for both A and B, but it is possible that they were
2163 // never profiled in sequence in a single MIB for any allocation (i.e.
2164 // we might have profiled an allocation that involves the callsite A,
2165 // but through a different one of its callee callsites, and we might
2166 // have profiled an allocation that involves callsite B, but reached
2167 // from a different caller callsite).
2168 if (!Edge) {
2169 Skip = true;
2170 break;
2171 }
2172 PrevNode = CurNode;
2173
2174 // Update the context ids, which is the intersection of the ids along
2175 // all edges in the sequence.
2176 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2177
2178 // If we now have no context ids for clone, skip this call.
2179 if (StackSequenceContextIds.empty()) {
2180 Skip = true;
2181 break;
2182 }
2183 }
2184 if (Skip)
2185 continue;
2186
2187 // If some of this call's stack ids did not have corresponding nodes (due
2188 // to pruning), don't include any context ids for contexts that extend
2189 // beyond these nodes. Otherwise we would be matching part of unrelated /
2190 // not fully matching stack contexts. To do this, subtract any context ids
2191 // found in caller nodes of the last node found above.
2192 if (Ids.back() != getLastStackId(Call)) {
2193 for (const auto &PE : LastNode->CallerEdges) {
2194 set_subtract(StackSequenceContextIds, PE->getContextIds());
2195 if (StackSequenceContextIds.empty())
2196 break;
2197 }
2198 // If we now have no context ids for clone, skip this call.
2199 if (StackSequenceContextIds.empty())
2200 continue;
2201 }
2202
2203#ifndef NDEBUG
2204 // If the prior call had the same stack ids this set would not be empty.
2205 // Check if we already have a call that "matches" because it is located
2206 // in the same function. If the Calls list was sorted properly we should
2207 // not encounter this situation as all such entries should be adjacent
2208 // and processed in bulk further below.
2209 assert(!MatchingIdsFuncSet.contains(Func));
2210
2211 MatchingIdsFuncSet.insert(Func);
2212#endif
2213
2214 // Check if the next set of stack ids is the same (since the Calls vector
2215 // of tuples is sorted by the stack ids we can just look at the next one).
2216 // If so, save them in the CallToMatchingCall map so that they get
2217 // assigned to the same context node, and skip them.
2218 bool DuplicateContextIds = false;
2219 for (unsigned J = I + 1; J < Calls.size(); J++) {
2220 auto &CallCtxInfo = Calls[J];
2221 auto &NextIds = CallCtxInfo.StackIds;
2222 if (NextIds != Ids)
2223 break;
2224 auto *NextFunc = CallCtxInfo.Func;
2225 if (NextFunc != Func) {
2226 // We have another Call with the same ids but that cannot share this
2227 // node, must duplicate ids for it.
2228 DuplicateContextIds = true;
2229 break;
2230 }
2231 auto &NextCall = CallCtxInfo.Call;
2232 CallToMatchingCall[NextCall] = Call;
2233 // Update I so that it gets incremented correctly to skip this call.
2234 I = J;
2235 }
2236
2237 // If we don't have duplicate context ids, then we can assign all the
2238 // context ids computed for the original node sequence to this call.
2239 // If there are duplicate calls with the same stack ids then we synthesize
2240 // new context ids that are duplicates of the originals. These are
2241 // assigned to SavedContextIds, which is a reference into the map entry
2242 // for this call, allowing us to access these ids later on.
2243 OldToNewContextIds.reserve(NumEntries: OldToNewContextIds.size() +
2244 StackSequenceContextIds.size());
2245 SavedContextIds =
2246 DuplicateContextIds
2247 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2248 : StackSequenceContextIds;
2249 assert(!SavedContextIds.empty());
2250
2251 if (!DuplicateContextIds) {
2252 // Update saved last node's context ids to remove those that are
2253 // assigned to other calls, so that it is ready for the next call at
2254 // this stack id.
2255 set_subtract(S1&: LastNodeContextIds, S2: StackSequenceContextIds);
2256 if (LastNodeContextIds.empty())
2257 break;
2258 }
2259 }
2260 }
2261
2262 // Propagate the duplicate context ids over the graph.
2263 propagateDuplicateContextIds(OldToNewContextIds);
2264
2265 if (VerifyCCG)
2266 check();
2267
2268 // Now perform a post-order traversal over the graph, starting with the
2269 // allocation nodes, essentially processing nodes from callers to callees.
2270 // For any that contains an id in the map, update the graph to contain new
2271 // nodes representing any inlining at interior callsites. Note we move the
2272 // associated context ids over to the new nodes.
2273 DenseSet<const ContextNode *> Visited;
2274 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2275 ImportantContextIdInfo.keys());
2276 for (auto &Entry : AllocationCallToContextNodeMap)
2277 assignStackNodesPostOrder(Node: Entry.second, Visited, StackIdToMatchingCalls,
2278 CallToMatchingCall, ImportantContextIds);
2279
2280 fixupImportantContexts();
2281
2282 if (VerifyCCG)
2283 check();
2284}
2285
2286uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2287 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2288 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2289 return CallsiteContext.back();
2290}
2291
2292uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2293 assert(isa<CallsiteInfo *>(Call));
2294 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2295 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2296 // Need to convert index into stack id.
2297 return Index.getStackIdAtIndex(Index: CallsiteContext.back());
2298}
2299
// Suffix inserted into the names of function clones created by this pass;
// followed by the clone number (see getMemProfFuncName).
static const std::string MemProfCloneSuffix = ".memprof.";
2301
2302static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2303 // We use CloneNo == 0 to refer to the original version, which doesn't get
2304 // renamed with a suffix.
2305 if (!CloneNo)
2306 return Base.str();
2307 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2308}
2309
2310static bool isMemProfClone(const Function &F) {
2311 return F.getName().contains(Other: MemProfCloneSuffix);
2312}
2313
2314// Return the clone number of the given function by extracting it from the
2315// memprof suffix. Assumes the caller has already confirmed it is a memprof
2316// clone.
2317static unsigned getMemProfCloneNum(const Function &F) {
2318 assert(isMemProfClone(F));
2319 auto Pos = F.getName().find_last_of(C: '.');
2320 assert(Pos > 0);
2321 unsigned CloneNo;
2322 bool Err = F.getName().drop_front(N: Pos + 1).getAsInteger(Radix: 10, Result&: CloneNo);
2323 assert(!Err);
2324 (void)Err;
2325 return CloneNo;
2326}
2327
2328std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2329 const Instruction *Call,
2330 unsigned CloneNo) const {
2331 return (Twine(Call->getFunction()->getName()) + " -> " +
2332 cast<CallBase>(Val: Call)->getCalledFunction()->getName())
2333 .str();
2334}
2335
2336std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2337 const IndexCall &Call,
2338 unsigned CloneNo) const {
2339 auto VI = FSToVIMap.find(x: Func);
2340 assert(VI != FSToVIMap.end());
2341 std::string CallerName = getMemProfFuncName(Base: VI->second.name(), CloneNo);
2342 if (isa<AllocInfo *>(Val: Call))
2343 return CallerName + " -> alloc";
2344 else {
2345 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Val: Call);
2346 return CallerName + " -> " +
2347 getMemProfFuncName(Base: Callsite->Callee.name(),
2348 CloneNo: Callsite->Clones[CloneNo]);
2349 }
2350}
2351
2352std::vector<uint64_t>
2353ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2354 Instruction *Call) {
2355 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2356 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2357 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2358 CallsiteContext);
2359}
2360
2361std::vector<uint64_t>
2362IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2363 assert(isa<CallsiteInfo *>(Call));
2364 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2365 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2366 return getStackIdsWithContextNodes<CallsiteInfo,
2367 SmallVector<unsigned>::const_iterator>(
2368 CallsiteContext);
2369}
2370
2371template <typename DerivedCCG, typename FuncTy, typename CallTy>
2372template <class NodeT, class IteratorT>
2373std::vector<uint64_t>
2374CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2375 CallStack<NodeT, IteratorT> &CallsiteContext) {
2376 std::vector<uint64_t> StackIds;
2377 for (auto IdOrIndex : CallsiteContext) {
2378 auto StackId = getStackId(IdOrIndex);
2379 ContextNode *Node = getNodeForStackId(StackId);
2380 if (!Node)
2381 break;
2382 StackIds.push_back(StackId);
2383 }
2384 return StackIds;
2385}
2386
2387ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2388 Module &M,
2389 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2390 : Mod(M), OREGetter(OREGetter) {
2391 // Map for keeping track of the largest cold contexts up to the number given
2392 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2393 // must be sorted.
2394 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2395 for (auto &F : M) {
2396 std::vector<CallInfo> CallsWithMetadata;
2397 for (auto &BB : F) {
2398 for (auto &I : BB) {
2399 if (!isa<CallBase>(Val: I))
2400 continue;
2401 if (auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof)) {
2402 CallsWithMetadata.push_back(x: &I);
2403 auto *AllocNode = addAllocNode(Call: &I, F: &F);
2404 auto *CallsiteMD = I.getMetadata(KindID: LLVMContext::MD_callsite);
2405 assert(CallsiteMD);
2406 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2407 // Add all of the MIBs and their stack nodes.
2408 for (auto &MDOp : MemProfMD->operands()) {
2409 auto *MIBMD = cast<const MDNode>(Val: MDOp);
2410 std::vector<ContextTotalSize> ContextSizeInfo;
2411 // Collect the context size information if it exists.
2412 if (MIBMD->getNumOperands() > 2) {
2413 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2414 MDNode *ContextSizePair =
2415 dyn_cast<MDNode>(Val: MIBMD->getOperand(I));
2416 assert(ContextSizePair->getNumOperands() == 2);
2417 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2418 MD: ContextSizePair->getOperand(I: 0))
2419 ->getZExtValue();
2420 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2421 MD: ContextSizePair->getOperand(I: 1))
2422 ->getZExtValue();
2423 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2424 }
2425 }
2426 MDNode *StackNode = getMIBStackNode(MIB: MIBMD);
2427 assert(StackNode);
2428 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2429 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2430 AllocNode, StackContext, CallsiteContext,
2431 AllocType: getMIBAllocType(MIB: MIBMD), ContextSizeInfo,
2432 TotalSizeToContextIdTopNCold);
2433 }
2434 // If exporting the graph to dot and an allocation id of interest was
2435 // specified, record all the context ids for this allocation node.
2436 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2437 DotAllocContextIds = AllocNode->getContextIds();
2438 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2439 // Memprof and callsite metadata on memory allocations no longer
2440 // needed.
2441 I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
2442 I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2443 }
2444 // For callsite metadata, add to list for this function for later use.
2445 else if (I.getMetadata(KindID: LLVMContext::MD_callsite)) {
2446 CallsWithMetadata.push_back(x: &I);
2447 }
2448 }
2449 }
2450 if (!CallsWithMetadata.empty())
2451 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2452 }
2453
2454 if (DumpCCG) {
2455 dbgs() << "CCG before updating call stack chains:\n";
2456 dbgs() << *this;
2457 }
2458
2459 if (ExportToDot)
2460 exportToDot(Label: "prestackupdate");
2461
2462 updateStackNodes();
2463
2464 if (ExportToDot)
2465 exportToDot(Label: "poststackupdate");
2466
2467 handleCallsitesWithMultipleTargets();
2468
2469 markBackedges();
2470
2471 // Strip off remaining callsite metadata, no longer needed.
2472 for (auto &FuncEntry : FuncToCallsWithMetadata)
2473 for (auto &Call : FuncEntry.second)
2474 Call.call()->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2475}
2476
2477// Finds the set of GUIDs for weak aliasees that are prevailing in different
2478// modules than any of their aliases. We need to handle these specially.
2479DenseSet<GlobalValue::GUID>
2480IndexCallsiteContextGraph::findAliaseeGUIDsPrevailingInDifferentModule() {
2481 DenseSet<GlobalValue::GUID> AliaseeGUIDs;
2482 for (auto &I : Index) {
2483 auto VI = Index.getValueInfo(R: I);
2484 for (auto &S : VI.getSummaryList()) {
2485 // We only care about aliases to functions.
2486 auto *AS = dyn_cast<AliasSummary>(Val: S.get());
2487 if (!AS)
2488 continue;
2489 auto *AliaseeSummary = &AS->getAliasee();
2490 auto *AliaseeFS = dyn_cast<FunctionSummary>(Val: AliaseeSummary);
2491 if (!AliaseeFS)
2492 continue;
2493 // Skip this summary if it is not for the prevailing symbol for this GUID.
2494 // The linker doesn't resolve local linkage values so don't check whether
2495 // those are prevailing.
2496 if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
2497 !isPrevailing(VI.getGUID(), S.get()))
2498 continue;
2499 // Prevailing aliasee could be in a different module only if it is weak.
2500 if (!GlobalValue::isWeakForLinker(Linkage: AliaseeSummary->linkage()))
2501 continue;
2502 auto AliaseeGUID = AS->getAliaseeGUID();
2503 // If the aliasee copy in this module is not prevailing, record it.
2504 if (!isPrevailing(AliaseeGUID, AliaseeSummary))
2505 AliaseeGUIDs.insert(V: AliaseeGUID);
2506 }
2507 }
2508 AliaseesPrevailingInDiffModuleFromAlias += AliaseeGUIDs.size();
2509 return AliaseeGUIDs;
2510}
2511
2512IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2513 ModuleSummaryIndex &Index,
2514 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2515 isPrevailing)
2516 : Index(Index), isPrevailing(isPrevailing) {
2517 // Since we use the aliasee summary info to create the necessary clones for
2518 // its aliases, conservatively skip recording the aliasee function's callsites
2519 // in the CCG for any that are prevailing in a different module than one of
2520 // its aliases. We could record the necessary information to do this in the
2521 // summary, but this case should not be common.
2522 DenseSet<GlobalValue::GUID> GUIDsToSkip =
2523 findAliaseeGUIDsPrevailingInDifferentModule();
2524 // Map for keeping track of the largest cold contexts up to the number given
2525 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2526 // must be sorted.
2527 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2528 for (auto &I : Index) {
2529 auto VI = Index.getValueInfo(R: I);
2530 if (GUIDsToSkip.contains(V: VI.getGUID()))
2531 continue;
2532 for (auto &S : VI.getSummaryList()) {
2533 // We should only add the prevailing nodes. Otherwise we may try to clone
2534 // in a weak copy that won't be linked (and may be different than the
2535 // prevailing version).
2536 // We only keep the memprof summary on the prevailing copy now when
2537 // building the combined index, as a space optimization, however don't
2538 // rely on this optimization. The linker doesn't resolve local linkage
2539 // values so don't check whether those are prevailing.
2540 if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
2541 !isPrevailing(VI.getGUID(), S.get()))
2542 continue;
2543 auto *FS = dyn_cast<FunctionSummary>(Val: S.get());
2544 if (!FS)
2545 continue;
2546 std::vector<CallInfo> CallsWithMetadata;
2547 if (!FS->allocs().empty()) {
2548 for (auto &AN : FS->mutableAllocs()) {
2549 // This can happen because of recursion elimination handling that
2550 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2551 // We still added them to the summary because we need to be able to
2552 // correlate properly in applyImport in the backends.
2553 if (AN.MIBs.empty())
2554 continue;
2555 IndexCall AllocCall(&AN);
2556 CallsWithMetadata.push_back(x: AllocCall);
2557 auto *AllocNode = addAllocNode(Call: AllocCall, F: FS);
2558 // Pass an empty CallStack to the CallsiteContext (second)
2559 // parameter, since for ThinLTO we already collapsed out the inlined
2560 // stack ids on the allocation call during ModuleSummaryAnalysis.
2561 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2562 EmptyContext;
2563 unsigned I = 0;
2564 assert(!metadataMayIncludeContextSizeInfo() ||
2565 AN.ContextSizeInfos.size() == AN.MIBs.size());
2566 // Now add all of the MIBs and their stack nodes.
2567 for (auto &MIB : AN.MIBs) {
2568 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2569 StackContext(&MIB);
2570 std::vector<ContextTotalSize> ContextSizeInfo;
2571 if (!AN.ContextSizeInfos.empty()) {
2572 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2573 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2574 }
2575 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2576 AllocNode, StackContext, CallsiteContext&: EmptyContext, AllocType: MIB.AllocType,
2577 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2578 I++;
2579 }
2580 // If exporting the graph to dot and an allocation id of interest was
2581 // specified, record all the context ids for this allocation node.
2582 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2583 DotAllocContextIds = AllocNode->getContextIds();
2584 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2585 // Initialize version 0 on the summary alloc node to the current alloc
2586 // type, unless it has both types in which case make it default, so
2587 // that in the case where we aren't able to clone the original version
2588 // always ends up with the default allocation behavior.
2589 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocTypes: AllocNode->AllocTypes);
2590 }
2591 }
2592 // For callsite metadata, add to list for this function for later use.
2593 if (!FS->callsites().empty())
2594 for (auto &SN : FS->mutableCallsites()) {
2595 IndexCall StackNodeCall(&SN);
2596 CallsWithMetadata.push_back(x: StackNodeCall);
2597 }
2598
2599 if (!CallsWithMetadata.empty())
2600 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2601
2602 if (!FS->allocs().empty() || !FS->callsites().empty())
2603 FSToVIMap[FS] = VI;
2604 }
2605 }
2606
2607 if (DumpCCG) {
2608 dbgs() << "CCG before updating call stack chains:\n";
2609 dbgs() << *this;
2610 }
2611
2612 if (ExportToDot)
2613 exportToDot(Label: "prestackupdate");
2614
2615 updateStackNodes();
2616
2617 if (ExportToDot)
2618 exportToDot(Label: "poststackupdate");
2619
2620 handleCallsitesWithMultipleTargets();
2621
2622 markBackedges();
2623}
2624
2625template <typename DerivedCCG, typename FuncTy, typename CallTy>
2626void CallsiteContextGraph<DerivedCCG, FuncTy,
2627 CallTy>::handleCallsitesWithMultipleTargets() {
2628 // Look for and workaround callsites that call multiple functions.
2629 // This can happen for indirect calls, which needs better handling, and in
2630 // more rare cases (e.g. macro expansion).
2631 // TODO: To fix this for indirect calls we will want to perform speculative
2632 // devirtualization using either the normal PGO info with ICP, or using the
2633 // information in the profiled MemProf contexts. We can do this prior to
2634 // this transformation for regular LTO, and for ThinLTO we can simulate that
2635 // effect in the summary and perform the actual speculative devirtualization
2636 // while cloning in the ThinLTO backend.
2637
2638 // Keep track of the new nodes synthesized for discovered tail calls missing
2639 // from the profiled contexts.
2640 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2641
2642 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2643 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2644 auto *Node = Entry.second;
2645 assert(Node->Clones.empty());
2646 // Check all node callees and see if in the same function.
2647 // We need to check all of the calls recorded in this Node, because in some
2648 // cases we may have had multiple calls with the same debug info calling
2649 // different callees. This can happen, for example, when an object is
2650 // constructed in the paramter list - the destructor call of the object has
2651 // the same debug info (line/col) as the call the object was passed to.
2652 // Here we will prune any that don't match all callee nodes.
2653 std::vector<CallInfo> AllCalls;
2654 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2655 AllCalls.push_back(Node->Call);
2656 llvm::append_range(AllCalls, Node->MatchingCalls);
2657
2658 // First see if we can partition the calls by callee function, creating new
2659 // nodes to host each set of calls calling the same callees. This is
2660 // necessary for support indirect calls with ThinLTO, for which we
2661 // synthesized CallsiteInfo records for each target. They will all have the
2662 // same callsite stack ids and would be sharing a context node at this
2663 // point. We need to perform separate cloning for each, which will be
2664 // applied along with speculative devirtualization in the ThinLTO backends
2665 // as needed. Note this does not currently support looking through tail
2666 // calls, it is unclear if we need that for indirect call targets.
2667 // First partition calls by callee func. Map indexed by func, value is
2668 // struct with list of matching calls, assigned node.
2669 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2670 continue;
2671
2672 auto It = AllCalls.begin();
2673 // Iterate through the calls until we find the first that matches.
2674 for (; It != AllCalls.end(); ++It) {
2675 auto ThisCall = *It;
2676 bool Match = true;
2677 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2678 ++EI) {
2679 auto Edge = *EI;
2680 if (!Edge->Callee->hasCall())
2681 continue;
2682 assert(NodeToCallingFunc.count(Edge->Callee));
2683 // Check if the called function matches that of the callee node.
2684 if (!calleesMatch(Call: ThisCall.call(), EI, TailCallToContextNodeMap)) {
2685 Match = false;
2686 break;
2687 }
2688 }
2689 // Found a call that matches the callee nodes, we can quit now.
2690 if (Match) {
2691 // If the first match is not the primary call on the Node, update it
2692 // now. We will update the list of matching calls further below.
2693 if (Node->Call != ThisCall) {
2694 Node->setCall(ThisCall);
2695 // We need to update the NonAllocationCallToContextNodeMap, but don't
2696 // want to do this during iteration over that map, so save the calls
2697 // that need updated entries.
2698 NewCallToNode.push_back({ThisCall, Node});
2699 }
2700 break;
2701 }
2702 }
2703 // We will update this list below (or leave it cleared if there was no
2704 // match found above).
2705 Node->MatchingCalls.clear();
2706 // If we hit the end of the AllCalls vector, no call matching the callee
2707 // nodes was found, clear the call information in the node.
2708 if (It == AllCalls.end()) {
2709 RemovedEdgesWithMismatchedCallees++;
2710 // Work around by setting Node to have a null call, so it gets
2711 // skipped during cloning. Otherwise assignFunctions will assert
2712 // because its data structures are not designed to handle this case.
2713 Node->setCall(CallInfo());
2714 continue;
2715 }
2716 // Now add back any matching calls that call the same function as the
2717 // matching primary call on Node.
2718 for (++It; It != AllCalls.end(); ++It) {
2719 auto ThisCall = *It;
2720 if (!sameCallee(Call1: Node->Call.call(), Call2: ThisCall.call()))
2721 continue;
2722 Node->MatchingCalls.push_back(ThisCall);
2723 }
2724 }
2725
2726 // Remove all mismatched nodes identified in the above loop from the node map
2727 // (checking whether they have a null call which is set above). For a
2728 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2729 // to do the removal via remove_if than by individually erasing entries above.
2730 // Also remove any entries if we updated the node's primary call above.
2731 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2732 return !it.second->hasCall() || it.second->Call != it.first;
2733 });
2734
2735 // Add entries for any new primary calls recorded above.
2736 for (auto &[Call, Node] : NewCallToNode)
2737 NonAllocationCallToContextNodeMap[Call] = Node;
2738
2739 // Add the new nodes after the above loop so that the iteration is not
2740 // invalidated.
2741 for (auto &[Call, Node] : TailCallToContextNodeMap)
2742 NonAllocationCallToContextNodeMap[Call] = Node;
2743}
2744
2745template <typename DerivedCCG, typename FuncTy, typename CallTy>
2746bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2747 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2748 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2749 // Struct to keep track of all the calls having the same callee function,
2750 // and the node we eventually assign to them. Eventually we will record the
2751 // context node assigned to this group of calls.
2752 struct CallsWithSameCallee {
2753 std::vector<CallInfo> Calls;
2754 ContextNode *Node = nullptr;
2755 };
2756
2757 // First partition calls by callee function. Build map from each function
2758 // to the list of matching calls.
2759 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2760 for (auto ThisCall : AllCalls) {
2761 auto *F = getCalleeFunc(Call: ThisCall.call());
2762 if (F)
2763 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2764 }
2765
2766 // Next, walk through all callee edges. For each callee node, get its
2767 // containing function and see if it was recorded in the above map (meaning we
2768 // have at least one matching call). Build another map from each callee node
2769 // with a matching call to the structure instance created above containing all
2770 // the calls.
2771 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2772 for (const auto &Edge : Node->CalleeEdges) {
2773 if (!Edge->Callee->hasCall())
2774 continue;
2775 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2776 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2777 CalleeNodeToCallInfo[Edge->Callee] =
2778 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2779 }
2780
2781 // If there are entries in the second map, then there were no matching
2782 // calls/callees, nothing to do here. Return so we can go to the handling that
2783 // looks through tail calls.
2784 if (CalleeNodeToCallInfo.empty())
2785 return false;
2786
2787 // Walk through all callee edges again. Any and all callee edges that didn't
2788 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2789 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2790 // ignored during cloning. If it is in the map, then we use the node recorded
2791 // in that entry (creating it if needed), and move the callee edge to it.
2792 // The first callee will use the original node instead of creating a new one.
2793 // Note that any of the original calls on this node (in AllCalls) that didn't
2794 // have a callee function automatically get dropped from the node as part of
2795 // this process.
2796 ContextNode *UnmatchedCalleesNode = nullptr;
2797 // Track whether we already assigned original node to a callee.
2798 bool UsedOrigNode = false;
2799 assert(NodeToCallingFunc[Node]);
2800 // Iterate over a copy of Node's callee edges, since we may need to remove
2801 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2802 // makes it less error-prone.
2803 auto CalleeEdges = Node->CalleeEdges;
2804 for (auto &Edge : CalleeEdges) {
2805 if (!Edge->Callee->hasCall())
2806 continue;
2807
2808 // Will be updated below to point to whatever (caller) node this callee edge
2809 // should be moved to.
2810 ContextNode *CallerNodeToUse = nullptr;
2811
2812 // Handle the case where there were no matching calls first. Move this
2813 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2814 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2815 if (!UnmatchedCalleesNode)
2816 UnmatchedCalleesNode =
2817 createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
2818 CallerNodeToUse = UnmatchedCalleesNode;
2819 } else {
2820 // Look up the information recorded for this callee node, and use the
2821 // recorded caller node (creating it if needed).
2822 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2823 if (!Info->Node) {
2824 // If we haven't assigned any callees to the original node use it.
2825 if (!UsedOrigNode) {
2826 Info->Node = Node;
2827 // Clear the set of matching calls which will be updated below.
2828 Node->MatchingCalls.clear();
2829 UsedOrigNode = true;
2830 } else
2831 Info->Node =
2832 createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
2833 assert(!Info->Calls.empty());
2834 // The first call becomes the primary call for this caller node, and the
2835 // rest go in the matching calls list.
2836 Info->Node->setCall(Info->Calls.front());
2837 llvm::append_range(Info->Node->MatchingCalls,
2838 llvm::drop_begin(Info->Calls));
2839 // Save the primary call to node correspondence so that we can update
2840 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2841 // caller of this function.
2842 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2843 }
2844 CallerNodeToUse = Info->Node;
2845 }
2846
2847 // Don't need to move edge if we are using the original node;
2848 if (CallerNodeToUse == Node)
2849 continue;
2850
2851 moveCalleeEdgeToNewCaller(Edge, NewCaller: CallerNodeToUse);
2852 }
2853 // Now that we are done moving edges, clean up any caller edges that ended
2854 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2855 // caller edges from Node are replicated onto the new callers, and it
2856 // simplifies the handling to leave them until we have moved all
2857 // edges/context ids.
2858 for (auto &I : CalleeNodeToCallInfo)
2859 removeNoneTypeCallerEdges(Node: I.second->Node);
2860 if (UnmatchedCalleesNode)
2861 removeNoneTypeCallerEdges(Node: UnmatchedCalleesNode);
2862 removeNoneTypeCallerEdges(Node);
2863
2864 return true;
2865}
2866
// Map an id-or-index value from a callsite context to an actual stack id.
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Module (IR) case this is already the Id.
  return IdOrIndex;
}
2871
2872uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2873 // In the Index case this is an index into the stack id list in the summary
2874 // index, convert it to an Id.
2875 return Index.getStackIdAtIndex(Index: IdOrIndex);
2876}
2877
// Checks that the profiled callee of the edge at EI matches the actual callee
// of Call in the IR/summary. If they instead match only through a chain of
// tail calls, new nodes are synthesized for the chain (memoized in
// TailCallToContextNodeMap) and spliced in between the edge's caller and
// callee, and the original edge is removed. Returns false if no unique match
// was found. On return EI is positioned so the caller's loop increment is
// correct.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  // Copy the shared_ptr so the edge object stays alive (and its fields
  // readable) even after the edge is removed from the graph below.
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, Func: ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain))
    return false;

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty())
    return true;

  // Helper that connects Caller to Callee with an edge carrying Edge's
  // context ids and alloc types, reusing an existing edge if one exists.
  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert_range(Edge->ContextIds);
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller, insert
      // the new edge before the current iterator position, and then increment
      // back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      // Merge in this edge's alloc types, since the node may be shared by
      // multiple mismatched callsites.
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: NewCall);
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

#ifndef NDEBUG
  // Save this because Edge's fields get cleared below when removed.
  auto *Caller = Edge->Caller;
#endif

  // Remove old edge
  removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);

  // To simplify the increment of EI in the caller, subtract one from EI.
  // In the final AddEdge call we would have either added a new callee edge,
  // to Edge->Caller, or found an existing one. Either way we are guaranteed
  // that there is at least one callee edge.
  assert(!Caller->CalleeEdges.empty());
  --EI;

  return true;
}
2965
// Recursively searches (depth-first, bounded by TailCallSearchDepth) for a
// chain of tail calls from CurCallee that reaches ProfiledCallee. On success
// the discovered callsites are appended to FoundCalleeChain. If more than one
// distinct chain reaches the profiled callee, FoundMultipleCalleeChains is
// set and false is returned, since cloning along a non-unique chain could be
// incorrect.
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  // Records one link of the discovered chain.
  auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
    FoundCalleeChain.push_back(x: {Callsite, F});
  };

  // CurCallee is expected to be either a Function or an alias whose aliasee
  // is a Function (asserted below).
  auto *CalleeFunc = dyn_cast<Function>(Val: CurCallee);
  if (!CalleeFunc) {
    auto *Alias = dyn_cast<GlobalAlias>(Val: CurCallee);
    assert(Alias);
    CalleeFunc = dyn_cast<Function>(Val: Alias->getAliasee());
    assert(CalleeFunc);
  }

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &BB : *CalleeFunc) {
    for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(Val: &I);
      if (!CB || !CB->isTailCall())
        continue;
      auto *CalledValue = CB->getCalledOperand();
      auto *CalledFunction = CB->getCalledFunction();
      if (CalledValue && !CalledFunction) {
        CalledValue = CalledValue->stripPointerCasts();
        // Stripping pointer casts can reveal a called function.
        CalledFunction = dyn_cast<Function>(Val: CalledValue);
      }
      // Check if this is an alias to a function. If so, get the
      // called aliasee for the checks below.
      if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
        assert(!CalledFunction &&
               "Expected null called function in callsite for alias");
        CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
      }
      if (!CalledFunction)
        continue;
      if (CalledFunction == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain.
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        // Update statistics about successful matches through tail calls.
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CalledFunction, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
3043
3044const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
3045 auto *CB = dyn_cast<CallBase>(Val: Call);
3046 if (!CB->getCalledOperand() || CB->isIndirectCall())
3047 return nullptr;
3048 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3049 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
3050 if (Alias)
3051 return dyn_cast<Function>(Val: Alias->getAliasee());
3052 return dyn_cast<Function>(Val: CalleeVal);
3053}
3054
3055bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3056 Instruction *Call, const Function *Func, const Function *CallerFunc,
3057 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3058 auto *CB = dyn_cast<CallBase>(Val: Call);
3059 if (!CB->getCalledOperand() || CB->isIndirectCall())
3060 return false;
3061 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3062 auto *CalleeFunc = dyn_cast<Function>(Val: CalleeVal);
3063 if (CalleeFunc == Func)
3064 return true;
3065 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
3066 if (Alias && Alias->getAliasee() == Func)
3067 return true;
3068
3069 // Recursively search for the profiled callee through tail calls starting with
3070 // the actual Callee. The discovered tail call chain is saved in
3071 // FoundCalleeChain, and we will fixup the graph to include these callsites
3072 // after returning.
3073 // FIXME: We will currently redo the same recursive walk if we find the same
3074 // mismatched callee from another callsite. We can improve this with more
3075 // bookkeeping of the created chain of new nodes for each mismatch.
3076 unsigned Depth = 1;
3077 bool FoundMultipleCalleeChains = false;
3078 if (!findProfiledCalleeThroughTailCalls(ProfiledCallee: Func, CurCallee: CalleeVal, Depth,
3079 FoundCalleeChain,
3080 FoundMultipleCalleeChains)) {
3081 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3082 << Func->getName() << " from " << CallerFunc->getName()
3083 << " that actually called " << CalleeVal->getName()
3084 << (FoundMultipleCalleeChains
3085 ? " (found multiple possible chains)"
3086 : "")
3087 << "\n");
3088 if (FoundMultipleCalleeChains)
3089 FoundProfiledCalleeNonUniquelyCount++;
3090 return false;
3091 }
3092
3093 return true;
3094}
3095
3096bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3097 Instruction *Call2) {
3098 auto *CB1 = cast<CallBase>(Val: Call1);
3099 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3100 return false;
3101 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3102 auto *CalleeFunc1 = dyn_cast<Function>(Val: CalleeVal1);
3103 auto *CB2 = cast<CallBase>(Val: Call2);
3104 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3105 return false;
3106 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3107 auto *CalleeFunc2 = dyn_cast<Function>(Val: CalleeVal2);
3108 return CalleeFunc1 == CalleeFunc2;
3109}
3110
// Summary-index analog of the Module version above: recursively searches
// (bounded by TailCallSearchDepth) for a chain of tail calls from CurCallee's
// function summary that reaches ProfiledCallee. Synthesized CallsiteInfo
// records for the discovered callsites are appended to FoundCalleeChain.
// Fails (setting FoundMultipleCalleeChains) if more than one distinct chain
// is found, since cloning along a non-unique chain could be incorrect.
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
    // Make a CallsiteInfo for each discovered callee, if one hasn't already
    // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(x: FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(x: Callee))
      // StackIds is empty (we don't have debug info available in the index for
      // these callsites)
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(args&: Callee, args: SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back(x: {NewCallsiteInfo, FS});
  };

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &S : CurCallee.getSummaryList()) {
    // Only consider local-linkage or prevailing copies of the callee.
    if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
        !isPrevailing(CurCallee.getGUID(), S.get()))
      continue;
    auto *FS = dyn_cast<FunctionSummary>(Val: S->getBaseObject());
    if (!FS)
      continue;
    // If this summary is an alias, record the aliasee's ValueInfo in
    // FSToVIMap below instead of the alias itself.
    auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(Val: S.get());
    if (AS)
      FSVI = AS->getAliaseeVI();
    for (auto &CallEdge : FS->calls()) {
      if (!CallEdge.second.hasTailCall())
        continue;
      if (CallEdge.first == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain.
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        // Update statistics about successful matches through tail calls.
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CallEdge.first, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
3189
3190const FunctionSummary *
3191IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3192 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
3193 if (Callee.getSummaryList().empty())
3194 return nullptr;
3195 return dyn_cast<FunctionSummary>(Val: Callee.getSummaryList()[0]->getBaseObject());
3196}
3197
// Returns true if the summary callee of Call matches the profiled callee
// Func, either directly, via an alias summary, or through a unique chain of
// tail calls discovered by findProfiledCalleeThroughTailCalls (recorded in
// FoundCalleeChain for later graph fixup).
bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
  // If there is no summary list then this is a call to an externally defined
  // symbol.
  AliasSummary *Alias =
      Callee.getSummaryList().empty()
          ? nullptr
          : dyn_cast<AliasSummary>(Val: Callee.getSummaryList()[0].get());
  // Func must already have been mapped to its ValueInfo.
  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];
  if (Callee == FuncVI ||
      // If callee is an alias, check the aliasee, since only function
      // summary base objects will contain the stack node summaries and thus
      // get a context node.
      (Alias && Alias->getAliaseeVI() == FuncVI))
    return true;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(
          ProfiledCallee: FuncVI, CurCallee: Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                      << " from " << FSToVIMap[CallerFunc]
                      << " that actually called " << Callee
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}
3243
3244bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3245 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call1)->Callee;
3246 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call2)->Callee;
3247 return Callee1 == Callee2;
3248}
3249
3250template <typename DerivedCCG, typename FuncTy, typename CallTy>
3251void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3252 const {
3253 print(OS&: dbgs());
3254 dbgs() << "\n";
3255}
3256
// Prints a multi-line description of this node: its call (and any matching
// calls), node id, alloc types, sorted context ids, callee/caller edges, and
// clone relationships.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
    raw_ostream &OS) const {
  OS << "Node " << this << "\n";
  OS << "\t";
  printCall(OS);
  if (Recursive)
    OS << " (recursive)";
  OS << "\n";
  if (!MatchingCalls.empty()) {
    OS << "\tMatchingCalls:\n";
    for (auto &MatchingCall : MatchingCalls) {
      OS << "\t";
      MatchingCall.print(OS);
      OS << "\n";
    }
  }
  OS << "\tNodeId: " << NodeId << "\n";
  OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
  OS << "\tContextIds:";
  // Make a copy of the computed context ids that we can sort for stability.
  auto ContextIds = getContextIds();
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(first: SortedIds.begin(), last: SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
  OS << "\n";
  OS << "\tCalleeEdges:\n";
  for (auto &Edge : CalleeEdges)
    OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
       << ")\n";
  OS << "\tCallerEdges:\n";
  for (auto &Edge : CallerEdges)
    OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
       << ")\n";
  if (!Clones.empty()) {
    // This is an original node that has been cloned.
    OS << "\tClones: ";
    ListSeparator LS;
    for (auto *C : Clones)
      OS << LS << C << " NodeId: " << C->NodeId;
    OS << "\n";
  } else if (CloneOf) {
    // This node is itself a clone of another node.
    OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
  }
}
3302
3303template <typename DerivedCCG, typename FuncTy, typename CallTy>
3304void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3305 const {
3306 print(OS&: dbgs());
3307 dbgs() << "\n";
3308}
3309
// Prints a one-line description of this edge: its endpoints, whether it is a
// backedge, its alloc types, and its (sorted) context ids. No trailing
// newline is emitted.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
    raw_ostream &OS) const {
  OS << "Edge from Callee " << Callee << " to Caller: " << Caller
     << (IsBackedge ? " (BE)" : "")
     << " AllocTypes: " << getAllocTypeString(AllocTypes);
  OS << " ContextIds:";
  // Sort a copy of the context ids so the output is stable.
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(first: SortedIds.begin(), last: SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
}
3322
3323template <typename DerivedCCG, typename FuncTy, typename CallTy>
3324void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3325 print(OS&: dbgs());
3326}
3327
3328template <typename DerivedCCG, typename FuncTy, typename CallTy>
3329void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3330 raw_ostream &OS) const {
3331 OS << "Callsite Context Graph:\n";
3332 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3333 for (const auto Node : nodes<GraphType>(this)) {
3334 if (Node->isRemoved())
3335 continue;
3336 Node->print(OS);
3337 OS << "\n";
3338 }
3339}
3340
3341template <typename DerivedCCG, typename FuncTy, typename CallTy>
3342void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3343 raw_ostream &OS,
3344 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) const {
3345 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3346 for (const auto Node : nodes<GraphType>(this)) {
3347 if (Node->isRemoved())
3348 continue;
3349 if (!Node->IsAllocation)
3350 continue;
3351 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3352 auto AllocTypeFromCall = getAllocationCallType(Call: Node->Call);
3353 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3354 std::sort(first: SortedIds.begin(), last: SortedIds.end());
3355 for (auto Id : SortedIds) {
3356 auto TypeI = ContextIdToAllocationType.find(Val: Id);
3357 assert(TypeI != ContextIdToAllocationType.end());
3358 auto CSI = ContextIdToContextSizeInfos.find(Val: Id);
3359 if (CSI != ContextIdToContextSizeInfos.end()) {
3360 for (auto &Info : CSI->second) {
3361 std::string Msg =
3362 "MemProf hinting: " + getAllocTypeString(AllocTypes: (uint8_t)TypeI->second) +
3363 " full allocation context " + std::to_string(val: Info.FullStackId) +
3364 " with total size " + std::to_string(val: Info.TotalSize) + " is " +
3365 getAllocTypeString(Node->AllocTypes) + " after cloning";
3366 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3367 Msg += " marked " + getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall) +
3368 " due to cold byte percent";
3369 // Print the internal context id to aid debugging and visualization.
3370 Msg += " (internal context id " + std::to_string(val: Id) + ")";
3371 if (MemProfReportHintedSizes)
3372 OS << Msg << "\n";
3373 if (EmitRemark)
3374 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3375 }
3376 } else {
3377 // This is only emitted if the context size info is not present.
3378 std::string Msg =
3379 "MemProf hinting: " + getAllocTypeString(AllocTypes: (uint8_t)TypeI->second) +
3380 " is " + getAllocTypeString(Node->AllocTypes) + " after cloning";
3381 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3382 Msg += " marked " + getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall) +
3383 " due to cold byte percent";
3384 // Print the internal context id to aid debugging and visualization.
3385 Msg += " (internal context id " + std::to_string(val: Id) + ")";
3386 if (MemProfReportHintedSizes)
3387 OS << Msg << "\n";
3388 if (EmitRemark)
3389 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3390 }
3391 }
3392 }
3393}
3394
3395template <typename DerivedCCG, typename FuncTy, typename CallTy>
3396void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3397 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3398 for (const auto Node : nodes<GraphType>(this)) {
3399 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3400 for (auto &Edge : Node->CallerEdges)
3401 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3402 }
3403}
3404
// GraphTraits specialization that lets generic graph utilities (node
// iteration, dot export via WriteGraph) operate on a CallsiteContextGraph.
// The node list is the graph's NodeOwner vector, and a node's children are
// the callee endpoints of its callee edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;

  // Unwraps the owning unique_ptr held in NodeOwner to the raw node pointer
  // used as NodeRef.
  using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
  static NodeRef getNode(const NodePtrTy &P) { return P.get(); }

  using nodes_iterator =
      mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
                      decltype(&getNode)>;

  static nodes_iterator nodes_begin(GraphType G) {
    return nodes_iterator(G->NodeOwner.begin(), &getNode);
  }

  static nodes_iterator nodes_end(GraphType G) {
    return nodes_iterator(G->NodeOwner.end(), &getNode);
  }

  static NodeRef getEntryNode(GraphType G) {
    return G->NodeOwner.begin()->get();
  }

  // Maps a (shared) callee edge to its callee node for child iteration.
  using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
  static const ContextNode<DerivedCCG, FuncTy, CallTy> *
  GetCallee(const EdgePtrTy &P) {
    return P->Callee;
  }

  using ChildIteratorType =
      mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
                          DerivedCCG, FuncTy, CallTy>>>::const_iterator,
                      decltype(&GetCallee)>;

  static ChildIteratorType child_begin(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
  }

  static ChildIteratorType child_end(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
  }
};
3448
// DOTGraphTraits specialization controlling how a CallsiteContextGraph is
// rendered as a dot graph: node labels/colors/tooltips, edge styling, hiding
// of removed or out-of-scope nodes, and optional highlighting of contexts
// selected via the ContextIdForDot / AllocIdForDot options.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
    : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
    // If the user requested the full graph to be exported, but provided an
    // allocation id, or if the user gave a context id and requested more than
    // just a specific context to be exported, note that highlighting is
    // enabled.
    DoHighlight =
        (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
        (ContextIdForDot.getNumOccurrences() &&
         DotGraphScope != DotScope::Context);
  }

  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using GTraits = GraphTraits<GraphType>;
  using NodeRef = typename GTraits::NodeRef;
  using ChildIteratorType = typename GTraits::ChildIteratorType;

  // Node label: the original stack/alloc id and node id, followed by the
  // call(s) the node represents (or a null-call marker).
  static std::string getNodeLabel(NodeRef Node, GraphType G) {
    std::string LabelString =
        (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
         Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
            .str();
    LabelString += "\n";
    if (Node->hasCall()) {
      auto Func = G->NodeToCallingFunc.find(Node);
      assert(Func != G->NodeToCallingFunc.end());
      LabelString +=
          G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
      for (auto &MatchingCall : Node->MatchingCalls) {
        LabelString += "\n";
        LabelString += G->getLabel(Func->second, MatchingCall.call(),
                                   MatchingCall.cloneNo());
      }
    } else {
      LabelString += "null call";
      if (Node->Recursive)
        LabelString += " (recursive)";
      else
        LabelString += " (external)";
    }
    return LabelString;
  }

  static std::string getNodeAttributes(NodeRef Node, GraphType G) {
    auto ContextIds = Node->getContextIds();
    // If highlighting enabled, see if this node contains any of the context ids
    // of interest. If so, it will use a different color and a larger fontsize
    // (which makes the node larger as well).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
    }
    std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
                                   getContextIds(ContextIds) + "\"")
                                      .str();
    // Default fontsize is 14
    if (Highlight)
      AttributeString += ",fontsize=\"30\"";
    AttributeString +=
        (Twine(",fillcolor=\"") + getColor(AllocTypes: Node->AllocTypes, Highlight) + "\"")
            .str();
    // Clones are drawn with a blue dashed-bold border.
    if (Node->CloneOf) {
      AttributeString += ",color=\"blue\"";
      AttributeString += ",style=\"filled,bold,dashed\"";
    } else
      AttributeString += ",style=\"filled\"";
    return AttributeString;
  }

  static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
                                       GraphType G) {
    auto &Edge = *(ChildIter.getCurrent());
    // If highlighting enabled, see if this edge contains any of the context ids
    // of interest. If so, it will use a different color and a heavier arrow
    // size and weight (the larger weight makes the highlighted path
    // straighter).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = Edge->ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
    }
    auto Color = getColor(AllocTypes: Edge->AllocTypes, Highlight);
    std::string AttributeString =
        (Twine("tooltip=\"") + getContextIds(ContextIds: Edge->ContextIds) + "\"" +
         // fillcolor is the arrow head and color is the line
         Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
         "\"")
            .str();
    if (Edge->IsBackedge)
      AttributeString += ",style=\"dotted\"";
    // Default penwidth and weight are both 1.
    if (Highlight)
      AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
    return AttributeString;
  }

  // Since the NodeOwners list includes nodes that are no longer connected to
  // the graph, skip them here.
  static bool isNodeHidden(NodeRef Node, GraphType G) {
    if (Node->isRemoved())
      return true;
    // If a scope smaller than the full graph was requested, see if this node
    // contains any of the context ids of interest.
    if (DotGraphScope == DotScope::Alloc)
      return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
    if (DotGraphScope == DotScope::Context)
      return !Node->getContextIds().contains(ContextIdForDot);
    return false;
  }

private:
  // Renders a context id set for tooltips: sorted ids when small, otherwise
  // just a count.
  static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
    std::string IdString = "ContextIds:";
    if (ContextIds.size() < 100) {
      std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
      std::sort(first: SortedIds.begin(), last: SortedIds.end());
      for (auto Id : SortedIds)
        IdString += (" " + Twine(Id)).str();
    } else {
      IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
    }
    return IdString;
  }

  static std::string getColor(uint8_t AllocTypes, bool Highlight) {
    // If DoHighlight is not enabled, we want to use the highlight colors for
    // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
    // both compatible with the color scheme before highlighting was supported,
    // and for the NotCold+Cold color the non-highlight color is a bit more
    // readable.
    if (AllocTypes == (uint8_t)AllocationType::NotCold)
      // Color "brown1" actually looks like a lighter red.
      return !DoHighlight || Highlight ? "brown1" : "lightpink";
    if (AllocTypes == (uint8_t)AllocationType::Cold)
      return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
    if (AllocTypes ==
        ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
      return Highlight ? "magenta" : "mediumorchid1";
    return "gray";
  }

  // Unique dot node id derived from the node's address.
  static std::string getNodeId(NodeRef Node) {
    std::stringstream SStream;
    SStream << std::hex << "N0x" << (unsigned long long)Node;
    std::string Result = SStream.str();
    return Result;
  }

  // True if we should highlight a specific context or allocation's contexts in
  // the emitted graph.
  static bool DoHighlight;
};
3612
// Out-of-line definition of the static DoHighlight flag; defaults to false
// and is set by the DOTGraphTraits constructor based on the dot options.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool DOTGraphTraits<
    const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
    false;
3617
3618template <typename DerivedCCG, typename FuncTy, typename CallTy>
3619void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3620 std::string Label) const {
3621 WriteGraph(this, "", false, Label,
3622 DotFilePathPrefix + "ccg." + Label + ".dot");
3623}
3624
// Creates a new clone of Edge's callee node (inheriting its call, containing
// function, and matching calls) and moves Edge (or just the given subset of
// its context ids) onto the clone. Returns the new clone.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
    const std::shared_ptr<ContextEdge> &Edge,
    DenseSet<uint32_t> ContextIdsToMove) {
  ContextNode *Node = Edge->Callee;
  assert(NodeToCallingFunc.count(Node));
  // The clone shares the original node's call and containing function.
  ContextNode *Clone =
      createNewNode(IsAllocation: Node->IsAllocation, F: NodeToCallingFunc[Node], C: Node->Call);
  Node->addClone(Clone);
  Clone->MatchingCalls = Node->MatchingCalls;
  // Reuse the existing-clone path to do the actual edge/context id movement.
  moveEdgeToExistingCalleeClone(Edge, NewCallee: Clone, /*NewClone=*/true,
                                ContextIdsToMove);
  return Clone;
}
3640
3641template <typename DerivedCCG, typename FuncTy, typename CallTy>
3642void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3643 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3644 ContextNode *NewCallee, bool NewClone,
3645 DenseSet<uint32_t> ContextIdsToMove) {
3646 // NewCallee and Edge's current callee must be clones of the same original
3647 // node (Edge's current callee may be the original node too).
3648 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3649
3650 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3651
3652 ContextNode *OldCallee = Edge->Callee;
3653
3654 // We might already have an edge to the new callee from earlier cloning for a
3655 // different allocation. If one exists we will reuse it.
3656 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3657
3658 // Callers will pass an empty ContextIdsToMove set when they want to move the
3659 // edge. Copy in Edge's ids for simplicity.
3660 if (ContextIdsToMove.empty())
3661 ContextIdsToMove = Edge->getContextIds();
3662
3663 // If we are moving all of Edge's ids, then just move the whole Edge.
3664 // Otherwise only move the specified subset, to a new edge if needed.
3665 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3666 // First, update the alloc types on New Callee from Edge.
3667 // Do this before we potentially clear Edge's fields below!
3668 NewCallee->AllocTypes |= Edge->AllocTypes;
3669 // Moving the whole Edge.
3670 if (ExistingEdgeToNewCallee) {
3671 // Since we already have an edge to NewCallee, simply move the ids
3672 // onto it, and remove the existing Edge.
3673 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3674 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3675 assert(Edge->ContextIds == ContextIdsToMove);
3676 removeEdgeFromGraph(Edge: Edge.get());
3677 } else {
3678 // Otherwise just reconnect Edge to NewCallee.
3679 Edge->Callee = NewCallee;
3680 NewCallee->CallerEdges.push_back(Edge);
3681 // Remove it from callee where it was previously connected.
3682 OldCallee->eraseCallerEdge(Edge.get());
3683 // Don't need to update Edge's context ids since we are simply
3684 // reconnecting it.
3685 }
3686 } else {
3687 // Only moving a subset of Edge's ids.
3688 // Compute the alloc type of the subset of ids being moved.
3689 auto CallerEdgeAllocType = computeAllocType(ContextIds&: ContextIdsToMove);
3690 if (ExistingEdgeToNewCallee) {
3691 // Since we already have an edge to NewCallee, simply move the ids
3692 // onto it.
3693 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3694 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3695 } else {
3696 // Otherwise, create a new edge to NewCallee for the ids being moved.
3697 auto NewEdge = std::make_shared<ContextEdge>(
3698 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3699 Edge->Caller->CalleeEdges.push_back(NewEdge);
3700 NewCallee->CallerEdges.push_back(NewEdge);
3701 }
3702 // In either case, need to update the alloc types on NewCallee, and remove
3703 // those ids and update the alloc type on the original Edge.
3704 NewCallee->AllocTypes |= CallerEdgeAllocType;
3705 set_subtract(Edge->ContextIds, ContextIdsToMove);
3706 Edge->AllocTypes = computeAllocType(ContextIds&: Edge->ContextIds);
3707 }
3708 // Now walk the old callee node's callee edges and move Edge's context ids
3709 // over to the corresponding edge into the clone (which is created here if
3710 // this is a newly created clone).
3711 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3712 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3713 // If this is a direct recursion edge, use NewCallee (the clone) as the
3714 // callee as well, so that any edge updated/created here is also direct
3715 // recursive.
3716 if (CalleeToUse == OldCallee) {
3717 // If this is a recursive edge, see if we already moved a recursive edge
3718 // (which would have to have been this one) - if we were only moving a
3719 // subset of context ids it would still be on OldCallee.
3720 if (EdgeIsRecursive) {
3721 assert(OldCalleeEdge == Edge);
3722 continue;
3723 }
3724 CalleeToUse = NewCallee;
3725 }
3726 // The context ids moving to the new callee are the subset of this edge's
3727 // context ids and the context ids on the caller edge being moved.
3728 DenseSet<uint32_t> EdgeContextIdsToMove =
3729 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3730 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3731 OldCalleeEdge->AllocTypes =
3732 computeAllocType(ContextIds&: OldCalleeEdge->getContextIds());
3733 if (!NewClone) {
3734 // Update context ids / alloc type on corresponding edge to NewCallee.
3735 // There is a chance this may not exist if we are reusing an existing
3736 // clone, specifically during function assignment, where we would have
3737 // removed none type edges after creating the clone. If we can't find
3738 // a corresponding edge there, fall through to the cloning below.
3739 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3740 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3741 NewCalleeEdge->AllocTypes |= computeAllocType(ContextIds&: EdgeContextIdsToMove);
3742 continue;
3743 }
3744 }
3745 auto NewEdge = std::make_shared<ContextEdge>(
3746 CalleeToUse, NewCallee, computeAllocType(ContextIds&: EdgeContextIdsToMove),
3747 EdgeContextIdsToMove);
3748 NewCallee->CalleeEdges.push_back(NewEdge);
3749 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3750 }
3751 // Recompute the node alloc type now that its callee edges have been
3752 // updated (since we will compute from those edges).
3753 OldCallee->AllocTypes = OldCallee->computeAllocType();
3754 // OldCallee alloc type should be None iff its context id set is now empty.
3755 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3756 OldCallee->emptyContextIds());
3757 if (VerifyCCG) {
3758 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3759 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3760 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3761 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3762 /*CheckEdges=*/false);
3763 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3764 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3765 /*CheckEdges=*/false);
3766 }
3767}
3768
3769template <typename DerivedCCG, typename FuncTy, typename CallTy>
3770void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3771 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3772 ContextNode *NewCaller) {
3773 auto *OldCallee = Edge->Callee;
3774 auto *NewCallee = OldCallee;
3775 // If this edge was direct recursive, make any new/updated edge also direct
3776 // recursive to NewCaller.
3777 bool Recursive = Edge->Caller == Edge->Callee;
3778 if (Recursive)
3779 NewCallee = NewCaller;
3780
3781 ContextNode *OldCaller = Edge->Caller;
3782 OldCaller->eraseCalleeEdge(Edge.get());
3783
3784 // We might already have an edge to the new caller. If one exists we will
3785 // reuse it.
3786 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3787
3788 if (ExistingEdgeToNewCaller) {
3789 // Since we already have an edge to NewCaller, simply move the ids
3790 // onto it, and remove the existing Edge.
3791 ExistingEdgeToNewCaller->getContextIds().insert_range(
3792 Edge->getContextIds());
3793 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3794 Edge->ContextIds.clear();
3795 Edge->AllocTypes = (uint8_t)AllocationType::None;
3796 OldCallee->eraseCallerEdge(Edge.get());
3797 } else {
3798 // Otherwise just reconnect Edge to NewCaller.
3799 Edge->Caller = NewCaller;
3800 NewCaller->CalleeEdges.push_back(Edge);
3801 if (Recursive) {
3802 assert(NewCallee == NewCaller);
3803 // In the case of (direct) recursive edges, we update the callee as well
3804 // so that it becomes recursive on the new caller.
3805 Edge->Callee = NewCallee;
3806 NewCallee->CallerEdges.push_back(Edge);
3807 OldCallee->eraseCallerEdge(Edge.get());
3808 }
3809 // Don't need to update Edge's context ids since we are simply
3810 // reconnecting it.
3811 }
3812 // In either case, need to update the alloc types on New Caller.
3813 NewCaller->AllocTypes |= Edge->AllocTypes;
3814
3815 // Now walk the old caller node's caller edges and move Edge's context ids
3816 // over to the corresponding edge into the node (which is created here if
3817 // this is a newly created node). We can tell whether this is a newly created
3818 // node by seeing if it has any caller edges yet.
3819#ifndef NDEBUG
3820 bool IsNewNode = NewCaller->CallerEdges.empty();
3821#endif
3822 // If we just moved a direct recursive edge, presumably its context ids should
3823 // also flow out of OldCaller via some other non-recursive callee edge. We
3824 // don't want to remove the recursive context ids from other caller edges yet,
3825 // otherwise the context ids get into an inconsistent state on OldCaller.
3826 // We will update these context ids on the non-recursive caller edge when and
3827 // if they are updated on the non-recursive callee.
3828 if (!Recursive) {
3829 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3830 auto OldCallerCaller = OldCallerEdge->Caller;
3831 // The context ids moving to the new caller are the subset of this edge's
3832 // context ids and the context ids on the callee edge being moved.
3833 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3834 OldCallerEdge->getContextIds(), Edge->getContextIds());
3835 if (OldCaller == OldCallerCaller) {
3836 OldCallerCaller = NewCaller;
3837 // Don't actually move this one. The caller will move it directly via a
3838 // call to this function with this as the Edge if it is appropriate to
3839 // move to a diff node that has a matching callee (itself).
3840 continue;
3841 }
3842 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3843 OldCallerEdge->AllocTypes =
3844 computeAllocType(ContextIds&: OldCallerEdge->getContextIds());
3845 // In this function we expect that any pre-existing node already has edges
3846 // from the same callers as the old node. That should be true in the
3847 // current use case, where we will remove None-type edges after copying
3848 // over all caller edges from the callee.
3849 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3850 // Since we would have skipped caller edges when moving a direct recursive
3851 // edge, this may not hold true when recursive handling enabled.
3852 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3853 if (ExistingCallerEdge) {
3854 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3855 ExistingCallerEdge->AllocTypes |=
3856 computeAllocType(ContextIds&: EdgeContextIdsToMove);
3857 continue;
3858 }
3859 auto NewEdge = std::make_shared<ContextEdge>(
3860 NewCaller, OldCallerCaller, computeAllocType(ContextIds&: EdgeContextIdsToMove),
3861 EdgeContextIdsToMove);
3862 NewCaller->CallerEdges.push_back(NewEdge);
3863 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3864 }
3865 }
3866 // Recompute the node alloc type now that its caller edges have been
3867 // updated (since we will compute from those edges).
3868 OldCaller->AllocTypes = OldCaller->computeAllocType();
3869 // OldCaller alloc type should be None iff its context id set is now empty.
3870 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3871 OldCaller->emptyContextIds());
3872 if (VerifyCCG) {
3873 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3874 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3875 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3876 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3877 /*CheckEdges=*/false);
3878 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3879 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3880 /*CheckEdges=*/false);
3881 }
3882}
3883
3884template <typename DerivedCCG, typename FuncTy, typename CallTy>
3885void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3886 recursivelyRemoveNoneTypeCalleeEdges(
3887 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3888 auto Inserted = Visited.insert(Node);
3889 if (!Inserted.second)
3890 return;
3891
3892 removeNoneTypeCalleeEdges(Node);
3893
3894 for (auto *Clone : Node->Clones)
3895 recursivelyRemoveNoneTypeCalleeEdges(Node: Clone, Visited);
3896
3897 // The recursive call may remove some of this Node's caller edges.
3898 // Iterate over a copy and skip any that were removed.
3899 auto CallerEdges = Node->CallerEdges;
3900 for (auto &Edge : CallerEdges) {
3901 // Skip any that have been removed by an earlier recursive call.
3902 if (Edge->isRemoved()) {
3903 assert(!is_contained(Node->CallerEdges, Edge));
3904 continue;
3905 }
3906 recursivelyRemoveNoneTypeCalleeEdges(Node: Edge->Caller, Visited);
3907 }
3908}
3909
3910// This is the standard DFS based backedge discovery algorithm.
3911template <typename DerivedCCG, typename FuncTy, typename CallTy>
3912void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3913 // If we are cloning recursive contexts, find and mark backedges from all root
3914 // callers, using the typical DFS based backedge analysis.
3915 if (!CloneRecursiveContexts)
3916 return;
3917 DenseSet<const ContextNode *> Visited;
3918 DenseSet<const ContextNode *> CurrentStack;
3919 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3920 auto *Node = Entry.second;
3921 if (Node->isRemoved())
3922 continue;
3923 // It is a root if it doesn't have callers.
3924 if (!Node->CallerEdges.empty())
3925 continue;
3926 markBackedges(Node, Visited, CurrentStack);
3927 assert(CurrentStack.empty());
3928 }
3929}
3930
3931// Recursive helper for above markBackedges method.
3932template <typename DerivedCCG, typename FuncTy, typename CallTy>
3933void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3934 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3935 DenseSet<const ContextNode *> &CurrentStack) {
3936 auto I = Visited.insert(Node);
3937 // We should only call this for unvisited nodes.
3938 assert(I.second);
3939 (void)I;
3940 for (auto &CalleeEdge : Node->CalleeEdges) {
3941 auto *Callee = CalleeEdge->Callee;
3942 if (Visited.count(Callee)) {
3943 // Since this was already visited we need to check if it is currently on
3944 // the recursive stack in which case it is a backedge.
3945 if (CurrentStack.count(Callee))
3946 CalleeEdge->IsBackedge = true;
3947 continue;
3948 }
3949 CurrentStack.insert(Callee);
3950 markBackedges(Callee, Visited, CurrentStack);
3951 CurrentStack.erase(Callee);
3952 }
3953}
3954
3955template <typename DerivedCCG, typename FuncTy, typename CallTy>
3956void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3957 DenseSet<const ContextNode *> Visited;
3958 for (auto &Entry : AllocationCallToContextNodeMap) {
3959 Visited.clear();
3960 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3961 }
3962 Visited.clear();
3963 for (auto &Entry : AllocationCallToContextNodeMap)
3964 recursivelyRemoveNoneTypeCalleeEdges(Node: Entry.second, Visited);
3965 if (VerifyCCG)
3966 check();
3967}
3968
3969// helper function to check an AllocType is cold or notcold or both.
3970bool checkColdOrNotCold(uint8_t AllocType) {
3971 return (AllocType == (uint8_t)AllocationType::Cold) ||
3972 (AllocType == (uint8_t)AllocationType::NotCold) ||
3973 (AllocType ==
3974 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3975}
3976
3977template <typename DerivedCCG, typename FuncTy, typename CallTy>
3978void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3979 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3980 const DenseSet<uint32_t> &AllocContextIds) {
3981 if (VerifyNodes)
3982 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3983 assert(!Node->CloneOf);
3984
3985 // If Node as a null call, then either it wasn't found in the module (regular
3986 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3987 // cloning (e.g. recursion, calls multiple targets, etc).
3988 // Do this here so that we don't try to recursively clone callers below, which
3989 // isn't useful at least for this node.
3990 if (!Node->hasCall())
3991 return;
3992
3993 // No need to look at any callers if allocation type already unambiguous.
3994 if (hasSingleAllocType(Node->AllocTypes))
3995 return;
3996
3997#ifndef NDEBUG
3998 auto Insert =
3999#endif
4000 Visited.insert(Node);
4001 // We should not have visited this node yet.
4002 assert(Insert.second);
4003 // The recursive call to identifyClones may delete the current edge from the
4004 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
4005 // in an iterator and having recursive call erase from it. Other edges may
4006 // also get removed during the recursion, which will have null Callee and
4007 // Caller pointers (and are deleted later), so we skip those below.
4008 {
4009 auto CallerEdges = Node->CallerEdges;
4010 for (auto &Edge : CallerEdges) {
4011 // Skip any that have been removed by an earlier recursive call.
4012 if (Edge->isRemoved()) {
4013 assert(!is_contained(Node->CallerEdges, Edge));
4014 continue;
4015 }
4016 // Defer backedges. See comments further below where these edges are
4017 // handled during the cloning of this Node.
4018 if (Edge->IsBackedge) {
4019 // We should only mark these if cloning recursive contexts, where we
4020 // need to do this deferral.
4021 assert(CloneRecursiveContexts);
4022 continue;
4023 }
4024 // Ignore any caller we previously visited via another edge.
4025 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
4026 identifyClones(Edge->Caller, Visited, AllocContextIds);
4027 }
4028 }
4029 }
4030
4031 // Check if we reached an unambiguous call or have have only a single caller.
4032 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4033 return;
4034
4035 // We need to clone.
4036
4037 // Try to keep the original version as alloc type NotCold. This will make
4038 // cases with indirect calls or any other situation with an unknown call to
4039 // the original function get the default behavior. We do this by sorting the
4040 // CallerEdges of the Node we will clone by alloc type.
4041 //
4042 // Give NotCold edge the lowest sort priority so those edges are at the end of
4043 // the caller edges vector, and stay on the original version (since the below
4044 // code clones greedily until it finds all remaining edges have the same type
4045 // and leaves the remaining ones on the original Node).
4046 //
4047 // We shouldn't actually have any None type edges, so the sorting priority for
4048 // that is arbitrary, and we assert in that case below.
4049 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
4050 /*Cold*/ 1,
4051 /*NotColdCold*/ 2};
4052 llvm::stable_sort(Node->CallerEdges,
4053 [&](const std::shared_ptr<ContextEdge> &A,
4054 const std::shared_ptr<ContextEdge> &B) {
4055 // Nodes with non-empty context ids should be sorted
4056 // before those with empty context ids.
4057 if (A->ContextIds.empty())
4058 // Either B ContextIds are non-empty (in which case we
4059 // should return false because B < A), or B ContextIds
4060 // are empty, in which case they are equal, and we
4061 // should maintain the original relative ordering.
4062 return false;
4063 if (B->ContextIds.empty())
4064 return true;
4065
4066 if (A->AllocTypes == B->AllocTypes)
4067 // Use the first context id for each edge as a
4068 // tie-breaker.
4069 return *A->ContextIds.begin() < *B->ContextIds.begin();
4070 return AllocTypeCloningPriority[A->AllocTypes] <
4071 AllocTypeCloningPriority[B->AllocTypes];
4072 });
4073
4074 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4075
4076 DenseSet<uint32_t> RecursiveContextIds;
4077 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
4078 // If we are allowing recursive callsites, but have also disabled recursive
4079 // contexts, look for context ids that show up in multiple caller edges.
4080 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
4081 DenseSet<uint32_t> AllCallerContextIds;
4082 for (auto &CE : Node->CallerEdges) {
4083 // Resize to the largest set of caller context ids, since we know the
4084 // final set will be at least that large.
4085 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
4086 for (auto Id : CE->getContextIds())
4087 if (!AllCallerContextIds.insert(Id).second)
4088 RecursiveContextIds.insert(Id);
4089 }
4090 }
4091
4092 // Iterate until we find no more opportunities for disambiguating the alloc
4093 // types via cloning. In most cases this loop will terminate once the Node
4094 // has a single allocation type, in which case no more cloning is needed.
4095 // Iterate over a copy of Node's caller edges, since we may need to remove
4096 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4097 // makes it less error-prone.
4098 auto CallerEdges = Node->CallerEdges;
4099 for (auto &CallerEdge : CallerEdges) {
4100 // Skip any that have been removed by an earlier recursive call.
4101 if (CallerEdge->isRemoved()) {
4102 assert(!is_contained(Node->CallerEdges, CallerEdge));
4103 continue;
4104 }
4105 assert(CallerEdge->Callee == Node);
4106
4107 // See if cloning the prior caller edge left this node with a single alloc
4108 // type or a single caller. In that case no more cloning of Node is needed.
4109 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4110 break;
4111
4112 // If the caller was not successfully matched to a call in the IR/summary,
4113 // there is no point in trying to clone for it as we can't update that call.
4114 if (!CallerEdge->Caller->hasCall())
4115 continue;
4116
4117 // Only need to process the ids along this edge pertaining to the given
4118 // allocation.
4119 auto CallerEdgeContextsForAlloc =
4120 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4121 if (!RecursiveContextIds.empty())
4122 CallerEdgeContextsForAlloc =
4123 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4124 if (CallerEdgeContextsForAlloc.empty())
4125 continue;
4126
4127 auto CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
4128
4129 // Compute the node callee edge alloc types corresponding to the context ids
4130 // for this caller edge.
4131 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4132 CalleeEdgeAllocTypesForCallerEdge.reserve(n: Node->CalleeEdges.size());
4133 for (auto &CalleeEdge : Node->CalleeEdges)
4134 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4135 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
4136
4137 // Don't clone if doing so will not disambiguate any alloc types amongst
4138 // caller edges (including the callee edges that would be cloned).
4139 // Otherwise we will simply move all edges to the clone.
4140 //
4141 // First check if by cloning we will disambiguate the caller allocation
4142 // type from node's allocation type. Query allocTypeToUse so that we don't
4143 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4144 // neither of these should be None type.
4145 //
4146 // Then check if by cloning node at least one of the callee edges will be
4147 // disambiguated by splitting out different context ids.
4148 //
4149 // However, always do the cloning if this is a backedge, in which case we
4150 // have not yet cloned along this caller edge.
4151 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4152 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4153 if (!CallerEdge->IsBackedge &&
4154 allocTypeToUse(CallerAllocTypeForAlloc) ==
4155 allocTypeToUse(Node->AllocTypes) &&
4156 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4157 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4158 continue;
4159 }
4160
4161 if (CallerEdge->IsBackedge) {
4162 // We should only mark these if cloning recursive contexts, where we
4163 // need to do this deferral.
4164 assert(CloneRecursiveContexts);
4165 DeferredBackedges++;
4166 }
4167
4168 // If this is a backedge, we now do recursive cloning starting from its
4169 // caller since we may have moved unambiguous caller contexts to a clone
4170 // of this Node in a previous iteration of the current loop, giving more
4171 // opportunity for cloning through the backedge. Because we sorted the
4172 // caller edges earlier so that cold caller edges are first, we would have
4173 // visited and cloned this node for any unamibiguously cold non-recursive
4174 // callers before any ambiguous backedge callers. Note that we don't do this
4175 // if the caller is already cloned or visited during cloning (e.g. via a
4176 // different context path from the allocation).
4177 // TODO: Can we do better in the case where the caller was already visited?
4178 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4179 !Visited.count(CallerEdge->Caller)) {
4180 const auto OrigIdCount = CallerEdge->getContextIds().size();
4181 // Now do the recursive cloning of this backedge's caller, which was
4182 // deferred earlier.
4183 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4184 removeNoneTypeCalleeEdges(Node: CallerEdge->Caller);
4185 // See if the recursive call to identifyClones moved the context ids to a
4186 // new edge from this node to a clone of caller, and switch to looking at
4187 // that new edge so that we clone Node for the new caller clone.
4188 bool UpdatedEdge = false;
4189 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4190 for (auto E : Node->CallerEdges) {
4191 // Only interested in clones of the current edges caller.
4192 if (E->Caller->CloneOf != CallerEdge->Caller)
4193 continue;
4194 // See if this edge contains any of the context ids originally on the
4195 // current caller edge.
4196 auto CallerEdgeContextsForAllocNew =
4197 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4198 if (CallerEdgeContextsForAllocNew.empty())
4199 continue;
4200 // Make sure we don't pick a previously existing caller edge of this
4201 // Node, which would be processed on a different iteration of the
4202 // outer loop over the saved CallerEdges.
4203 if (llvm::is_contained(CallerEdges, E))
4204 continue;
4205 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4206 // are updated further below for all cases where we just invoked
4207 // identifyClones recursively.
4208 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4209 CallerEdge = E;
4210 UpdatedEdge = true;
4211 break;
4212 }
4213 }
4214 // If cloning removed this edge (and we didn't update it to a new edge
4215 // above), we're done with this edge. It's possible we moved all of the
4216 // context ids to an existing clone, in which case there's no need to do
4217 // further processing for them.
4218 if (CallerEdge->isRemoved())
4219 continue;
4220
4221 // Now we need to update the information used for the cloning decisions
4222 // further below, as we may have modified edges and their context ids.
4223
4224 // Note if we changed the CallerEdge above we would have already updated
4225 // the context ids.
4226 if (!UpdatedEdge) {
4227 CallerEdgeContextsForAlloc = set_intersection(
4228 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4229 if (CallerEdgeContextsForAlloc.empty())
4230 continue;
4231 }
4232 // Update the other information that depends on the edges and on the now
4233 // updated CallerEdgeContextsForAlloc.
4234 CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
4235 CalleeEdgeAllocTypesForCallerEdge.clear();
4236 for (auto &CalleeEdge : Node->CalleeEdges) {
4237 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4238 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
4239 }
4240 }
4241
4242 // First see if we can use an existing clone. Check each clone and its
4243 // callee edges for matching alloc types.
4244 ContextNode *Clone = nullptr;
4245 for (auto *CurClone : Node->Clones) {
4246 if (allocTypeToUse(CurClone->AllocTypes) !=
4247 allocTypeToUse(CallerAllocTypeForAlloc))
4248 continue;
4249
4250 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4251 hasSingleAllocType(CallerAllocTypeForAlloc);
4252 // The above check should mean that if both have single alloc types that
4253 // they should be equal.
4254 assert(!BothSingleAlloc ||
4255 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4256
4257 // If either both have a single alloc type (which are the same), or if the
4258 // clone's callee edges have the same alloc types as those for the current
4259 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4260 // then we can reuse this clone.
4261 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4262 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4263 Clone = CurClone;
4264 break;
4265 }
4266 }
4267
4268 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4269 if (Clone)
4270 moveEdgeToExistingCalleeClone(Edge: CallerEdge, NewCallee: Clone, /*NewClone=*/false,
4271 ContextIdsToMove: CallerEdgeContextsForAlloc);
4272 else
4273 Clone = moveEdgeToNewCalleeClone(Edge: CallerEdge, ContextIdsToMove: CallerEdgeContextsForAlloc);
4274
4275 // Sanity check that no alloc types on clone or its edges are None.
4276 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4277 }
4278
4279 // We should still have some context ids on the original Node.
4280 assert(!Node->emptyContextIds());
4281
4282 // Sanity check that no alloc types on node or edges are None.
4283 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4284
4285 if (VerifyNodes)
4286 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4287}
4288
4289void ModuleCallsiteContextGraph::updateAllocationCall(
4290 CallInfo &Call, AllocationType AllocType) {
4291 std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocType);
4292 removeAnyExistingAmbiguousAttribute(CB: cast<CallBase>(Val: Call.call()));
4293 auto A = llvm::Attribute::get(Context&: Call.call()->getFunction()->getContext(),
4294 Kind: "memprof", Val: AllocTypeString);
4295 cast<CallBase>(Val: Call.call())->addFnAttr(Attr: A);
4296 OREGetter(Call.call()->getFunction())
4297 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4298 << ore::NV("AllocationCall", Call.call()) << " in clone "
4299 << ore::NV("Caller", Call.call()->getFunction())
4300 << " marked with memprof allocation attribute "
4301 << ore::NV("Attribute", AllocTypeString));
4302}
4303
4304void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4305 AllocationType AllocType) {
4306 auto *AI = cast<AllocInfo *>(Val: Call.call());
4307 assert(AI);
4308 assert(AI->Versions.size() > Call.cloneNo());
4309 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4310}
4311
4312AllocationType
4313ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4314 const auto *CB = cast<CallBase>(Val: Call.call());
4315 if (!CB->getAttributes().hasFnAttr(Kind: "memprof"))
4316 return AllocationType::None;
4317 return CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
4318 ? AllocationType::Cold
4319 : AllocationType::NotCold;
4320}
4321
4322AllocationType
4323IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4324 const auto *AI = cast<AllocInfo *>(Val: Call.call());
4325 assert(AI->Versions.size() > Call.cloneNo());
4326 return (AllocationType)AI->Versions[Call.cloneNo()];
4327}
4328
4329void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4330 FuncInfo CalleeFunc) {
4331 auto *CurF = getCalleeFunc(Call: CallerCall.call());
4332 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4333 if (isMemProfClone(F: *CurF)) {
4334 // If we already assigned this callsite to call a specific non-default
4335 // clone (i.e. not the original function which is clone 0), ensure that we
4336 // aren't trying to now update it to call a different clone, which is
4337 // indicative of a bug in the graph or function assignment.
4338 auto CurCalleeCloneNo = getMemProfCloneNum(F: *CurF);
4339 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4340 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4341 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4342 << "\n");
4343 MismatchedCloneAssignments++;
4344 }
4345 }
4346 if (NewCalleeCloneNo > 0)
4347 cast<CallBase>(Val: CallerCall.call())->setCalledFunction(CalleeFunc.func());
4348 OREGetter(CallerCall.call()->getFunction())
4349 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4350 << ore::NV("Call", CallerCall.call()) << " in clone "
4351 << ore::NV("Caller", CallerCall.call()->getFunction())
4352 << " assigned to call function clone "
4353 << ore::NV("Callee", CalleeFunc.func()));
4354}
4355
4356void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4357 FuncInfo CalleeFunc) {
4358 auto *CI = cast<CallsiteInfo *>(Val: CallerCall.call());
4359 assert(CI &&
4360 "Caller cannot be an allocation which should not have profiled calls");
4361 assert(CI->Clones.size() > CallerCall.cloneNo());
4362 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4363 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4364 // If we already assigned this callsite to call a specific non-default
4365 // clone (i.e. not the original function which is clone 0), ensure that we
4366 // aren't trying to now update it to call a different clone, which is
4367 // indicative of a bug in the graph or function assignment.
4368 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4369 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4370 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4371 << "\n");
4372 MismatchedCloneAssignments++;
4373 }
4374 CurCalleeCloneNo = NewCalleeCloneNo;
4375}
4376
4377// Update the debug information attached to NewFunc to use the clone Name. Note
4378// this needs to be done for both any existing DISubprogram for the definition,
4379// as well as any separate declaration DISubprogram.
4380static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4381 assert(Name == NewFunc->getName());
4382 auto *SP = NewFunc->getSubprogram();
4383 if (!SP)
4384 return;
4385 auto *MDName = MDString::get(Context&: NewFunc->getParent()->getContext(), Str: Name);
4386 SP->replaceLinkageName(LN: MDName);
4387 DISubprogram *Decl = SP->getDeclaration();
4388 if (!Decl)
4389 return;
4390 TempDISubprogram NewDecl = Decl->clone();
4391 NewDecl->replaceLinkageName(LN: MDName);
4392 SP->replaceDeclaration(Decl: MDNode::replaceWithUniqued(N: std::move(NewDecl)));
4393}
4394
4395CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4396 Instruction *>::FuncInfo
4397ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4398 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4399 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4400 // Use existing LLVM facilities for cloning and obtaining Call in clone
4401 ValueToValueMapTy VMap;
4402 auto *NewFunc = CloneFunction(F: Func.func(), VMap);
4403 std::string Name = getMemProfFuncName(Base: Func.func()->getName(), CloneNo);
4404 assert(!Func.func()->getParent()->getFunction(Name));
4405 NewFunc->setName(Name);
4406 updateSubprogramLinkageName(NewFunc, Name);
4407 for (auto &Inst : CallsWithMetadataInFunc) {
4408 // This map always has the initial version in it.
4409 assert(Inst.cloneNo() == 0);
4410 CallMap[Inst] = {cast<Instruction>(Val&: VMap[Inst.call()]), CloneNo};
4411 }
4412 OREGetter(Func.func())
4413 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4414 << "created clone " << ore::NV("NewFunction", NewFunc));
4415 return {NewFunc, CloneNo};
4416}
4417
4418CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4419 IndexCall>::FuncInfo
4420IndexCallsiteContextGraph::cloneFunctionForCallsite(
4421 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4422 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4423 // Check how many clones we have of Call (and therefore function).
4424 // The next clone number is the current size of versions array.
4425 // Confirm this matches the CloneNo provided by the caller, which is based on
4426 // the number of function clones we have.
4427 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4428 ? cast<AllocInfo *>(Call.call())->Versions.size()
4429 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4430 // Walk all the instructions in this function. Create a new version for
4431 // each (by adding an entry to the Versions/Clones summary array), and copy
4432 // over the version being called for the function clone being cloned here.
4433 // Additionally, add an entry to the CallMap for the new function clone,
4434 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4435 // to the new call clone.
4436 for (auto &Inst : CallsWithMetadataInFunc) {
4437 // This map always has the initial version in it.
4438 assert(Inst.cloneNo() == 0);
4439 if (auto *AI = dyn_cast<AllocInfo *>(Val: Inst.call())) {
4440 assert(AI->Versions.size() == CloneNo);
4441 // We assign the allocation type later (in updateAllocationCall), just add
4442 // an entry for it here.
4443 AI->Versions.push_back(Elt: 0);
4444 } else {
4445 auto *CI = cast<CallsiteInfo *>(Val: Inst.call());
4446 assert(CI && CI->Clones.size() == CloneNo);
4447 // We assign the clone number later (in updateCall), just add an entry for
4448 // it here.
4449 CI->Clones.push_back(Elt: 0);
4450 }
4451 CallMap[Inst] = {Inst.call(), CloneNo};
4452 }
4453 return {Func.func(), CloneNo};
4454}
4455
4456// We perform cloning for each allocation node separately. However, this
4457// sometimes results in a situation where the same node calls multiple
4458// clones of the same callee, created for different allocations. This
4459// causes issues when assigning functions to these clones, as each node can
4460// in reality only call a single callee clone.
4461//
4462// To address this, before assigning functions, merge callee clone nodes as
4463// needed using a post order traversal from the allocations. We attempt to
4464// use existing clones as the merge node when legal, and to share them
4465// among callers with the same properties (callers calling the same set of
4466// callee clone nodes for the same allocations).
4467//
4468// Without this fix, in some cases incorrect function assignment will lead
4469// to calling the wrong allocation clone.
4470template <typename DerivedCCG, typename FuncTy, typename CallTy>
4471void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4472 if (!MergeClones)
4473 return;
4474
4475 // Generate a map from context id to the associated allocation node for use
4476 // when merging clones.
4477 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4478 for (auto &Entry : AllocationCallToContextNodeMap) {
4479 auto *Node = Entry.second;
4480 for (auto Id : Node->getContextIds())
4481 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4482 for (auto *Clone : Node->Clones) {
4483 for (auto Id : Clone->getContextIds())
4484 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4485 }
4486 }
4487
4488 // Post order traversal starting from allocations to ensure each callsite
4489 // calls a single clone of its callee. Callee nodes that are clones of each
4490 // other are merged (via new merge nodes if needed) to achieve this.
4491 DenseSet<const ContextNode *> Visited;
4492 for (auto &Entry : AllocationCallToContextNodeMap) {
4493 auto *Node = Entry.second;
4494
4495 mergeClones(Node, Visited, ContextIdToAllocationNode);
4496
4497 // Make a copy so the recursive post order traversal that may create new
4498 // clones doesn't mess up iteration. Note that the recursive traversal
4499 // itself does not call mergeClones on any of these nodes, which are all
4500 // (clones of) allocations.
4501 auto Clones = Node->Clones;
4502 for (auto *Clone : Clones)
4503 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4504 }
4505
4506 if (DumpCCG) {
4507 dbgs() << "CCG after merging:\n";
4508 dbgs() << *this;
4509 }
4510 if (ExportToDot)
4511 exportToDot(Label: "aftermerge");
4512
4513 if (VerifyCCG) {
4514 check();
4515 }
4516}
4517
// Recursive helper for above mergeClones method.
//
// Performs a post order traversal over the caller subgraph reachable from
// Node, then invokes mergeNodeCalleeClones on Node once all of its callers
// have been handled. Each node is processed at most once (tracked via
// Visited).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
    DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;

  // Iteratively perform merging on this node to handle new caller nodes created
  // during the recursive traversal. We could do something more elegant such as
  // maintain a worklist, but this is a simple approach that doesn't cause a
  // measurable compile time effect, as most nodes don't have many caller
  // edges to check.
  bool FoundUnvisited = true;
  unsigned Iters = 0;
  while (FoundUnvisited) {
    Iters++;
    FoundUnvisited = false;
    // Make a copy since the recursive call may move a caller edge to a new
    // callee, messing up the iterator.
    auto CallerEdges = Node->CallerEdges;
    for (auto CallerEdge : CallerEdges) {
      // Skip any caller edge moved onto a different callee during recursion.
      if (CallerEdge->Callee != Node)
        continue;
      // If we found an unvisited caller, note that we should check the caller
      // edges again as mergeClones may add or change caller nodes.
      if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
        FoundUnvisited = true;
      mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
    }
  }

  // Record statistics about the fixed-point iteration behavior above.
  TotalMergeInvokes++;
  TotalMergeIters += Iters;
  if (Iters > MaxMergeIters)
    MaxMergeIters = Iters;

  // Merge for this node after we handle its callers.
  mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
}
4560
4561template <typename DerivedCCG, typename FuncTy, typename CallTy>
4562void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4563 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4564 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4565 // Ignore Node if we moved all of its contexts to clones.
4566 if (Node->emptyContextIds())
4567 return;
4568
4569 // First identify groups of clones among Node's callee edges, by building
4570 // a map from each callee base node to the associated callee edges from Node.
4571 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4572 OrigNodeToCloneEdges;
4573 for (const auto &E : Node->CalleeEdges) {
4574 auto *Callee = E->Callee;
4575 if (!Callee->CloneOf && Callee->Clones.empty())
4576 continue;
4577 ContextNode *Base = Callee->getOrigNode();
4578 OrigNodeToCloneEdges[Base].push_back(E);
4579 }
4580
4581 // Helper for callee edge sorting below. Return true if A's callee has fewer
4582 // caller edges than B, or if A is a clone and B is not, or if A's first
4583 // context id is smaller than B's.
4584 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4585 const std::shared_ptr<ContextEdge> &B) {
4586 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4587 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4588 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4589 return true;
4590 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4591 return false;
4592 // Use the first context id for each edge as a
4593 // tie-breaker.
4594 return *A->ContextIds.begin() < *B->ContextIds.begin();
4595 };
4596
4597 // Process each set of callee clones called by Node, performing the needed
4598 // merging.
4599 for (auto Entry : OrigNodeToCloneEdges) {
4600 // CalleeEdges is the set of edges from Node reaching callees that are
4601 // mutual clones of each other.
4602 auto &CalleeEdges = Entry.second;
4603 auto NumCalleeClones = CalleeEdges.size();
4604 // A single edge means there is no merging needed.
4605 if (NumCalleeClones == 1)
4606 continue;
4607 // Sort the CalleeEdges calling this group of clones in ascending order of
4608 // their caller edge counts, putting the original non-clone node first in
4609 // cases of a tie. This simplifies finding an existing node to use as the
4610 // merge node.
4611 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4612
4613 /// Find other callers of the given set of callee edges that can
4614 /// share the same callee merge node. See the comments at this method
4615 /// definition for details.
4616 DenseSet<ContextNode *> OtherCallersToShareMerge;
4617 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4618 OtherCallersToShareMerge);
4619
4620 // Now do the actual merging. Identify existing or create a new MergeNode
4621 // during the first iteration. Move each callee over, along with edges from
4622 // other callers we've determined above can share the same merge node.
4623 ContextNode *MergeNode = nullptr;
4624 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4625 for (auto CalleeEdge : CalleeEdges) {
4626 auto *OrigCallee = CalleeEdge->Callee;
4627 // If we don't have a MergeNode yet (only happens on the first iteration,
4628 // as a new one will be created when we go to move the first callee edge
4629 // over as needed), see if we can use this callee.
4630 if (!MergeNode) {
4631 // If there are no other callers, simply use this callee.
4632 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4633 MergeNode = OrigCallee;
4634 NonNewMergedNodes++;
4635 continue;
4636 }
4637 // Otherwise, if we have identified other caller nodes that can share
4638 // the merge node with Node, see if all of OrigCallee's callers are
4639 // going to share the same merge node. In that case we can use callee
4640 // (since all of its callers would move to the new merge node).
4641 if (!OtherCallersToShareMerge.empty()) {
4642 bool MoveAllCallerEdges = true;
4643 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4644 if (CalleeCallerE == CalleeEdge)
4645 continue;
4646 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4647 MoveAllCallerEdges = false;
4648 break;
4649 }
4650 }
4651 // If we are going to move all callers over, we can use this callee as
4652 // the MergeNode.
4653 if (MoveAllCallerEdges) {
4654 MergeNode = OrigCallee;
4655 NonNewMergedNodes++;
4656 continue;
4657 }
4658 }
4659 }
4660 // Move this callee edge, creating a new merge node if necessary.
4661 if (MergeNode) {
4662 assert(MergeNode != OrigCallee);
4663 moveEdgeToExistingCalleeClone(Edge: CalleeEdge, NewCallee: MergeNode,
4664 /*NewClone*/ false);
4665 } else {
4666 MergeNode = moveEdgeToNewCalleeClone(Edge: CalleeEdge);
4667 NewMergedNodes++;
4668 }
4669 // Now move all identified edges from other callers over to the merge node
4670 // as well.
4671 if (!OtherCallersToShareMerge.empty()) {
4672 // Make and iterate over a copy of OrigCallee's caller edges because
4673 // some of these will be moved off of the OrigCallee and that would mess
4674 // up the iteration from OrigCallee.
4675 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4676 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4677 if (CalleeCallerE == CalleeEdge)
4678 continue;
4679 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4680 continue;
4681 CallerToMoveCount[CalleeCallerE->Caller]++;
4682 moveEdgeToExistingCalleeClone(Edge: CalleeCallerE, NewCallee: MergeNode,
4683 /*NewClone*/ false);
4684 }
4685 }
4686 removeNoneTypeCalleeEdges(Node: OrigCallee);
4687 removeNoneTypeCalleeEdges(Node: MergeNode);
4688 }
4689 }
4690}
4691
// Look for other nodes that have edges to the same set of callee
// clones as the current Node. Those can share the eventual merge node
// (reducing cloning and binary size overhead) iff:
// - they have edges to the same set of callee clones
// - each callee edge reaches a subset of the same allocations as Node's
//   corresponding edge to the same callee clone.
// The second requirement is to ensure that we don't undo any of the
// necessary cloning to distinguish contexts with different allocation
// behavior.
// FIXME: This is somewhat conservative, as we really just need to ensure
// that they don't reach the same allocations as contexts on edges from Node
// going to any of the *other* callee clones being merged. However, that
// requires more tracking and checking to get right.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    findOtherCallersToShareMerge(
        ContextNode *Node,
        std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
        DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
        DenseSet<ContextNode *> &OtherCallersToShareMerge) {
  auto NumCalleeClones = CalleeEdges.size();
  // This map counts how many edges to the same callee clone exist for other
  // caller nodes of each callee clone.
  DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
  // Counts the number of other caller nodes that have edges to all callee
  // clones that don't violate the allocation context checking.
  unsigned PossibleOtherCallerNodes = 0;

  // We only need to look at other Caller nodes if the first callee edge has
  // multiple callers (recall they are sorted in ascending order above).
  if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
    return;

  // For each callee edge:
  // - Collect the count of other caller nodes calling the same callees.
  // - Collect the alloc nodes reached by contexts on each callee edge.
  DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // For each other caller of the same callee, increment the count of
    // edges reaching the same callee clone.
    for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
      // Node's own edge to this callee is not an "other" caller; it must be
      // the same edge we are currently walking.
      if (CalleeCallerEdges->Caller == Node) {
        assert(CalleeCallerEdges == CalleeEdge);
        continue;
      }
      OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
      // If this caller edge now reaches all of the same callee clones,
      // increment the count of candidate other caller nodes.
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
          NumCalleeClones)
        PossibleOtherCallerNodes++;
    }
    // Collect the alloc nodes reached by contexts on each callee edge, for
    // later analysis.
    for (auto Id : CalleeEdge->getContextIds()) {
      auto *Alloc = ContextIdToAllocationNode.lookup(Id);
      if (!Alloc) {
        // FIXME: unclear why this happens occasionally, presumably
        // imperfect graph updates possibly with recursion.
        MissingAllocForContextId++;
        continue;
      }
      CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
    }
  }

  // Now walk the callee edges again, and make sure that for each candidate
  // caller node all of its edges to the callees reach the same allocs (or
  // a subset) as those along the corresponding callee edge from Node.
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // Stop if we do not have any (more) candidate other caller nodes.
    if (!PossibleOtherCallerNodes)
      break;
    auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
    // Check each other caller of this callee clone.
    for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
      // Not interested in the callee edge from Node itself.
      if (CalleeCallerE == CalleeEdge)
        continue;
      // Skip any callers that didn't have callee edges to all the same
      // callee clones.
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
          NumCalleeClones)
        continue;
      // Make sure that each context along edge from candidate caller node
      // reaches an allocation also reached by this callee edge from Node.
      for (auto Id : CalleeCallerE->getContextIds()) {
        auto *Alloc = ContextIdToAllocationNode.lookup(Id);
        // Contexts with no known allocation were skipped above as well.
        if (!Alloc)
          continue;
        // If not, simply reset the map entry to 0 so caller is ignored, and
        // reduce the count of candidate other caller nodes.
        if (!CurCalleeAllocNodes.contains(Alloc)) {
          OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
          PossibleOtherCallerNodes--;
          break;
        }
      }
    }
  }

  if (!PossibleOtherCallerNodes)
    return;

  // Build the set of other caller nodes that can use the same callee merge
  // node.
  for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
    if (Count != NumCalleeClones)
      continue;
    OtherCallersToShareMerge.insert(OtherCaller);
  }
}
4806
4807// This method assigns cloned callsites to functions, cloning the functions as
4808// needed. The assignment is greedy and proceeds roughly as follows:
4809//
4810// For each function Func:
4811// For each call with graph Node having clones:
4812// Initialize ClonesWorklist to Node and its clones
4813// Initialize NodeCloneCount to 0
4814// While ClonesWorklist is not empty:
4815// Clone = pop front ClonesWorklist
4816// NodeCloneCount++
4817// If Func has been cloned less than NodeCloneCount times:
4818// If NodeCloneCount is 1:
4819// Assign Clone to original Func
4820// Continue
4821// Create a new function clone
4822// If other callers not assigned to call a function clone yet:
4823// Assign them to call new function clone
4824// Continue
4825// Assign any other caller calling the cloned version to new clone
4826//
4827// For each caller of Clone:
4828// If caller is assigned to call a specific function clone:
4829// If we cannot assign Clone to that function clone:
4830// Create new callsite Clone NewClone
4831// Add NewClone to ClonesWorklist
4832// Continue
4833// Assign Clone to existing caller's called function clone
4834// Else:
4835// If Clone not already assigned to a function clone:
4836// Assign to first function clone without assignment
4837// Assign caller to selected function clone
4838// For each call with graph Node having clones:
4839// If number func clones > number call's callsite Node clones:
4840// Record func CallInfo clones without Node clone in UnassignedCallClones
4841// For callsite Nodes in DFS order from allocations:
4842// If IsAllocation:
4843// Update allocation with alloc type
4844// Else:
//      For Call, all MatchingCalls, and associated UnassignedCallClones:
4846// Update call to call recorded callee clone
4847//
4848template <typename DerivedCCG, typename FuncTy, typename CallTy>
4849bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4850 bool Changed = false;
4851
4852 mergeClones();
4853
4854 // Keep track of the assignment of nodes (callsites) to function clones they
4855 // call.
4856 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4857
4858 // Update caller node to call function version CalleeFunc, by recording the
4859 // assignment in CallsiteToCalleeFuncCloneMap.
4860 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4861 const FuncInfo &CalleeFunc) {
4862 assert(Caller->hasCall());
4863 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4864 };
4865
4866 // Information for a single clone of this Func.
4867 struct FuncCloneInfo {
4868 // The function clone.
4869 FuncInfo FuncClone;
4870 // Remappings of each call of interest (from original uncloned call to the
4871 // corresponding cloned call in this function clone).
4872 DenseMap<CallInfo, CallInfo> CallMap;
4873 };
4874
4875 // Map to keep track of information needed to update calls in function clones
4876 // when their corresponding callsite node was not itself cloned for that
4877 // function clone. Because of call context pruning (i.e. we only keep as much
4878 // caller information as needed to distinguish hot vs cold), we may not have
4879 // caller edges coming to each callsite node from all possible function
4880 // callers. A function clone may get created for other callsites in the
4881 // function for which there are caller edges that were not pruned. Any other
  // callsites in that function clone, which were not themselves cloned for
4883 // that function clone, should get updated the same way as the corresponding
4884 // callsite in the original function (which may call a clone of its callee).
4885 //
4886 // We build this map after completing function cloning for each function, so
4887 // that we can record the information from its call maps before they are
4888 // destructed. The map will be used as we update calls to update any still
4889 // unassigned call clones. Note that we may create new node clones as we clone
4890 // other functions, so later on we check which node clones were still not
4891 // created. To this end, the inner map is a map from function clone number to
4892 // the list of calls cloned for that function (can be more than one due to the
4893 // Node's MatchingCalls array).
4894 //
4895 // The alternative is creating new callsite clone nodes below as we clone the
  // function, but that is trickier to get right and likely more overhead.
4897 //
4898 // Inner map is a std::map so sorted by key (clone number), in order to get
4899 // ordered remarks in the full LTO case.
4900 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4901 UnassignedCallClones;
4902
4903 // Walk all functions for which we saw calls with memprof metadata, and handle
4904 // cloning for each of its calls.
4905 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4906 FuncInfo OrigFunc(Func);
4907 // Map from each clone number of OrigFunc to information about that function
4908 // clone (the function clone FuncInfo and call remappings). The index into
4909 // the vector is the clone number, as function clones are created and
4910 // numbered sequentially.
4911 std::vector<FuncCloneInfo> FuncCloneInfos;
4912 for (auto &Call : CallsWithMetadata) {
4913 ContextNode *Node = getNodeForInst(C: Call);
4914 // Skip call if we do not have a node for it (all uses of its stack ids
4915 // were either on inlined chains or pruned from the MIBs), or if we did
4916 // not create any clones for it.
4917 if (!Node || Node->Clones.empty())
4918 continue;
4919 assert(Node->hasCall() &&
4920 "Not having a call should have prevented cloning");
4921
4922 // Track the assignment of function clones to clones of the current
4923 // callsite Node being handled.
4924 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4925
4926 // Assign callsite version CallsiteClone to function version FuncClone,
4927 // and also assign (possibly cloned) Call to CallsiteClone.
4928 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4929 CallInfo &Call,
4930 ContextNode *CallsiteClone,
4931 bool IsAlloc) {
4932 // Record the clone of callsite node assigned to this function clone.
4933 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4934
4935 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4936 DenseMap<CallInfo, CallInfo> &CallMap =
4937 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4938 CallInfo CallClone(Call);
4939 if (auto It = CallMap.find(Call); It != CallMap.end())
4940 CallClone = It->second;
4941 CallsiteClone->setCall(CallClone);
4942 // Need to do the same for all matching calls.
4943 for (auto &MatchingCall : Node->MatchingCalls) {
4944 CallInfo CallClone(MatchingCall);
4945 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4946 CallClone = It->second;
4947 // Updates the call in the list.
4948 MatchingCall = CallClone;
4949 }
4950 };
4951
4952 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4953 // performs the necessary fixups (removing none type edges, and
4954 // importantly, propagating any function call assignment of the original
4955 // node to the new clone).
4956 auto MoveEdgeToNewCalleeCloneAndSetUp =
4957 [&](const std::shared_ptr<ContextEdge> &Edge) {
4958 ContextNode *OrigCallee = Edge->Callee;
4959 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4960 removeNoneTypeCalleeEdges(Node: NewClone);
4961 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4962 // If the original Callee was already assigned to call a specific
4963 // function version, make sure its new clone is assigned to call
4964 // that same function clone.
4965 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4966 RecordCalleeFuncOfCallsite(
4967 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4968 return NewClone;
4969 };
4970
4971 // Keep track of the clones of callsite Node that need to be assigned to
4972 // function clones. This list may be expanded in the loop body below if we
4973 // find additional cloning is required.
4974 std::deque<ContextNode *> ClonesWorklist;
4975 // Ignore original Node if we moved all of its contexts to clones.
4976 if (!Node->emptyContextIds())
4977 ClonesWorklist.push_back(Node);
4978 llvm::append_range(ClonesWorklist, Node->Clones);
4979
4980 // Now walk through all of the clones of this callsite Node that we need,
4981 // and determine the assignment to a corresponding clone of the current
4982 // function (creating new function clones as needed).
4983 unsigned NodeCloneCount = 0;
4984 while (!ClonesWorklist.empty()) {
4985 ContextNode *Clone = ClonesWorklist.front();
4986 ClonesWorklist.pop_front();
4987 NodeCloneCount++;
4988 if (VerifyNodes)
4989 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4990
4991 // Need to create a new function clone if we have more callsite clones
4992 // than existing function clones, which would have been assigned to an
4993 // earlier clone in the list (we assign callsite clones to function
4994 // clones greedily).
4995 if (FuncCloneInfos.size() < NodeCloneCount) {
4996 // If this is the first callsite copy, assign to original function.
4997 if (NodeCloneCount == 1) {
4998 // Since FuncCloneInfos is empty in this case, no clones have
4999 // been created for this function yet, and no callers should have
5000 // been assigned a function clone for this callee node yet.
5001 assert(llvm::none_of(
5002 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5003 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5004 }));
5005 // Initialize with empty call map, assign Clone to original function
5006 // and its callers, and skip to the next clone.
5007 FuncCloneInfos.push_back(
5008 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
5009 AssignCallsiteCloneToFuncClone(
5010 OrigFunc, Call, Clone,
5011 AllocationCallToContextNodeMap.count(Call));
5012 for (auto &CE : Clone->CallerEdges) {
5013 // Ignore any caller that does not have a recorded callsite Call.
5014 if (!CE->Caller->hasCall())
5015 continue;
5016 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
5017 }
5018 continue;
5019 }
5020
5021 // First locate which copy of OrigFunc to clone again. If a caller
5022 // of this callsite clone was already assigned to call a particular
5023 // function clone, we need to redirect all of those callers to the
5024 // new function clone, and update their other callees within this
5025 // function.
5026 FuncInfo PreviousAssignedFuncClone;
5027 auto EI = llvm::find_if(
5028 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5029 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5030 });
5031 bool CallerAssignedToCloneOfFunc = false;
5032 if (EI != Clone->CallerEdges.end()) {
5033 const std::shared_ptr<ContextEdge> &Edge = *EI;
5034 PreviousAssignedFuncClone =
5035 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5036 CallerAssignedToCloneOfFunc = true;
5037 }
5038
5039 // Clone function and save it along with the CallInfo map created
5040 // during cloning in the FuncCloneInfos.
5041 DenseMap<CallInfo, CallInfo> NewCallMap;
5042 unsigned CloneNo = FuncCloneInfos.size();
5043 assert(CloneNo > 0 && "Clone 0 is the original function, which "
5044 "should already exist in the map");
5045 FuncInfo NewFuncClone = cloneFunctionForCallsite(
5046 Func&: OrigFunc, Call, CallMap&: NewCallMap, CallsWithMetadataInFunc&: CallsWithMetadata, CloneNo);
5047 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
5048 FunctionClonesAnalysis++;
5049 Changed = true;
5050
5051 // If no caller callsites were already assigned to a clone of this
5052 // function, we can simply assign this clone to the new func clone
5053 // and update all callers to it, then skip to the next clone.
5054 if (!CallerAssignedToCloneOfFunc) {
5055 AssignCallsiteCloneToFuncClone(
5056 NewFuncClone, Call, Clone,
5057 AllocationCallToContextNodeMap.count(Call));
5058 for (auto &CE : Clone->CallerEdges) {
5059 // Ignore any caller that does not have a recorded callsite Call.
5060 if (!CE->Caller->hasCall())
5061 continue;
5062 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5063 }
5064 continue;
5065 }
5066
5067 // We may need to do additional node cloning in this case.
5068 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5069 // that were previously assigned to call PreviousAssignedFuncClone,
5070 // to record that they now call NewFuncClone.
5071 // The none type edge removal may remove some of this Clone's caller
5072 // edges, if it is reached via another of its caller's callees.
5073 // Iterate over a copy and skip any that were removed.
5074 auto CallerEdges = Clone->CallerEdges;
5075 for (auto CE : CallerEdges) {
5076 // Skip any that have been removed on an earlier iteration.
5077 if (CE->isRemoved()) {
5078 assert(!is_contained(Clone->CallerEdges, CE));
5079 continue;
5080 }
5081 assert(CE);
5082 // Ignore any caller that does not have a recorded callsite Call.
5083 if (!CE->Caller->hasCall())
5084 continue;
5085
5086 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5087 // We subsequently fall through to later handling that
5088 // will perform any additional cloning required for
5089 // callers that were calling other function clones.
5090 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5091 PreviousAssignedFuncClone)
5092 continue;
5093
5094 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5095
5096 // If we are cloning a function that was already assigned to some
5097 // callers, then essentially we are creating new callsite clones
5098 // of the other callsites in that function that are reached by those
5099 // callers. Clone the other callees of the current callsite's caller
5100 // that were already assigned to PreviousAssignedFuncClone
5101 // accordingly. This is important since we subsequently update the
5102 // calls from the nodes in the graph and their assignments to callee
5103 // functions recorded in CallsiteToCalleeFuncCloneMap.
5104 // The none type edge removal may remove some of this caller's
5105 // callee edges, if it is reached via another of its callees.
5106 // Iterate over a copy and skip any that were removed.
5107 auto CalleeEdges = CE->Caller->CalleeEdges;
5108 for (auto CalleeEdge : CalleeEdges) {
5109 // Skip any that have been removed on an earlier iteration when
5110 // cleaning up newly None type callee edges.
5111 if (CalleeEdge->isRemoved()) {
5112 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5113 continue;
5114 }
5115 assert(CalleeEdge);
5116 ContextNode *Callee = CalleeEdge->Callee;
5117 // Skip the current callsite, we are looking for other
5118 // callsites Caller calls, as well as any that does not have a
5119 // recorded callsite Call.
5120 if (Callee == Clone || !Callee->hasCall())
5121 continue;
5122 // Skip direct recursive calls. We don't need/want to clone the
5123 // caller node again, and this loop will not behave as expected if
5124 // we tried.
5125 if (Callee == CalleeEdge->Caller)
5126 continue;
5127 ContextNode *NewClone =
5128 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5129 // Moving the edge may have resulted in some none type
5130 // callee edges on the original Callee.
5131 removeNoneTypeCalleeEdges(Node: Callee);
5132 // Update NewClone with the new Call clone of this callsite's Call
5133 // created for the new function clone created earlier.
5134 // Recall that we have already ensured when building the graph
5135 // that each caller can only call callsites within the same
5136 // function, so we are guaranteed that Callee Call is in the
5137 // current OrigFunc.
5138 // CallMap is set up as indexed by original Call at clone 0.
5139 CallInfo OrigCall(Callee->getOrigNode()->Call);
5140 OrigCall.setCloneNo(0);
5141 DenseMap<CallInfo, CallInfo> &CallMap =
5142 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5143 assert(CallMap.count(OrigCall));
5144 CallInfo NewCall(CallMap[OrigCall]);
5145 assert(NewCall);
5146 NewClone->setCall(NewCall);
5147 // Need to do the same for all matching calls.
5148 for (auto &MatchingCall : NewClone->MatchingCalls) {
5149 CallInfo OrigMatchingCall(MatchingCall);
5150 OrigMatchingCall.setCloneNo(0);
5151 assert(CallMap.count(OrigMatchingCall));
5152 CallInfo NewCall(CallMap[OrigMatchingCall]);
5153 assert(NewCall);
5154 // Updates the call in the list.
5155 MatchingCall = NewCall;
5156 }
5157 }
5158 }
5159 // Fall through to handling below to perform the recording of the
5160 // function for this callsite clone. This enables handling of cases
5161 // where the callers were assigned to different clones of a function.
5162 }
5163
5164 auto FindFirstAvailFuncClone = [&]() {
5165 // Find first function in FuncCloneInfos without an assigned
5166 // clone of this callsite Node. We should always have one
5167 // available at this point due to the earlier cloning when the
5168 // FuncCloneInfos size was smaller than the clone number.
5169 for (auto &CF : FuncCloneInfos) {
5170 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5171 return CF.FuncClone;
5172 }
5173 llvm_unreachable(
5174 "Expected an available func clone for this callsite clone");
5175 };
5176
5177 // See if we can use existing function clone. Walk through
5178 // all caller edges to see if any have already been assigned to
5179 // a clone of this callsite's function. If we can use it, do so. If not,
5180 // because that function clone is already assigned to a different clone
5181 // of this callsite, then we need to clone again.
5182 // Basically, this checking is needed to handle the case where different
5183 // caller functions/callsites may need versions of this function
5184 // containing different mixes of callsite clones across the different
5185 // callsites within the function. If that happens, we need to create
5186 // additional function clones to handle the various combinations.
5187 //
5188 // Keep track of any new clones of this callsite created by the
5189 // following loop, as well as any existing clone that we decided to
5190 // assign this clone to.
5191 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5192 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5193 // Iterate over a copy of Clone's caller edges, since we may need to
5194 // remove edges in the moveEdgeTo* methods, and this simplifies the
5195 // handling and makes it less error-prone.
5196 auto CloneCallerEdges = Clone->CallerEdges;
5197 for (auto &Edge : CloneCallerEdges) {
5198 // Skip removed edges (due to direct recursive edges updated when
5199 // updating callee edges when moving an edge and subsequently
5200 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5201 if (Edge->isRemoved())
5202 continue;
5203 // Ignore any caller that does not have a recorded callsite Call.
5204 if (!Edge->Caller->hasCall())
5205 continue;
5206 // If this caller already assigned to call a version of OrigFunc, need
5207 // to ensure we can assign this callsite clone to that function clone.
5208 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5209 FuncInfo FuncCloneCalledByCaller =
5210 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5211 // First we need to confirm that this function clone is available
5212 // for use by this callsite node clone.
5213 //
5214 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5215 // its callsite clones, one of those callsite clones X could have
5216 // been assigned to the same function clone called by Edge's caller
5217 // - if Edge's caller calls another callsite within Node's original
5218 // function, and that callsite has another caller reaching clone X.
5219 // We need to clone Node again in this case.
5220 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5221 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5222 Clone) ||
5223 // Detect when we have multiple callers of this callsite that
5224 // have already been assigned to specific, and different, clones
5225 // of OrigFunc (due to other unrelated callsites in Func they
5226 // reach via call contexts). Is this Clone of callsite Node
5227 // assigned to a different clone of OrigFunc? If so, clone Node
5228 // again.
5229 (FuncCloneAssignedToCurCallsiteClone &&
5230 FuncCloneAssignedToCurCallsiteClone !=
5231 FuncCloneCalledByCaller)) {
5232 // We need to use a different newly created callsite clone, in
5233 // order to assign it to another new function clone on a
5234 // subsequent iteration over the Clones array (adjusted below).
5235 // Note we specifically do not reset the
5236 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5237 // when this new clone is processed later we know which version of
5238 // the function to copy (so that other callsite clones we have
5239 // assigned to that function clone are properly cloned over). See
5240 // comments in the function cloning handling earlier.
5241
5242 // Check if we already have cloned this callsite again while
5243 // walking through caller edges, for a caller calling the same
5244 // function clone. If so, we can move this edge to that new clone
5245 // rather than creating yet another new clone.
5246 if (FuncCloneToNewCallsiteCloneMap.count(
5247 FuncCloneCalledByCaller)) {
5248 ContextNode *NewClone =
5249 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5250 moveEdgeToExistingCalleeClone(Edge, NewCallee: NewClone);
5251 // Cleanup any none type edges cloned over.
5252 removeNoneTypeCalleeEdges(Node: NewClone);
5253 } else {
5254 // Create a new callsite clone.
5255 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5256 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5257 NewClone;
5258 // Add to list of clones and process later.
5259 ClonesWorklist.push_back(NewClone);
5260 }
5261 // Moving the caller edge may have resulted in some none type
5262 // callee edges.
5263 removeNoneTypeCalleeEdges(Node: Clone);
5264 // We will handle the newly created callsite clone in a subsequent
5265 // iteration over this Node's Clones.
5266 continue;
5267 }
5268
5269 // Otherwise, we can use the function clone already assigned to this
5270 // caller.
5271 if (!FuncCloneAssignedToCurCallsiteClone) {
5272 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5273 // Assign Clone to FuncCloneCalledByCaller
5274 AssignCallsiteCloneToFuncClone(
5275 FuncCloneCalledByCaller, Call, Clone,
5276 AllocationCallToContextNodeMap.count(Call));
5277 } else
5278 // Don't need to do anything - callsite is already calling this
5279 // function clone.
5280 assert(FuncCloneAssignedToCurCallsiteClone ==
5281 FuncCloneCalledByCaller);
5282
5283 } else {
5284 // We have not already assigned this caller to a version of
5285 // OrigFunc. Do the assignment now.
5286
5287 // First check if we have already assigned this callsite clone to a
5288 // clone of OrigFunc for another caller during this iteration over
5289 // its caller edges.
5290 if (!FuncCloneAssignedToCurCallsiteClone) {
5291 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5292 assert(FuncCloneAssignedToCurCallsiteClone);
5293 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5294 AssignCallsiteCloneToFuncClone(
5295 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5296 AllocationCallToContextNodeMap.count(Call));
5297 } else
5298 assert(FuncCloneToCurNodeCloneMap
5299 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5300 // Update callers to record function version called.
5301 RecordCalleeFuncOfCallsite(Edge->Caller,
5302 FuncCloneAssignedToCurCallsiteClone);
5303 }
5304 }
5305 // If we didn't assign a function clone to this callsite clone yet, e.g.
5306 // none of its callers has a non-null call, do the assignment here.
5307 // We want to ensure that every callsite clone is assigned to some
5308 // function clone, so that the call updates below work as expected.
5309 // In particular if this is the original callsite, we want to ensure it
5310 // is assigned to the original function, otherwise the original function
5311 // will appear available for assignment to other callsite clones,
5312 // leading to unintended effects. For one, the unknown and not updated
5313 // callers will call into cloned paths leading to the wrong hints,
5314 // because they still call the original function (clone 0). Also,
5315 // because all callsites start out as being clone 0 by default, we can't
5316 // easily distinguish between callsites explicitly assigned to clone 0
5317 // vs those never assigned, which can lead to multiple updates of the
5318 // calls when invoking updateCall below, with mismatched clone values.
5319 // TODO: Add a flag to the callsite nodes or some other mechanism to
5320 // better distinguish and identify callsite clones that are not getting
5321 // assigned to function clones as expected.
5322 if (!FuncCloneAssignedToCurCallsiteClone) {
5323 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5324 assert(FuncCloneAssignedToCurCallsiteClone &&
5325 "No available func clone for this callsite clone");
5326 AssignCallsiteCloneToFuncClone(
5327 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5328 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5329 }
5330 }
5331 if (VerifyCCG) {
5332 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5333 for (const auto &PE : Node->CalleeEdges)
5334 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5335 for (const auto &CE : Node->CallerEdges)
5336 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5337 for (auto *Clone : Node->Clones) {
5338 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5339 for (const auto &PE : Clone->CalleeEdges)
5340 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5341 for (const auto &CE : Clone->CallerEdges)
5342 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5343 }
5344 }
5345 }
5346
5347 if (FuncCloneInfos.size() < 2)
5348 continue;
5349
5350 // In this case there is more than just the original function copy.
5351 // Record call clones of any callsite nodes in the function that did not
5352 // themselves get cloned for all of the function clones.
5353 for (auto &Call : CallsWithMetadata) {
5354 ContextNode *Node = getNodeForInst(C: Call);
5355 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5356 continue;
5357 // If Node has enough clones already to cover all function clones, we can
5358 // skip it. Need to add one for the original copy.
5359 // Use >= in case there were clones that were skipped due to having empty
5360 // context ids
5361 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5362 continue;
5363 // First collect all function clones we cloned this callsite node for.
5364 // They may not be sequential due to empty clones e.g.
5365 DenseSet<unsigned> NodeCallClones;
5366 for (auto *C : Node->Clones)
5367 NodeCallClones.insert(C->Call.cloneNo());
5368 unsigned I = 0;
5369 // Now check all the function clones.
5370 for (auto &FC : FuncCloneInfos) {
5371 // Function clones should be sequential.
5372 assert(FC.FuncClone.cloneNo() == I);
5373 // Skip the first clone which got the original call.
5374 // Also skip any other clones created for this Node.
5375 if (++I == 1 || NodeCallClones.contains(V: I)) {
5376 continue;
5377 }
5378 // Record the call clones created for this callsite in this function
5379 // clone.
5380 auto &CallVector = UnassignedCallClones[Node][I];
5381 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5382 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5383 CallInfo CallClone = It->second;
5384 CallVector.push_back(CallClone);
5385 } else {
5386 // All but the original clone (skipped earlier) should have an entry
5387 // for all calls.
5388 assert(false && "Expected to find call in CallMap");
5389 }
5390 // Need to do the same for all matching calls.
5391 for (auto &MatchingCall : Node->MatchingCalls) {
5392 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5393 CallInfo CallClone = It->second;
5394 CallVector.push_back(CallClone);
5395 } else {
5396 // All but the original clone (skipped earlier) should have an entry
5397 // for all calls.
5398 assert(false && "Expected to find call in CallMap");
5399 }
5400 }
5401 }
5402 }
5403 }
5404
5405 uint8_t BothTypes =
5406 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5407
5408 auto UpdateCalls = [&](ContextNode *Node,
5409 DenseSet<const ContextNode *> &Visited,
5410 auto &&UpdateCalls) {
5411 auto Inserted = Visited.insert(Node);
5412 if (!Inserted.second)
5413 return;
5414
5415 for (auto *Clone : Node->Clones)
5416 UpdateCalls(Clone, Visited, UpdateCalls);
5417
5418 for (auto &Edge : Node->CallerEdges)
5419 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5420
5421 // Skip if either no call to update, or if we ended up with no context ids
5422 // (we moved all edges onto other clones).
5423 if (!Node->hasCall() || Node->emptyContextIds())
5424 return;
5425
5426 if (Node->IsAllocation) {
5427 auto AT = allocTypeToUse(Node->AllocTypes);
5428 // If the allocation type is ambiguous, and more aggressive hinting
5429 // has been enabled via the MinClonedColdBytePercent flag, see if this
5430 // allocation should be hinted cold anyway because its fraction cold bytes
5431 // allocated is at least the given threshold.
5432 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5433 !ContextIdToContextSizeInfos.empty()) {
5434 uint64_t TotalCold = 0;
5435 uint64_t Total = 0;
5436 for (auto Id : Node->getContextIds()) {
5437 auto TypeI = ContextIdToAllocationType.find(Id);
5438 assert(TypeI != ContextIdToAllocationType.end());
5439 auto CSI = ContextIdToContextSizeInfos.find(Id);
5440 if (CSI != ContextIdToContextSizeInfos.end()) {
5441 for (auto &Info : CSI->second) {
5442 Total += Info.TotalSize;
5443 if (TypeI->second == AllocationType::Cold)
5444 TotalCold += Info.TotalSize;
5445 }
5446 }
5447 }
5448 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5449 AT = AllocationType::Cold;
5450 }
5451 updateAllocationCall(Call&: Node->Call, AllocType: AT);
5452 assert(Node->MatchingCalls.empty());
5453 return;
5454 }
5455
5456 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5457 return;
5458
5459 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5460 updateCall(CallerCall&: Node->Call, CalleeFunc);
5461 // Update all the matching calls as well.
5462 for (auto &Call : Node->MatchingCalls)
5463 updateCall(CallerCall&: Call, CalleeFunc);
5464
5465 // Now update all calls recorded earlier that are still in function clones
5466 // which don't have a clone of this callsite node.
5467 if (!UnassignedCallClones.contains(Node))
5468 return;
5469 DenseSet<unsigned> NodeCallClones;
5470 for (auto *C : Node->Clones)
5471 NodeCallClones.insert(C->Call.cloneNo());
5472 // Note that we already confirmed Node is in this map a few lines above.
5473 auto &ClonedCalls = UnassignedCallClones[Node];
5474 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5475 // Should start at 1 as we never create an entry for original node.
5476 assert(CloneNo > 0);
5477 // If we subsequently created a clone, skip this one.
5478 if (NodeCallClones.contains(V: CloneNo))
5479 continue;
5480 // Use the original Node's CalleeFunc.
5481 for (auto &Call : CallVector)
5482 updateCall(CallerCall&: Call, CalleeFunc);
5483 }
5484 };
5485
5486 // Performs DFS traversal starting from allocation nodes to update calls to
5487 // reflect cloning decisions recorded earlier. For regular LTO this will
5488 // update the actual calls in the IR to call the appropriate function clone
5489 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5490 // are recorded in the summary entries.
5491 DenseSet<const ContextNode *> Visited;
5492 for (auto &Entry : AllocationCallToContextNodeMap)
5493 UpdateCalls(Entry.second, Visited, UpdateCalls);
5494
5495 return Changed;
5496}
5497
5498// Compute a SHA1 hash of the callsite and alloc version information of clone I
5499// in the summary, to use in detection of duplicate clones.
5500uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5501 SHA1 Hasher;
5502 // Update hash with any callsites that call non-default (non-zero) callee
5503 // versions.
5504 for (auto &SN : FS->callsites()) {
5505 // In theory all callsites and allocs in this function should have the same
5506 // number of clone entries, but handle any discrepancies gracefully below
5507 // for NDEBUG builds.
5508 assert(
5509 SN.Clones.size() > I &&
5510 "Callsite summary has fewer entries than other summaries in function");
5511 if (SN.Clones.size() <= I || !SN.Clones[I])
5512 continue;
5513 uint8_t Data[sizeof(SN.Clones[I])];
5514 support::endian::write32le(P: Data, V: SN.Clones[I]);
5515 Hasher.update(Data);
5516 }
5517 // Update hash with any allocs that have non-default (non-None) hints.
5518 for (auto &AN : FS->allocs()) {
5519 // In theory all callsites and allocs in this function should have the same
5520 // number of clone entries, but handle any discrepancies gracefully below
5521 // for NDEBUG builds.
5522 assert(AN.Versions.size() > I &&
5523 "Alloc summary has fewer entries than other summaries in function");
5524 if (AN.Versions.size() <= I ||
5525 (AllocationType)AN.Versions[I] == AllocationType::None)
5526 continue;
5527 Hasher.update(Data: ArrayRef<uint8_t>(&AN.Versions[I], 1));
5528 }
5529 return support::endian::read64le(P: Hasher.result().data());
5530}
5531
5532static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5533 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5534 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5535 &FuncToAliasMap,
5536 FunctionSummary *FS) {
5537 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5538 // We might have created this when adjusting callsite in another
5539 // function. It should be a declaration.
5540 assert(DeclGV->isDeclaration());
5541 NewGV->takeName(V: DeclGV);
5542 DeclGV->replaceAllUsesWith(V: NewGV);
5543 DeclGV->eraseFromParent();
5544 };
5545
5546 // Handle aliases to this function, and create analogous alias clones to the
5547 // provided clone of this function.
5548 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5549 if (!FuncToAliasMap.count(x: &F))
5550 return;
5551 for (auto *A : FuncToAliasMap[&F]) {
5552 std::string AliasName = getMemProfFuncName(Base: A->getName(), CloneNo: I);
5553 auto *PrevA = M.getNamedAlias(Name: AliasName);
5554 auto *NewA = GlobalAlias::create(Ty: A->getValueType(),
5555 AddressSpace: A->getType()->getPointerAddressSpace(),
5556 Linkage: A->getLinkage(), Name: AliasName, Aliasee: NewF);
5557 NewA->copyAttributesFrom(Src: A);
5558 if (PrevA)
5559 TakeDeclNameAndReplace(PrevA, NewA);
5560 }
5561 };
5562
5563 // The first "clone" is the original copy, we should only call this if we
5564 // needed to create new clones.
5565 assert(NumClones > 1);
5566 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5567 VMaps.reserve(N: NumClones - 1);
5568 FunctionsClonedThinBackend++;
5569
5570 // Map of hash of callsite/alloc versions to the instantiated function clone
5571 // (possibly the original) implementing those calls. Used to avoid
5572 // instantiating duplicate function clones.
5573 // FIXME: Ideally the thin link would not generate such duplicate clones to
5574 // start with, but right now it happens due to phase ordering in the function
5575 // assignment and possible new clones that produces. We simply make each
5576 // duplicate an alias to the matching instantiated clone recorded in the map
5577 // (except for available_externally which are made declarations as they would
5578 // be aliases in the prevailing module, and available_externally aliases are
5579 // not well supported right now).
5580 DenseMap<uint64_t, Function *> HashToFunc;
5581
5582 // Save the hash of the original function version.
5583 HashToFunc[ComputeHash(FS, I: 0)] = &F;
5584
5585 for (unsigned I = 1; I < NumClones; I++) {
5586 VMaps.emplace_back(Args: std::make_unique<ValueToValueMapTy>());
5587 std::string Name = getMemProfFuncName(Base: F.getName(), CloneNo: I);
5588 auto Hash = ComputeHash(FS, I);
5589 // If this clone would duplicate a previously seen clone, don't generate the
5590 // duplicate clone body, just make an alias to satisfy any (potentially
5591 // cross-module) references.
5592 if (HashToFunc.contains(Val: Hash)) {
5593 FunctionCloneDuplicatesThinBackend++;
5594 auto *Func = HashToFunc[Hash];
5595 if (Func->hasAvailableExternallyLinkage()) {
5596 // Skip these as EliminateAvailableExternallyPass does not handle
5597 // available_externally aliases correctly and we end up with an
5598 // available_externally alias to a declaration. Just create a
5599 // declaration for now as we know we will have a definition in another
5600 // module.
5601 auto Decl = M.getOrInsertFunction(Name, T: Func->getFunctionType());
5602 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5603 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5604 continue;
5605 }
5606 auto *PrevF = M.getFunction(Name);
5607 auto *Alias = GlobalAlias::create(Name, Aliasee: Func);
5608 if (PrevF)
5609 TakeDeclNameAndReplace(PrevF, Alias);
5610 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5611 << "created clone alias " << ore::NV("Alias", Alias));
5612
5613 // Now handle aliases to this function, and clone those as well.
5614 CloneFuncAliases(Func, I);
5615 continue;
5616 }
5617 auto *NewF = CloneFunction(F: &F, VMap&: *VMaps.back());
5618 HashToFunc[Hash] = NewF;
5619 FunctionClonesThinBackend++;
5620 // Strip memprof and callsite metadata from clone as they are no longer
5621 // needed.
5622 for (auto &BB : *NewF) {
5623 for (auto &Inst : BB) {
5624 Inst.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
5625 Inst.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
5626 }
5627 }
5628 auto *PrevF = M.getFunction(Name);
5629 if (PrevF)
5630 TakeDeclNameAndReplace(PrevF, NewF);
5631 else
5632 NewF->setName(Name);
5633 updateSubprogramLinkageName(NewFunc: NewF, Name);
5634 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5635 << "created clone " << ore::NV("NewFunction", NewF));
5636
5637 // Now handle aliases to this function, and clone those as well.
5638 CloneFuncAliases(NewF, I);
5639 }
5640 return VMaps;
5641}
5642
5643// Locate the summary for F. This is complicated by the fact that it might
5644// have been internalized or promoted.
5645static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5646 const ModuleSummaryIndex *ImportSummary,
5647 const Function *CallingFunc = nullptr) {
5648 // FIXME: Ideally we would retain the original GUID in some fashion on the
5649 // function (e.g. as metadata), but for now do our best to locate the
5650 // summary without that information.
5651 ValueInfo TheFnVI = ImportSummary->getValueInfo(GUID: F.getGUID());
5652 if (!TheFnVI)
5653 // See if theFn was internalized, by checking index directly with
5654 // original name (this avoids the name adjustment done by getGUID() for
5655 // internal symbols).
5656 TheFnVI = ImportSummary->getValueInfo(
5657 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: F.getName()));
5658 if (TheFnVI)
5659 return TheFnVI;
5660 // Now query with the original name before any promotion was performed.
5661 StringRef OrigName =
5662 ModuleSummaryIndex::getOriginalNameBeforePromote(Name: F.getName());
5663 // When this pass is enabled, we always add thinlto_src_file provenance
5664 // metadata to imported function definitions, which allows us to recreate the
5665 // original internal symbol's GUID.
5666 auto SrcFileMD = F.getMetadata(Kind: "thinlto_src_file");
5667 // If this is a call to an imported/promoted local for which we didn't import
5668 // the definition, the metadata will not exist on the declaration. However,
5669 // since we are doing this early, before any inlining in the LTO backend, we
5670 // can simply look at the metadata on the calling function which must have
5671 // been from the same module if F was an internal symbol originally.
5672 if (!SrcFileMD && F.isDeclaration()) {
5673 // We would only call this for a declaration for a direct callsite, in which
5674 // case the caller would have provided the calling function pointer.
5675 assert(CallingFunc);
5676 SrcFileMD = CallingFunc->getMetadata(Kind: "thinlto_src_file");
5677 // If this is a promoted local (OrigName != F.getName()), since this is a
5678 // declaration, it must be imported from a different module and therefore we
5679 // should always find the metadata on its calling function. Any call to a
5680 // promoted local that came from this module should still be a definition.
5681 assert(SrcFileMD || OrigName == F.getName());
5682 }
5683 StringRef SrcFile = M.getSourceFileName();
5684 if (SrcFileMD)
5685 SrcFile = dyn_cast<MDString>(Val: SrcFileMD->getOperand(I: 0))->getString();
5686 std::string OrigId = GlobalValue::getGlobalIdentifier(
5687 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5688 TheFnVI = ImportSummary->getValueInfo(
5689 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5690 // Internal func in original module may have gotten a numbered suffix if we
5691 // imported an external function with the same name. This happens
5692 // automatically during IR linking for naming conflicts. It would have to
5693 // still be internal in that case (otherwise it would have been renamed on
5694 // promotion in which case we wouldn't have a naming conflict).
5695 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5696 F.getName().contains(C: '.')) {
5697 OrigName = F.getName().rsplit(Separator: '.').first;
5698 OrigId = GlobalValue::getGlobalIdentifier(
5699 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5700 TheFnVI = ImportSummary->getValueInfo(
5701 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5702 }
5703 // The only way we may not have a VI is if this is a declaration created for
5704 // an imported reference. For distributed ThinLTO we may not have a VI for
5705 // such declarations in the distributed summary.
5706 assert(TheFnVI || F.isDeclaration());
5707 return TheFnVI;
5708}
5709
5710bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5711 Module &M) {
5712 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5713 Symtab = std::make_unique<InstrProfSymtab>();
5714 // Don't add canonical names, to avoid multiple functions to the symtab
5715 // when they both have the same root name with "." suffixes stripped.
5716 // If we pick the wrong one then this could lead to incorrect ICP and calling
5717 // a memprof clone that we don't actually create (resulting in linker unsats).
5718 // What this means is that the GUID of the function (or its PGOFuncName
5719 // metadata) *must* match that in the VP metadata to allow promotion.
5720 // In practice this should not be a limitation, since local functions should
5721 // have PGOFuncName metadata and global function names shouldn't need any
5722 // special handling (they should not get the ".llvm.*" suffix that the
5723 // canonicalization handling is attempting to strip).
5724 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5725 std::string SymtabFailure = toString(E: std::move(E));
5726 M.getContext().emitError(ErrorStr: "Failed to create symtab: " + SymtabFailure);
5727 return false;
5728 }
5729 return true;
5730}
5731
#ifndef NDEBUG
// Sanity check that the MIB stack ids match between the summary and
// instruction metadata.
//
// Walks the summary MIBs in lock-step with the memprof metadata's MIB
// operands (the former were generated from the latter during
// ModuleSummaryAnalysis, so they must correspond 1:1), and verifies that each
// recorded stack id index resolves to the same frame id as the metadata's
// call stack, after skipping the prefix shared with CallsiteContext and any
// directly-recursive duplicate frames.
static void checkAllocContextIds(
    const AllocInfo &AllocNode, const MDNode *MemProfMD,
    const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
    const ModuleSummaryIndex *ImportSummary) {
  auto MIBIter = AllocNode.MIBs.begin();
  for (auto &MDOp : MemProfMD->operands()) {
    // One summary MIB per metadata MIB operand.
    assert(MIBIter != AllocNode.MIBs.end());
    auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
    auto *MIBMD = cast<const MDNode>(MDOp);
    MDNode *StackMDNode = getMIBStackNode(MIBMD);
    assert(StackMDNode);
    CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
    // Only compare the portion of the stack beyond the frames already
    // implied by the callsite's own context.
    auto ContextIterBegin =
        StackContext.beginAfterSharedPrefix(CallsiteContext);
    // Skip the checking on the first iteration: seed LastStackContextId with
    // a value guaranteed to differ from the first entry so the
    // duplicate-frame skip below cannot trigger spuriously.
    uint64_t LastStackContextId =
        (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
                                                                           : 0;
    for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
         ++ContextIter) {
      // If this is a direct recursion, simply skip the duplicate
      // entries, to be consistent with how the summary ids were
      // generated during ModuleSummaryAnalysis.
      if (LastStackContextId == *ContextIter)
        continue;
      LastStackContextId = *ContextIter;
      // Each non-duplicate frame must have a matching stack id index in the
      // summary, resolving to the same frame id.
      assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
      assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
             *ContextIter);
      StackIdIndexIter++;
    }
    MIBIter++;
  }
}
#endif
5770
5771bool MemProfContextDisambiguation::applyImport(Module &M) {
5772 assert(ImportSummary);
5773 bool Changed = false;
5774
5775 // We also need to clone any aliases that reference cloned functions, because
5776 // the modified callsites may invoke via the alias. Keep track of the aliases
5777 // for each function.
5778 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5779 FuncToAliasMap;
5780 for (auto &A : M.aliases()) {
5781 auto *Aliasee = A.getAliaseeObject();
5782 if (auto *F = dyn_cast<Function>(Val: Aliasee))
5783 FuncToAliasMap[F].insert(Ptr: &A);
5784 }
5785
5786 if (!initializeIndirectCallPromotionInfo(M))
5787 return false;
5788
5789 for (auto &F : M) {
5790 if (F.isDeclaration() || isMemProfClone(F))
5791 continue;
5792
5793 OptimizationRemarkEmitter ORE(&F);
5794
5795 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5796 bool ClonesCreated = false;
5797 unsigned NumClonesCreated = 0;
5798 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5799 // We should at least have version 0 which is the original copy.
5800 assert(NumClones > 0);
5801 // If only one copy needed use original.
5802 if (NumClones == 1)
5803 return;
5804 // If we already performed cloning of this function, confirm that the
5805 // requested number of clones matches (the thin link should ensure the
5806 // number of clones for each constituent callsite is consistent within
5807 // each function), before returning.
5808 if (ClonesCreated) {
5809 assert(NumClonesCreated == NumClones);
5810 return;
5811 }
5812 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5813 // The first "clone" is the original copy, which doesn't have a VMap.
5814 assert(VMaps.size() == NumClones - 1);
5815 Changed = true;
5816 ClonesCreated = true;
5817 NumClonesCreated = NumClones;
5818 };
5819
5820 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5821 Function *CalledFunction, FunctionSummary *FS) {
5822 // Perform cloning if not yet done.
5823 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5824
5825 assert(!isMemProfClone(*CalledFunction));
5826
5827 // Because we update the cloned calls by calling setCalledOperand (see
5828 // comment below), out of an abundance of caution make sure the called
5829 // function was actually the called operand (or its aliasee). We also
5830 // strip pointer casts when looking for calls (to match behavior during
5831 // summary generation), however, with opaque pointers in theory this
5832 // should not be an issue. Note we still clone the current function
5833 // (containing this call) above, as that could be needed for its callers.
5834 auto *GA = dyn_cast_or_null<GlobalAlias>(Val: CB->getCalledOperand());
5835 if (CalledFunction != CB->getCalledOperand() &&
5836 (!GA || CalledFunction != GA->getAliaseeObject())) {
5837 SkippedCallsCloning++;
5838 return;
5839 }
5840 // Update the calls per the summary info.
5841 // Save orig name since it gets updated in the first iteration
5842 // below.
5843 auto CalleeOrigName = CalledFunction->getName();
5844 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5845 // If the VMap is empty, this clone was a duplicate of another and was
5846 // created as an alias or a declaration.
5847 if (J > 0 && VMaps[J - 1]->empty())
5848 continue;
5849 // Do nothing if this version calls the original version of its
5850 // callee.
5851 if (!StackNode.Clones[J])
5852 continue;
5853 auto NewF = M.getOrInsertFunction(
5854 Name: getMemProfFuncName(Base: CalleeOrigName, CloneNo: StackNode.Clones[J]),
5855 T: CalledFunction->getFunctionType());
5856 CallBase *CBClone;
5857 // Copy 0 is the original function.
5858 if (!J)
5859 CBClone = CB;
5860 else
5861 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
5862 // Set the called operand directly instead of calling setCalledFunction,
5863 // as the latter mutates the function type on the call. In rare cases
5864 // we may have a slightly different type on a callee function
5865 // declaration due to it being imported from a different module with
5866 // incomplete types. We really just want to change the name of the
5867 // function to the clone, and not make any type changes.
5868 CBClone->setCalledOperand(NewF.getCallee());
5869 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5870 << ore::NV("Call", CBClone) << " in clone "
5871 << ore::NV("Caller", CBClone->getFunction())
5872 << " assigned to call function clone "
5873 << ore::NV("Callee", NewF.getCallee()));
5874 }
5875 };
5876
5877 // Locate the summary for F.
5878 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5879 // If not found, this could be an imported local (see comment in
5880 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5881 // module (where it would have been promoted to global scope so should
5882 // satisfy any reference in this module).
5883 if (!TheFnVI)
5884 continue;
5885
5886 auto *GVSummary =
5887 ImportSummary->findSummaryInModule(VI: TheFnVI, ModuleId: M.getModuleIdentifier());
5888 if (!GVSummary) {
5889 // Must have been imported, use the summary which matches the definition。
5890 // (might be multiple if this was a linkonce_odr).
5891 auto SrcModuleMD = F.getMetadata(Kind: "thinlto_src_module");
5892 assert(SrcModuleMD &&
5893 "enable-import-metadata is needed to emit thinlto_src_module");
5894 StringRef SrcModule =
5895 dyn_cast<MDString>(Val: SrcModuleMD->getOperand(I: 0))->getString();
5896 for (auto &GVS : TheFnVI.getSummaryList()) {
5897 if (GVS->modulePath() == SrcModule) {
5898 GVSummary = GVS.get();
5899 break;
5900 }
5901 }
5902 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5903 }
5904
5905 // If this was an imported alias skip it as we won't have the function
5906 // summary, and it should be cloned in the original module.
5907 if (isa<AliasSummary>(Val: GVSummary))
5908 continue;
5909
5910 auto *FS = cast<FunctionSummary>(Val: GVSummary->getBaseObject());
5911
5912 if (FS->allocs().empty() && FS->callsites().empty())
5913 continue;
5914
5915 auto SI = FS->callsites().begin();
5916 auto AI = FS->allocs().begin();
5917
5918 // To handle callsite infos synthesized for tail calls which have missing
5919 // frames in the profiled context, map callee VI to the synthesized callsite
5920 // info.
5921 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5922 // Iterate the callsites for this function in reverse, since we place all
5923 // those synthesized for tail calls at the end.
5924 for (auto CallsiteIt = FS->callsites().rbegin();
5925 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5926 auto &Callsite = *CallsiteIt;
5927 // Stop as soon as we see a non-synthesized callsite info (see comment
5928 // above loop). All the entries added for discovered tail calls have empty
5929 // stack ids.
5930 if (!Callsite.StackIdIndices.empty())
5931 break;
5932 MapTailCallCalleeVIToCallsite.insert(KV: {Callsite.Callee, Callsite});
5933 }
5934
5935 // Keeps track of needed ICP for the function.
5936 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5937
5938 // Assume for now that the instructions are in the exact same order
5939 // as when the summary was created, but confirm this is correct by
5940 // matching the stack ids.
5941 for (auto &BB : F) {
5942 for (auto &I : BB) {
5943 auto *CB = dyn_cast<CallBase>(Val: &I);
5944 // Same handling as when creating module summary.
5945 if (!mayHaveMemprofSummary(CB))
5946 continue;
5947
5948 auto *CalledValue = CB->getCalledOperand();
5949 auto *CalledFunction = CB->getCalledFunction();
5950 if (CalledValue && !CalledFunction) {
5951 CalledValue = CalledValue->stripPointerCasts();
5952 // Stripping pointer casts can reveal a called function.
5953 CalledFunction = dyn_cast<Function>(Val: CalledValue);
5954 }
5955 // Check if this is an alias to a function. If so, get the
5956 // called aliasee for the checks below.
5957 if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
5958 assert(!CalledFunction &&
5959 "Expected null called function in callsite for alias");
5960 CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
5961 }
5962
5963 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5964 I.getMetadata(KindID: LLVMContext::MD_callsite));
5965 auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof);
5966
5967 // Include allocs that were already assigned a memprof function
5968 // attribute in the statistics. Only do this for those that do not have
5969 // memprof metadata, since we add an "ambiguous" memprof attribute by
5970 // default.
5971 if (CB->getAttributes().hasFnAttr(Kind: "memprof") && !MemProfMD) {
5972 CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
5973 ? AllocTypeColdThinBackend++
5974 : AllocTypeNotColdThinBackend++;
5975 OrigAllocsThinBackend++;
5976 AllocVersionsThinBackend++;
5977 if (!MaxAllocVersionsThinBackend)
5978 MaxAllocVersionsThinBackend = 1;
5979 continue;
5980 }
5981
5982 if (MemProfMD) {
5983 // Consult the next alloc node.
5984 assert(AI != FS->allocs().end());
5985 auto &AllocNode = *(AI++);
5986
5987#ifndef NDEBUG
5988 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5989 ImportSummary);
5990#endif
5991
5992 // Perform cloning if not yet done.
5993 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5994
5995 OrigAllocsThinBackend++;
5996 AllocVersionsThinBackend += AllocNode.Versions.size();
5997 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5998 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5999
6000 // If there is only one version that means we didn't end up
6001 // considering this function for cloning, and in that case the alloc
6002 // will still be none type or should have gotten the default NotCold.
6003 // Skip that after calling clone helper since that does some sanity
6004 // checks that confirm we haven't decided yet that we need cloning.
6005 // We might have a single version that is cold due to the
6006 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
6007 // case.
6008 if (AllocNode.Versions.size() == 1 &&
6009 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
6010 assert((AllocationType)AllocNode.Versions[0] ==
6011 AllocationType::NotCold ||
6012 (AllocationType)AllocNode.Versions[0] ==
6013 AllocationType::None);
6014 UnclonableAllocsThinBackend++;
6015 continue;
6016 }
6017
6018 // All versions should have a singular allocation type.
6019 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
6020 return Type == ((uint8_t)AllocationType::NotCold |
6021 (uint8_t)AllocationType::Cold);
6022 }));
6023
6024 // Update the allocation types per the summary info.
6025 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
6026 // If the VMap is empty, this clone was a duplicate of another and
6027 // was created as an alias or a declaration.
6028 if (J > 0 && VMaps[J - 1]->empty())
6029 continue;
6030 // Ignore any that didn't get an assigned allocation type.
6031 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
6032 continue;
6033 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
6034 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
6035 : AllocTypeNotColdThinBackend++;
6036 std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocTy);
6037 auto A = llvm::Attribute::get(Context&: F.getContext(), Kind: "memprof",
6038 Val: AllocTypeString);
6039 CallBase *CBClone;
6040 // Copy 0 is the original function.
6041 if (!J)
6042 CBClone = CB;
6043 else
6044 // Since VMaps are only created for new clones, we index with
6045 // clone J-1 (J==0 is the original clone and does not have a VMaps
6046 // entry).
6047 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
6048 removeAnyExistingAmbiguousAttribute(CB: CBClone);
6049 CBClone->addFnAttr(Attr: A);
6050 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
6051 << ore::NV("AllocationCall", CBClone) << " in clone "
6052 << ore::NV("Caller", CBClone->getFunction())
6053 << " marked with memprof allocation attribute "
6054 << ore::NV("Attribute", AllocTypeString));
6055 }
6056 } else if (!CallsiteContext.empty()) {
6057 if (!CalledFunction) {
6058#ifndef NDEBUG
6059 // We should have skipped inline assembly calls.
6060 auto *CI = dyn_cast<CallInst>(CB);
6061 assert(!CI || !CI->isInlineAsm());
6062#endif
6063 // We should have skipped direct calls via a Constant.
6064 assert(CalledValue && !isa<Constant>(CalledValue));
6065
6066 // This is an indirect call, see if we have profile information and
6067 // whether any clones were recorded for the profiled targets (that
6068 // we synthesized CallsiteInfo summary records for when building the
6069 // index).
6070 auto NumClones =
6071 recordICPInfo(CB, AllCallsites: FS->callsites(), SI, ICallAnalysisInfo);
6072
6073 // Perform cloning if not yet done. This is done here in case
6074 // we don't need to do ICP, but might need to clone this
6075 // function as it is the target of other cloned calls.
6076 if (NumClones)
6077 CloneFuncIfNeeded(NumClones, FS);
6078 }
6079
6080 else {
6081 // Consult the next callsite node.
6082 assert(SI != FS->callsites().end());
6083 auto &StackNode = *(SI++);
6084
6085#ifndef NDEBUG
6086 // Sanity check that the stack ids match between the summary and
6087 // instruction metadata.
6088 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6089 for (auto StackId : CallsiteContext) {
6090 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6091 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6092 StackId);
6093 StackIdIndexIter++;
6094 }
6095#endif
6096
6097 CloneCallsite(StackNode, CB, CalledFunction, FS);
6098 }
6099 } else if (CB->isTailCall() && CalledFunction) {
6100 // Locate the synthesized callsite info for the callee VI, if any was
6101 // created, and use that for cloning.
6102 ValueInfo CalleeVI =
6103 findValueInfoForFunc(F: *CalledFunction, M, ImportSummary, CallingFunc: &F);
6104 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(Val: CalleeVI)) {
6105 auto Callsite = MapTailCallCalleeVIToCallsite.find(Val: CalleeVI);
6106 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6107 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6108 }
6109 }
6110 }
6111 }
6112
6113 // Now do any promotion required for cloning.
6114 performICP(M, AllCallsites: FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6115 }
6116
6117 // We skip some of the functions and instructions above, so remove all the
6118 // metadata in a single sweep here.
6119 for (auto &F : M) {
6120 // We can skip memprof clones because createFunctionClones already strips
6121 // the metadata from the newly created clones.
6122 if (F.isDeclaration() || isMemProfClone(F))
6123 continue;
6124 for (auto &BB : F) {
6125 for (auto &I : BB) {
6126 if (!isa<CallBase>(Val: I))
6127 continue;
6128 I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
6129 I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
6130 }
6131 }
6132 }
6133
6134 return Changed;
6135}
6136
6137unsigned MemProfContextDisambiguation::recordICPInfo(
6138 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6139 ArrayRef<CallsiteInfo>::iterator &SI,
6140 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6141 // First see if we have profile information for this indirect call.
6142 uint32_t NumCandidates;
6143 uint64_t TotalCount;
6144 auto CandidateProfileData =
6145 ICallAnalysis->getPromotionCandidatesForInstruction(
6146 I: CB, TotalCount, NumCandidates, MaxNumValueData: MaxSummaryIndirectEdges);
6147 if (CandidateProfileData.empty())
6148 return 0;
6149
6150 // Iterate through all of the candidate profiled targets along with the
6151 // CallsiteInfo summary records synthesized for them when building the index,
6152 // and see if any are cloned and/or refer to clones.
6153 bool ICPNeeded = false;
6154 unsigned NumClones = 0;
6155 size_t CallsiteInfoStartIndex = std::distance(first: AllCallsites.begin(), last: SI);
6156 for (const auto &Candidate : CandidateProfileData) {
6157#ifndef NDEBUG
6158 auto CalleeValueInfo =
6159#endif
6160 ImportSummary->getValueInfo(GUID: Candidate.Value);
6161 // We might not have a ValueInfo if this is a distributed
6162 // ThinLTO backend and decided not to import that function.
6163 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6164 assert(SI != AllCallsites.end());
6165 auto &StackNode = *(SI++);
6166 // See if any of the clones of the indirect callsite for this
6167 // profiled target should call a cloned version of the profiled
6168 // target. We only need to do the ICP here if so.
6169 ICPNeeded |= llvm::any_of(Range: StackNode.Clones,
6170 P: [](unsigned CloneNo) { return CloneNo != 0; });
6171 // Every callsite in the same function should have been cloned the same
6172 // number of times.
6173 assert(!NumClones || NumClones == StackNode.Clones.size());
6174 NumClones = StackNode.Clones.size();
6175 }
6176 if (!ICPNeeded)
6177 return NumClones;
6178 // Save information for ICP, which is performed later to avoid messing up the
6179 // current function traversal.
6180 ICallAnalysisInfo.push_back(Elt: {.CB: CB, .CandidateProfileData: CandidateProfileData.vec(), .NumCandidates: NumCandidates,
6181 .TotalCount: TotalCount, .CallsiteInfoStartIndex: CallsiteInfoStartIndex});
6182 return NumClones;
6183}
6184
6185void MemProfContextDisambiguation::performICP(
6186 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6187 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6188 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6189 OptimizationRemarkEmitter &ORE) {
6190 // Now do any promotion required for cloning. Specifically, for each
6191 // recorded ICP candidate (which was only recorded because one clone of that
6192 // candidate should call a cloned target), we perform ICP (speculative
6193 // devirtualization) for each clone of the callsite, and update its callee
6194 // to the appropriate clone. Note that the ICP compares against the original
6195 // version of the target, which is what is in the vtable.
6196 for (auto &Info : ICallAnalysisInfo) {
6197 auto *CB = Info.CB;
6198 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6199 auto TotalCount = Info.TotalCount;
6200 unsigned NumPromoted = 0;
6201 unsigned NumClones = 0;
6202
6203 for (auto &Candidate : Info.CandidateProfileData) {
6204 auto &StackNode = AllCallsites[CallsiteIndex++];
6205
6206 // All calls in the same function must have the same number of clones.
6207 assert(!NumClones || NumClones == StackNode.Clones.size());
6208 NumClones = StackNode.Clones.size();
6209
6210 // See if the target is in the module. If it wasn't imported, it is
6211 // possible that this profile could have been collected on a different
6212 // target (or version of the code), and we need to be conservative
6213 // (similar to what is done in the ICP pass).
6214 Function *TargetFunction = Symtab->getFunction(FuncMD5Hash: Candidate.Value);
6215 if (TargetFunction == nullptr ||
6216 // Any ThinLTO global dead symbol removal should have already
6217 // occurred, so it should be safe to promote when the target is a
6218 // declaration.
6219 // TODO: Remove internal option once more fully tested.
6220 (MemProfRequireDefinitionForPromotion &&
6221 TargetFunction->isDeclaration())) {
6222 ORE.emit(RemarkBuilder: [&]() {
6223 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6224 << "Memprof cannot promote indirect call: target with md5sum "
6225 << ore::NV("target md5sum", Candidate.Value) << " not found";
6226 });
6227 // FIXME: See if we can use the new declaration importing support to
6228 // at least get the declarations imported for this case. Hot indirect
6229 // targets should have been imported normally, however.
6230 continue;
6231 }
6232
6233 // Check if legal to promote
6234 const char *Reason = nullptr;
6235 if (!isLegalToPromote(CB: *CB, Callee: TargetFunction, FailureReason: &Reason)) {
6236 ORE.emit(RemarkBuilder: [&]() {
6237 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6238 << "Memprof cannot promote indirect call to "
6239 << ore::NV("TargetFunction", TargetFunction)
6240 << " with count of " << ore::NV("TotalCount", TotalCount)
6241 << ": " << Reason;
6242 });
6243 continue;
6244 }
6245
6246 assert(!isMemProfClone(*TargetFunction));
6247
6248 // Handle each call clone, applying ICP so that each clone directly
6249 // calls the specified callee clone, guarded by the appropriate ICP
6250 // check.
6251 CallBase *CBClone = CB;
6252 for (unsigned J = 0; J < NumClones; J++) {
6253 // If the VMap is empty, this clone was a duplicate of another and was
6254 // created as an alias or a declaration.
6255 if (J > 0 && VMaps[J - 1]->empty())
6256 continue;
6257 // Copy 0 is the original function.
6258 if (J > 0)
6259 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
6260 // We do the promotion using the original name, so that the comparison
6261 // is against the name in the vtable. Then just below, change the new
6262 // direct call to call the cloned function.
6263 auto &DirectCall =
6264 pgo::promoteIndirectCall(CB&: *CBClone, F: TargetFunction, Count: Candidate.Count,
6265 TotalCount, AttachProfToDirectCall: isSamplePGO, ORE: &ORE);
6266 auto *TargetToUse = TargetFunction;
6267 // Call original if this version calls the original version of its
6268 // callee.
6269 if (StackNode.Clones[J]) {
6270 TargetToUse =
6271 cast<Function>(Val: M.getOrInsertFunction(
6272 Name: getMemProfFuncName(Base: TargetFunction->getName(),
6273 CloneNo: StackNode.Clones[J]),
6274 T: TargetFunction->getFunctionType())
6275 .getCallee());
6276 }
6277 DirectCall.setCalledFunction(TargetToUse);
6278 // During matching we generate synthetic VP metadata for indirect calls
6279 // not already having any, from the memprof profile's callee GUIDs. If
6280 // we subsequently promote and inline those callees, we currently lose
6281 // the ability to generate this synthetic VP metadata. Optionally apply
6282 // a noinline attribute to promoted direct calls, where the threshold is
6283 // set to capture synthetic VP metadata targets which get a count of 1.
6284 if (MemProfICPNoInlineThreshold &&
6285 Candidate.Count < MemProfICPNoInlineThreshold)
6286 DirectCall.setIsNoInline();
6287 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6288 << ore::NV("Call", CBClone) << " in clone "
6289 << ore::NV("Caller", CBClone->getFunction())
6290 << " promoted and assigned to call function clone "
6291 << ore::NV("Callee", TargetToUse));
6292 }
6293
6294 // Update TotalCount (all clones should get same count above)
6295 TotalCount -= Candidate.Count;
6296 NumPromoted++;
6297 }
6298 // Adjust the MD.prof metadata for all clones, now that we have the new
6299 // TotalCount and the number promoted.
6300 CallBase *CBClone = CB;
6301 for (unsigned J = 0; J < NumClones; J++) {
6302 // If the VMap is empty, this clone was a duplicate of another and was
6303 // created as an alias or a declaration.
6304 if (J > 0 && VMaps[J - 1]->empty())
6305 continue;
6306 // Copy 0 is the original function.
6307 if (J > 0)
6308 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
6309 // First delete the old one.
6310 CBClone->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
6311 // If all promoted, we don't need the MD.prof metadata.
6312 // Otherwise we need update with the un-promoted records back.
6313 if (TotalCount != 0)
6314 annotateValueSite(
6315 M, Inst&: *CBClone, VDs: ArrayRef(Info.CandidateProfileData).slice(N: NumPromoted),
6316 Sum: TotalCount, ValueKind: IPVK_IndirectCallTarget, MaxMDCount: Info.NumCandidates);
6317 }
6318 }
6319}
6320
6321template <typename DerivedCCG, typename FuncTy, typename CallTy>
6322bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process(
6323 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark,
6324 bool AllowExtraAnalysis) {
6325 if (DumpCCG) {
6326 dbgs() << "CCG before cloning:\n";
6327 dbgs() << *this;
6328 }
6329 if (ExportToDot)
6330 exportToDot(Label: "postbuild");
6331
6332 if (VerifyCCG) {
6333 check();
6334 }
6335
6336 identifyClones();
6337
6338 if (VerifyCCG) {
6339 check();
6340 }
6341
6342 if (DumpCCG) {
6343 dbgs() << "CCG after cloning:\n";
6344 dbgs() << *this;
6345 }
6346 if (ExportToDot)
6347 exportToDot(Label: "cloned");
6348
6349 bool Changed = assignFunctions();
6350
6351 if (DumpCCG) {
6352 dbgs() << "CCG after assigning function clones:\n";
6353 dbgs() << *this;
6354 }
6355 if (ExportToDot)
6356 exportToDot(Label: "clonefuncassign");
6357
6358 if (MemProfReportHintedSizes || AllowExtraAnalysis)
6359 printTotalSizes(OS&: errs(), EmitRemark);
6360
6361 return Changed;
6362}
6363
6364bool MemProfContextDisambiguation::processModule(
6365 Module &M,
6366 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6367
6368 // If we have an import summary, then the cloning decisions were made during
6369 // the thin link on the index. Apply them and return.
6370 if (ImportSummary)
6371 return applyImport(M);
6372
6373 // TODO: If/when other types of memprof cloning are enabled beyond just for
6374 // hot and cold, we will need to change this to individually control the
6375 // AllocationType passed to addStackNodesForMIB during CCG construction.
6376 // Note that we specifically check this after applying imports above, so that
6377 // the option isn't needed to be passed to distributed ThinLTO backend
6378 // clang processes, which won't necessarily have visibility into the linker
6379 // dependences. Instead the information is communicated from the LTO link to
6380 // the backends via the combined summary index.
6381 if (!SupportsHotColdNew)
6382 return false;
6383
6384 ModuleCallsiteContextGraph CCG(M, OREGetter);
6385 // TODO: Set up remarks for regular LTO. We need to decide what function to
6386 // use in the callback.
6387 return CCG.process();
6388}
6389
6390MemProfContextDisambiguation::MemProfContextDisambiguation(
6391 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6392 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6393 // Check the dot graph printing options once here, to make sure we have valid
6394 // and expected combinations.
6395 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6396 llvm::report_fatal_error(
6397 reason: "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6398 if (DotGraphScope == DotScope::Context &&
6399 !ContextIdForDot.getNumOccurrences())
6400 llvm::report_fatal_error(
6401 reason: "-memprof-dot-scope=context requires -memprof-dot-context-id");
6402 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6403 ContextIdForDot.getNumOccurrences())
6404 llvm::report_fatal_error(
6405 reason: "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6406 "-memprof-dot-context-id");
6407 if (ImportSummary) {
6408 // The MemProfImportSummary should only be used for testing ThinLTO
6409 // distributed backend handling via opt, in which case we don't have a
6410 // summary from the pass pipeline.
6411 assert(MemProfImportSummary.empty());
6412 return;
6413 }
6414 if (MemProfImportSummary.empty())
6415 return;
6416
6417 auto ReadSummaryFile =
6418 errorOrToExpected(EO: MemoryBuffer::getFile(Filename: MemProfImportSummary));
6419 if (!ReadSummaryFile) {
6420 logAllUnhandledErrors(E: ReadSummaryFile.takeError(), OS&: errs(),
6421 ErrorBanner: "Error loading file '" + MemProfImportSummary +
6422 "': ");
6423 return;
6424 }
6425 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(Buffer: **ReadSummaryFile);
6426 if (!ImportSummaryForTestingOrErr) {
6427 logAllUnhandledErrors(E: ImportSummaryForTestingOrErr.takeError(), OS&: errs(),
6428 ErrorBanner: "Error parsing file '" + MemProfImportSummary +
6429 "': ");
6430 return;
6431 }
6432 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6433 ImportSummary = ImportSummaryForTesting.get();
6434}
6435
6436PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6437 ModuleAnalysisManager &AM) {
6438 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
6439 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6440 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: *F);
6441 };
6442 if (!processModule(M, OREGetter))
6443 return PreservedAnalyses::all();
6444 return PreservedAnalyses::none();
6445}
6446
6447void MemProfContextDisambiguation::run(
6448 ModuleSummaryIndex &Index,
6449 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6450 isPrevailing,
6451 LLVMContext &Ctx,
6452 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) {
6453 // TODO: If/when other types of memprof cloning are enabled beyond just for
6454 // hot and cold, we will need to change this to individually control the
6455 // AllocationType passed to addStackNodesForMIB during CCG construction.
6456 // The index was set from the option, so these should be in sync.
6457 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6458 if (!SupportsHotColdNew)
6459 return;
6460
6461 bool AllowExtraAnalysis =
6462 OptimizationRemarkEmitter::allowExtraAnalysis(Ctx, DEBUG_TYPE);
6463
6464 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6465 CCG.process(EmitRemark, AllowExtraAnalysis);
6466}
6467
6468// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6469// when we don't have an index that has recorded that we are linking with
6470// allocation libraries containing the necessary APIs for downstream
6471// transformations.
6472PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) {
6473 // The profile matcher applies hotness attributes directly for allocations,
6474 // and those will cause us to generate calls to the hot/cold interfaces
6475 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6476 // link then assume we don't want these calls (e.g. not linking with
6477 // the appropriate library, or otherwise trying to disable this behavior).
6478 bool Changed = false;
6479 for (auto &F : M) {
6480 for (auto &BB : F) {
6481 for (auto &I : BB) {
6482 auto *CI = dyn_cast<CallBase>(Val: &I);
6483 if (!CI)
6484 continue;
6485 if (CI->hasFnAttr(Kind: "memprof")) {
6486 CI->removeFnAttr(Kind: "memprof");
6487 Changed = true;
6488 }
6489 if (!CI->hasMetadata(KindID: LLVMContext::MD_callsite)) {
6490 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6491 continue;
6492 }
6493 // Strip off all memprof metadata as it is no longer needed.
6494 // Importantly, this avoids the addition of new memprof attributes
6495 // after inlining propagation.
6496 CI->setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
6497 CI->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
6498 Changed = true;
6499 }
6500 }
6501 }
6502 if (!Changed)
6503 return PreservedAnalyses::all();
6504 return PreservedAnalyses::none();
6505}
6506