//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Instrumentation.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");
STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
STATISTIC(MissingAllocForContextId,
          "Number of missing alloc nodes for context ids");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

// How much of the graph to export to dot.
enum DotScope {
  All,     // The full CCG graph.
  Alloc,   // Only contexts for the specified allocation.
  Context, // Only the specified context.
};

static cl::opt<DotScope> DotGraphScope(
    "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
    cl::Hidden, cl::init(DotScope::All),
    cl::values(
        clEnumValN(DotScope::All, "all", "Export full callsite graph"),
        clEnumValN(DotScope::Alloc, "alloc",
                   "Export only nodes with contexts feeding given "
                   "-memprof-dot-alloc-id"),
        clEnumValN(DotScope::Context, "context",
                   "Export only nodes with given -memprof-dot-context-id")));

static cl::opt<unsigned>
    AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
                  cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
                           "or to highlight if -memprof-dot-scope=all"));

static cl::opt<unsigned> ContextIdForDot(
    "memprof-dot-context-id", cl::init(0), cl::Hidden,
    cl::desc("Id of context to export if -memprof-dot-scope=context or to "
             "highlight otherwise"));
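
// Illustrative example (hypothetical invocation, assuming the pass is run via
// "opt -passes=memprof-context-disambiguation"): something like
//   opt -passes=memprof-context-disambiguation -memprof-export-to-dot \
//       -memprof-dot-scope=alloc -memprof-dot-alloc-id=2 \
//       -memprof-dot-file-path-prefix=/tmp/ccg. -disable-output module.bc
// would export only nodes/edges carrying contexts that feed allocation id 2,
// writing dot files with the /tmp/ccg. prefix.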

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

// Optionally enable cloning of callsites involved with recursive cycles.
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

static cl::opt<bool> CloneRecursiveContexts(
    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));

// Generally this is needed for correct assignment of allocation clones to
// function clones; however, allow it to be disabled for debugging while the
// functionality is new and being tested more widely.
static cl::opt<bool>
    MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
                cl::desc("Merge clones before assigning functions"));

// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling requires a little bit of compile
// time overhead and doesn't affect correctness; it will just inflate the cold
// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts having recursive cycles"));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

static cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));
} // namespace llvm

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;
  void printTotalSizes(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };
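
  // Example (illustrative only): for the IR-based graph a CallInfo such as
  // {CallInst *CI, /*CloneNo=*/0} names the call in the original function,
  // while a CallInfo with CloneNo 2 names the matching call in the second
  // function clone created during function assignment.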

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case, where we have an Instruction*, it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping the graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
    uint8_t AllocTypes = 0;

    // The corresponding allocation or interior call. This is the primary call
    // for which we have created this node.
    CallInfo Call;

    // List of other calls that can be treated the same as the primary call
    // through cloning. I.e. located in the same function and have the same
    // (possibly pruned) stack ids. They will be updated the same way as the
    // primary call when assigning to function clones.
    SmallVector<CallInfo, 0> MatchingCalls;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // Returns true if we need to look at the caller edges as well (in addition
    // to the callee edges) when determining the node context ids and
    // allocation type.
    bool useCallerEdgesForContextInfo() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
             (AllowRecursiveCallsites && AllowRecursiveContexts));
      // When cloning for a recursive context we might be in the midst of
      // cloning for a recurrence and have moved context ids off of a caller
      // edge onto the clone but not yet off of the incoming caller (back)
      // edge. If we don't look at those we miss the fact that this node still
      // has context ids of interest.
      return IsAllocation || CloneRecursiveContexts;
    }

    // Compute the context ids for this node from the union of its edge context
    // ids.
    DenseSet<uint32_t> getContextIds() const {
      unsigned Count = 0;
      // Compute the number of ids for reserve below. In general we only need to
      // look at one set of edges, typically the callee edges, since other than
      // allocations and in some cases during recursion cloning, all the context
      // ids on the callers should also flow out via callee edges.
      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
        Count += Edge->getContextIds().size();
      DenseSet<uint32_t> ContextIds;
      ContextIds.reserve(Count);
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges)
        ContextIds.insert_range(Edge->getContextIds());
      return ContextIds;
    }

    // Compute the allocation type for this node from the OR of its edge
    // allocation types.
    uint8_t computeAllocType() const {
      uint8_t BothTypes =
          (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
      uint8_t AllocType = (uint8_t)AllocationType::None;
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges) {
        AllocType |= Edge->AllocTypes;
        // Bail early if alloc type reached both, no further refinement.
        if (AllocType == BothTypes)
          return AllocType;
      }
      return AllocType;
    }

    // The context ids set for this node is empty if its edge context ids are
    // also all empty.
    bool emptyContextIds() const {
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges) {
        if (!Edge->getContextIds().empty())
          return false;
      }
      return true;
    }

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // it should have an allocation type of None and empty context ids.
    bool isRemoved() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
      assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
             (AllocTypes == (uint8_t)AllocationType::None) ==
                 emptyContextIds());
      return AllocTypes == (uint8_t)AllocationType::None;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // Set just before initiating cloning when cloning of recursive contexts is
    // enabled. Used to defer cloning of backedges until we have done cloning of
    // the callee node for non-backedge caller edges. This exposes cloning
    // opportunities through the backedge of the cycle.
    // TODO: Note that this is not updated during cloning, and it is unclear
    // whether that would be needed.
    bool IsBackedge = false;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(std::move(ContextIds)) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    // Helper to clear the fields of this edge when we are removing it from the
    // graph.
    inline void clear() {
      ContextIds.clear();
      AllocTypes = (uint8_t)AllocationType::None;
      Caller = nullptr;
      Callee = nullptr;
    }

    // Check if edge was removed from the graph. This is useful while iterating
    // over a copy of edge lists when performing operations that mutate the
    // graph in ways that might remove one of the edges.
    inline bool isRemoved() const {
      if (Callee || Caller)
        return false;
      // Any edges that have been removed from the graph but are still in a
      // shared_ptr somewhere should have all fields null'ed out by clear()
      // above.
      assert(AllocTypes == (uint8_t)AllocationType::None);
      assert(ContextIds.empty());
      return true;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helpers to remove edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);
  void removeNoneTypeCallerEdges(ContextNode *Node);
  void
  recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
                                       DenseSet<const ContextNode *> &Visited);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType,
                           ArrayRef<ContextTotalSize> ContextSizeInfo);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Mark backedges via the standard DFS based backedge algorithm.
  void markBackedges();

  /// Merge clones generated during cloning for different allocations but that
  /// are called by the same caller node, to ensure proper function assignment.
  void mergeClones();

  // Try to partition calls on the given node (already placed into the AllCalls
  // array) by callee function, creating new copies of Node as needed to hold
  // calls with different callees, and moving the callee edges appropriately.
  // Returns true if partitioning was successful.
  bool partitionCallsByCallee(
      ContextNode *Node, ArrayRef<CallInfo> AllCalls,
      std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

  // When exporting to dot, and an allocation id is specified, contains the
  // context ids on that allocation.
  DenseSet<uint32_t> DotAllocContextIds;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  // Structure to keep track of information for each call as we are matching
  // non-allocation callsites onto context nodes created from the allocation
  // call metadata / summary contexts.
  struct CallContextInfo {
    // The callsite we're trying to match.
    CallTy Call;
    // The callsite's stack ids that have a context node in the graph.
    std::vector<uint64_t> StackIds;
    // The function containing this callsite.
    const FuncTy *Func;
    // Initially empty, if needed this will be updated to contain the context
    // ids for use in a new context node created for this callsite.
    DenseSet<uint32_t> ContextIds;
  };

  /// Helper to remove edge from graph, updating edge iterator if it is provided
  /// (in which case CalleeIter indicates which edge list is being iterated).
  /// This will also perform the necessary clearing of the ContextEdge members
  /// to enable later checking if the edge has been removed (since we may have
  /// other copies of the shared_ptr in existence, and in fact rely on this to
  /// enable removal while iterating over a copy of a node's edge list).
  void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
                           bool CalleeIter = true);

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee,
                      DenseSet<uint32_t> RemainingContextIds);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
  /// the updated edges and to prepare it for an increment in the caller.
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  // Return the callee function of the given call, or nullptr if it can't be
  // determined.
  const FuncTy *getCalleeFunc(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
  }

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Returns true if both call instructions have the same callee.
  bool sameCallee(CallTy Call1, CallTy Call2) {
    return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Get the AllocationType assigned to the given allocation instruction clone.
  AllocationType getAllocationCallType(const CallInfo &Call) const {
    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording mapping of all
  /// of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  // Create and return a new ContextNode.
  ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
                             CallInfo C = CallInfo()) {
    NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
    auto *NewNode = NodeOwner.back().get();
    if (F)
      NodeToCallingFunc[NewNode] = F;
    return NewNode;
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursive helper for marking backedges via DFS.
  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                     DenseSet<const ContextNode *> &CurrentStack);

  /// Recursive helper for merging clones.
  void
  mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
              DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Main worker for merging callee clones for a given node.
  void mergeNodeCalleeClones(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Helper to find other callers of the given set of callee edges that can
  /// share the same callee merge node.
  void findOtherCallersToShareMerge(
      ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
      DenseSet<ContextNode *> &OtherCallersToShareMerge);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  IndexCall *operator->() { return this; }

  void print(raw_ostream &OS) const {
    PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
      assert(CI);
      OS << *CI;
    }
  }
};
} // namespace

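// The simplify_type specializations below let the llvm casting machinery
// (e.g. the dyn_cast_if_present calls in IndexCall::print above) look through
// IndexCall to the underlying PointerUnion of CallsiteInfo* / AllocInfo*.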
namespace llvm {
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm

namespace {
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const FunctionSummary *getCalleeFunc(IndexCall &Call);
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool sameCallee(IndexCall &Call1, IndexCall &Call2);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves mapping from function summaries containing memprof records back to
  // their VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

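// DenseMapInfo specializations so that CallInfo (a {call, clone number} pair)
// and IndexCall can be used as keys in DenseMap/MapVector containers such as
// AllocationCallToContextNodeMap; they simply reuse the hashing of the
// underlying pair / PointerUnion types.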
namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}
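
// For example, allocTypeToUse(NotCold | Cold) yields NotCold (the ambiguous
// case defaults to the non-cold behavior), while a pure Cold or pure NotCold
// input is returned unchanged.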

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  // This should be called only when the InAllocTypes vector was computed for
  // this set of Edges. Make sure the sizes are the same.
  assert(InAllocTypes.size() == Edges.size());
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}
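
// For instance, InAllocTypes = {NotCold, None} matches edges with alloc types
// {NotCold|Cold, Cold}: the first pair compares equal after allocTypeToUse
// maps NotCold|Cold to NotCold, and a None entry matches anything.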

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for callee edges in the given
// clone. Because the InAllocTypes were computed from the original node's callee
// edges, and other cloning could have happened after this clone was created, we
// need to find the matching clone callee edge, which may or may not exist.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatchClone(
    const std::vector<uint8_t> &InAllocTypes,
    const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
  const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
  assert(Node);
  // InAllocTypes should have been computed for the original node's callee
  // edges.
  assert(InAllocTypes.size() == Node->CalleeEdges.size());
  // First create a map of the clone callee edge callees to the edge alloc type.
  DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
      EdgeCalleeMap;
  for (const auto &E : Clone->CalleeEdges) {
    assert(!EdgeCalleeMap.contains(E->Callee));
    EdgeCalleeMap[E->Callee] = E->AllocTypes;
  }
  // Next, walk the original node's callees, and look for the corresponding
  // clone edge to that callee.
  for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
    auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
    // Not found is ok, we will simply add an edge if we use this clone.
    if (Iter == EdgeCalleeMap.end())
      continue;
    // Can share if one of the edges is None type - don't
    // care about the type along that edge as it doesn't
    // exist for those context ids.
    if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
        Iter->second == (uint8_t)AllocationType::None)
      continue;
    if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
      return false;
  }
  return true;
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
  for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Caller->eraseCalleeEdge(Edge.get());
      EI = Node->CallerEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
        return CalleeEdge.get() == Edge;
      });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
        return CallerEdge.get() == Edge;
      });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) const {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids,
    const DenseSet<uint32_t> &Node2Ids) const {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids,
    const DenseSet<uint32_t> &Node2Ids) const {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}
1297
1298template <typename DerivedCCG, typename FuncTy, typename CallTy>
1299typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1300CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1301 CallInfo Call, const FuncTy *F) {
1302 assert(!getNodeForAlloc(Call));
1303 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, C: Call);
1304 AllocationCallToContextNodeMap[Call] = AllocNode;
1305 // Use LastContextId as a unique id for MIB allocation nodes.
1306 AllocNode->OrigStackOrAllocId = LastContextId;
1307 // Alloc type should be updated as we add in the MIBs. We should assert
1308 // afterwards that it is not still None.
1309 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1310
1311 return AllocNode;
1312}
1313
1314static std::string getAllocTypeString(uint8_t AllocTypes) {
1315 if (!AllocTypes)
1316 return "None";
1317 std::string Str;
1318 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1319 Str += "NotCold";
1320 if (AllocTypes & (uint8_t)AllocationType::Cold)
1321 Str += "Cold";
1322 return Str;
1323}
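// For example, getAllocTypeString((uint8_t)AllocationType::NotCold |
// (uint8_t)AllocationType::Cold) returns "NotColdCold", while a zero value
// returns "None".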
1324
1325template <typename DerivedCCG, typename FuncTy, typename CallTy>
1326template <class NodeT, class IteratorT>
1327void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1328 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1329 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1330 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1331 // Treat the hot alloc type as NotCold until the disambiguation for "hot" is
1332 // done.
1333 if (AllocType == AllocationType::Hot)
1334 AllocType = AllocationType::NotCold;
1335
1336 ContextIdToAllocationType[++LastContextId] = AllocType;
1337
1338 if (!ContextSizeInfo.empty()) {
1339 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1340 Entry.insert(position: Entry.begin(), first: ContextSizeInfo.begin(), last: ContextSizeInfo.end());
1341 }
1342
1343 // Update alloc type and context ids for this MIB.
1344 AllocNode->AllocTypes |= (uint8_t)AllocType;
1345
1346 // Now add or update nodes for each stack id in alloc's context.
1347 // Later when processing the stack ids on non-alloc callsites we will adjust
1348 // for any inlining in the context.
1349 ContextNode *PrevNode = AllocNode;
1350 // Look for recursion (direct recursion should have been collapsed by
1351 // module summary analysis; here we should just be detecting mutual
1352 // recursion). Mark these nodes so we don't try to clone them.
1353 SmallSet<uint64_t, 8> StackIdSet;
1354 // Skip any stack ids already on the allocation call itself (due to inlining).
1355 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1356 ContextIter != StackContext.end(); ++ContextIter) {
1357 auto StackId = getStackId(IdOrIndex: *ContextIter);
1358 ContextNode *StackNode = getNodeForStackId(StackId);
1359 if (!StackNode) {
1360 StackNode = createNewNode(/*IsAllocation=*/false);
1361 StackEntryIdToContextNodeMap[StackId] = StackNode;
1362 StackNode->OrigStackOrAllocId = StackId;
1363 }
1364 // Marking a node recursive will prevent its cloning completely, even for
1365 // non-recursive contexts flowing through it.
1366 if (!AllowRecursiveCallsites) {
1367 auto Ins = StackIdSet.insert(StackId);
1368 if (!Ins.second)
1369 StackNode->Recursive = true;
1370 }
1371 StackNode->AllocTypes |= (uint8_t)AllocType;
1372 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1373 PrevNode = StackNode;
1374 }
1375}
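// Rough sketch of what addStackNodesForMIB above builds for a single MIB
// (assuming its stack ids are ordered from the frame nearest the allocation
// outward):
//
//   AllocNode <- N(S1) <- N(S2) <- ... <- N(Sk)
//
// where each arrow is a caller edge tagged with the freshly assigned context
// id (LastContextId) and the MIB's alloc type, and N(Si) is the (possibly
// pre-existing) ContextNode for stack id Si.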
1376
1377template <typename DerivedCCG, typename FuncTy, typename CallTy>
1378DenseSet<uint32_t>
1379CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1380 const DenseSet<uint32_t> &StackSequenceContextIds,
1381 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1382 DenseSet<uint32_t> NewContextIds;
1383 for (auto OldId : StackSequenceContextIds) {
1384 NewContextIds.insert(V: ++LastContextId);
1385 OldToNewContextIds[OldId].insert(V: LastContextId);
1386 assert(ContextIdToAllocationType.count(OldId));
1387 // The new context has the same allocation type as original.
1388 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1389 if (DotAllocContextIds.contains(V: OldId))
1390 DotAllocContextIds.insert(V: LastContextId);
1391 }
1392 return NewContextIds;
1393}
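// Hypothetical example for duplicateContextIds above: duplicating {5, 7} when
// LastContextId is 100 produces two new ids, 101 and 102, maps each old id to
// its duplicate in OldToNewContextIds, and copies the old id's allocation type
// (and dot export membership) over to the new id.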
1394
1395template <typename DerivedCCG, typename FuncTy, typename CallTy>
1396void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1397 propagateDuplicateContextIds(
1398 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1399 // Build a set of duplicated context ids corresponding to the input id set.
1400 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1401 DenseSet<uint32_t> NewIds;
1402 for (auto Id : ContextIds)
1403 if (auto NewId = OldToNewContextIds.find(Val: Id);
1404 NewId != OldToNewContextIds.end())
1405 NewIds.insert_range(R: NewId->second);
1406 return NewIds;
1407 };
1408
1409 // Recursively update context ids sets along caller edges.
1410 auto UpdateCallers = [&](ContextNode *Node,
1411 DenseSet<const ContextEdge *> &Visited,
1412 auto &&UpdateCallers) -> void {
1413 for (const auto &Edge : Node->CallerEdges) {
1414 auto Inserted = Visited.insert(Edge.get());
1415 if (!Inserted.second)
1416 continue;
1417 ContextNode *NextNode = Edge->Caller;
1418 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1419 // Only need to recursively iterate to NextNode via this caller edge if
1420 // it resulted in any added ids to NextNode.
1421 if (!NewIdsToAdd.empty()) {
1422 Edge->getContextIds().insert_range(NewIdsToAdd);
1423 UpdateCallers(NextNode, Visited, UpdateCallers);
1424 }
1425 }
1426 };
1427
1428 DenseSet<const ContextEdge *> Visited;
1429 for (auto &Entry : AllocationCallToContextNodeMap) {
1430 auto *Node = Entry.second;
1431 UpdateCallers(Node, Visited, UpdateCallers);
1432 }
1433}
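// Note on propagateDuplicateContextIds above: starting from every allocation
// node, any caller edge whose context id set contains a duplicated old id also
// receives the corresponding new ids, and the walk only continues past an edge
// if it actually added ids there.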
1434
1435template <typename DerivedCCG, typename FuncTy, typename CallTy>
1436void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1437 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1438 // This must be passed by value to make a copy since it will be adjusted
1439 // as ids are moved.
1440 DenseSet<uint32_t> RemainingContextIds) {
1441 auto &OrigEdges =
1442 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1443 DenseSet<uint32_t> RecursiveContextIds;
1444 DenseSet<uint32_t> AllCallerContextIds;
1445 if (AllowRecursiveCallsites) {
1446 // Identify which context ids are recursive, which is needed to properly
1447 // update the RemainingContextIds set. The relevant recursive context ids
1448 // are those that appear in multiple edges.
1449 for (auto &CE : OrigEdges) {
1450 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
1451 for (auto Id : CE->getContextIds())
1452 if (!AllCallerContextIds.insert(Id).second)
1453 RecursiveContextIds.insert(Id);
1454 }
1455 }
1456 // Increment iterator in loop so that we can remove edges as needed.
1457 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1458 auto Edge = *EI;
1459 DenseSet<uint32_t> NewEdgeContextIds;
1460 DenseSet<uint32_t> NotFoundContextIds;
1461 // Remove any matching context ids from Edge and return the set that were
1462 // found and removed; these become the new edge's context ids. Also update
1463 // the remaining (not found) ids.
1464 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1465 NotFoundContextIds);
1466 // Update the remaining context ids set for the later edges. This is a
1467 // compile time optimization.
1468 if (RecursiveContextIds.empty()) {
1469 // No recursive ids, so all of the previously remaining context ids that
1470 // were not seen on this edge are the new remaining set.
1471 RemainingContextIds.swap(RHS&: NotFoundContextIds);
1472 } else {
1473 // Keep the recursive ids in the remaining set as we expect to see those
1474 // on another edge. We can remove the non-recursive remaining ids that
1475 // were seen on this edge, however. We already have the set of remaining
1476 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1477 // non-recursive and only remove those. Note that despite the higher
1478 // overhead of updating the remaining context ids set when recursion
1479 // handling is enabled, it was found to be at worst performance neutral
1480 // and in one case a clear win.
1481 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1482 set_difference(S1: NewEdgeContextIds, S2: RecursiveContextIds);
1483 set_subtract(S1&: RemainingContextIds, S2: NonRecursiveRemainingCurEdgeIds);
1484 }
1485 // If no matching context ids for this edge, skip it.
1486 if (NewEdgeContextIds.empty()) {
1487 ++EI;
1488 continue;
1489 }
1490 if (TowardsCallee) {
1491 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1492 auto NewEdge = std::make_shared<ContextEdge>(
1493 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1494 NewNode->CalleeEdges.push_back(NewEdge);
1495 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1496 } else {
1497 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1498 auto NewEdge = std::make_shared<ContextEdge>(
1499 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1500 NewNode->CallerEdges.push_back(NewEdge);
1501 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1502 }
1503 // Remove old edge if context ids empty.
1504 if (Edge->getContextIds().empty()) {
1505 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, CalleeIter: TowardsCallee);
1506 continue;
1507 }
1508 ++EI;
1509 }
1510}
1511
1512template <typename DerivedCCG, typename FuncTy, typename CallTy>
1513static void checkEdge(
1514 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1515 // Confirm that alloc type is not None and that we have at least one context
1516 // id.
1517 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1518 assert(!Edge->ContextIds.empty());
1519}
1520
1521template <typename DerivedCCG, typename FuncTy, typename CallTy>
1522static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1523 bool CheckEdges = true) {
1524 if (Node->isRemoved())
1525 return;
1526#ifndef NDEBUG
1527 // Compute node's context ids once for use in asserts.
1528 auto NodeContextIds = Node->getContextIds();
1529#endif
1530 // Node's context ids should be the union of both its callee and caller edge
1531 // context ids.
1532 if (Node->CallerEdges.size()) {
1533 DenseSet<uint32_t> CallerEdgeContextIds(
1534 Node->CallerEdges.front()->ContextIds);
1535 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1536 if (CheckEdges)
1537 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1538 set_union(CallerEdgeContextIds, Edge->ContextIds);
1539 }
1540 // Node can have more context ids than callers if some contexts terminate at
1541 // node and some are longer. If we are allowing recursive callsites and
1542 // contexts this will be violated for incompletely cloned recursive cycles,
1543 // so skip the checking in that case.
1544 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1545 NodeContextIds == CallerEdgeContextIds ||
1546 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1547 }
1548 if (Node->CalleeEdges.size()) {
1549 DenseSet<uint32_t> CalleeEdgeContextIds(
1550 Node->CalleeEdges.front()->ContextIds);
1551 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1552 if (CheckEdges)
1553 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1554 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1555 }
1556 // If we are allowing recursive callsites and contexts this will be violated
1557 // for incompletely cloned recursive cycles, so skip the checking in that
1558 // case.
1559 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1560 NodeContextIds == CalleeEdgeContextIds);
1561 }
1562 // FIXME: Since this checking is only invoked under an option, we should
1563 // change the error checking from using assert to something that will trigger
1564 // an error on a release build.
1565#ifndef NDEBUG
1566 // Make sure we don't end up with duplicate edges between the same caller and
1567 // callee.
1568 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1569 for (const auto &E : Node->CalleeEdges)
1570 NodeSet.insert(E->Callee);
1571 assert(NodeSet.size() == Node->CalleeEdges.size());
1572#endif
1573}
1574
1575template <typename DerivedCCG, typename FuncTy, typename CallTy>
1576void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1577 assignStackNodesPostOrder(
1578 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1579 DenseMap<uint64_t, std::vector<CallContextInfo>>
1580 &StackIdToMatchingCalls,
1581 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1582 auto Inserted = Visited.insert(Node);
1583 if (!Inserted.second)
1584 return;
1585 // Post order traversal. Iterate over a copy since we may add nodes and
1586 // therefore new callers during the recursive call, invalidating any
1587 // iterator over the original edge vector. We don't need to process these
1588 // new nodes as they were already processed on creation.
1589 auto CallerEdges = Node->CallerEdges;
1590 for (auto &Edge : CallerEdges) {
1591 // Skip any that have been removed during the recursion.
1592 if (Edge->isRemoved()) {
1593 assert(!is_contained(Node->CallerEdges, Edge));
1594 continue;
1595 }
1596 assignStackNodesPostOrder(Node: Edge->Caller, Visited, StackIdToMatchingCalls,
1597 CallToMatchingCall);
1598 }
1599
1600 // If this node's stack id is in the map, update the graph to contain new
1601 // nodes representing any inlining at interior callsites. Note we move the
1602 // associated context ids over to the new nodes.
1603
1604 // Ignore this node if it is for an allocation or we didn't record any
1605 // stack id lists ending at it.
1606 if (Node->IsAllocation ||
1607 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1608 return;
1609
1610 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1611 // Handle the simple case first: a single call with a single stack id.
1612 // In this case there is no need to create any new context nodes; simply
1613 // assign the context node for the stack id to this Call.
1614 if (Calls.size() == 1) {
1615 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1616 if (Ids.size() == 1) {
1617 assert(SavedContextIds.empty());
1618 // It should be this Node.
1619 assert(Node == getNodeForStackId(Ids[0]));
1620 if (Node->Recursive)
1621 return;
1622 Node->setCall(Call);
1623 NonAllocationCallToContextNodeMap[Call] = Node;
1624 NodeToCallingFunc[Node] = Func;
1625 return;
1626 }
1627 }
1628
1629#ifndef NDEBUG
1630 // Find the node for the last stack id, which should be the same
1631 // across all calls recorded for this id, and is this node's id.
1632 uint64_t LastId = Node->OrigStackOrAllocId;
1633 ContextNode *LastNode = getNodeForStackId(LastId);
1634 // We should only have kept stack ids that had nodes.
1635 assert(LastNode);
1636 assert(LastNode == Node);
1637#else
1638 ContextNode *LastNode = Node;
1639#endif
1640
1641 // Compute the last node's context ids once, as it is shared by all calls in
1642 // this entry.
1643 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1644
1645 [[maybe_unused]] bool PrevIterCreatedNode = false;
1646 bool CreatedNode = false;
1647 for (unsigned I = 0; I < Calls.size();
1648 I++, PrevIterCreatedNode = CreatedNode) {
1649 CreatedNode = false;
1650 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1651 // Skip any for which we didn't assign any ids; these don't get a node in
1652 // the graph.
1653 if (SavedContextIds.empty()) {
1654 // If this call has a matching call (located in the same function and
1655 // having the same stack ids), simply add it to the context node created
1656 // for its matching call earlier. These can be treated the same through
1657 // cloning and get updated at the same time.
1658 if (!CallToMatchingCall.contains(Call))
1659 continue;
1660 auto MatchingCall = CallToMatchingCall[Call];
1661 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1662 // This should only happen if there was a prior iteration that didn't
1663 // create a node, because the recomputation of context ids below found
1664 // none remaining and continued early.
1665 assert(I > 0 && !PrevIterCreatedNode);
1666 continue;
1667 }
1668 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1669 Call);
1670 continue;
1671 }
1672
1673 assert(LastId == Ids.back());
1674
1675 // Recompute the context ids for this stack id sequence (the
1676 // intersection of the context ids of the corresponding nodes).
1677 // Start with the ids we saved in the map for this call, which could be
1678 // duplicated context ids. We have to recompute as we might have overlap
1679 // between the saved context ids for different last nodes, and might have
1680 // already removed some of them during the post order traversal.
1681 set_intersect(SavedContextIds, LastNodeContextIds);
1682 ContextNode *PrevNode = LastNode;
1683 bool Skip = false;
1684 // Iterate backwards through the stack Ids, starting after the last Id
1685 // in the list, which was handled once outside for all Calls.
1686 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1687 auto Id = *IdIter;
1688 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1689 // We should only have kept stack ids that had nodes and weren't
1690 // recursive.
1691 assert(CurNode);
1692 assert(!CurNode->Recursive);
1693
1694 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1695 if (!Edge) {
1696 Skip = true;
1697 break;
1698 }
1699 PrevNode = CurNode;
1700
1701 // Update the context ids, which is the intersection of the ids along
1702 // all edges in the sequence.
1703 set_intersect(SavedContextIds, Edge->getContextIds());
1704
1705 // If we now have no context ids for clone, skip this call.
1706 if (SavedContextIds.empty()) {
1707 Skip = true;
1708 break;
1709 }
1710 }
1711 if (Skip)
1712 continue;
1713
1714 // Create new context node.
1715 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: Call);
1716 NonAllocationCallToContextNodeMap[Call] = NewNode;
1717 CreatedNode = true;
1718 NewNode->AllocTypes = computeAllocType(ContextIds&: SavedContextIds);
1719
1720 ContextNode *FirstNode = getNodeForStackId(StackId: Ids[0]);
1721 assert(FirstNode);
1722
1723 // Connect to callees of innermost stack frame in inlined call chain.
1724 // This updates context ids for FirstNode's callee's to reflect those
1725 // moved to NewNode.
1726 connectNewNode(NewNode, OrigNode: FirstNode, /*TowardsCallee=*/true, RemainingContextIds: SavedContextIds);
1727
1728 // Connect to callers of outermost stack frame in inlined call chain.
1729 // This updates context ids for FirstNode's caller's to reflect those
1730 // moved to NewNode.
1731 connectNewNode(NewNode, OrigNode: LastNode, /*TowardsCallee=*/false, RemainingContextIds: SavedContextIds);
1732
1733 // Now we need to remove context ids from edges/nodes between First and
1734 // Last Node.
1735 PrevNode = nullptr;
1736 for (auto Id : Ids) {
1737 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1738 // We should only have kept stack ids that had nodes.
1739 assert(CurNode);
1740
1741 // Remove the context ids moved to NewNode from CurNode, and the
1742 // edge from the prior node.
1743 if (PrevNode) {
1744 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1745 // If the sequence contained recursion, we might have already removed
1746 // some edges during the connectNewNode calls above.
1747 if (!PrevEdge) {
1748 PrevNode = CurNode;
1749 continue;
1750 }
1751 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1752 if (PrevEdge->getContextIds().empty())
1753 removeEdgeFromGraph(Edge: PrevEdge);
1754 }
1755 // Since we update the edges from leaf to tail, only look at the callee
1756 // edges. This isn't an alloc node, so if there are no callee edges, the
1757 // alloc type is None.
1758 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1759 ? (uint8_t)AllocationType::None
1760 : CurNode->computeAllocType();
1761 PrevNode = CurNode;
1762 }
1763 if (VerifyNodes) {
1764 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1765 for (auto Id : Ids) {
1766 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1767 // We should only have kept stack ids that had nodes.
1768 assert(CurNode);
1769 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1770 }
1771 }
1772 }
1773}
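// Illustrative scenario for assignStackNodesPostOrder above (hypothetical
// ids): a callsite whose metadata lists stack ids [S1, S2] because of inlining
// gets a brand new node N. N is wired towards the callees of the node for S1
// (the innermost frame) and towards the callers of the node for S2 (the
// outermost frame, i.e. this Node), and the context ids moved to N are
// subtracted from the original S1/S2 nodes and the edges between them.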
1774
1775template <typename DerivedCCG, typename FuncTy, typename CallTy>
1776void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1777 // Map of stack id to all calls with that as the last (outermost caller)
1778 // callsite id that has a context node (some might not due to pruning
1779 // performed during matching of the allocation profile contexts).
1780 // The CallContextInfo contains the Call and a list of its stack ids with
1781 // ContextNodes, the function containing Call, and the set of context ids
1782 // the analysis will eventually identify for use in any new node created
1783 // for that callsite.
1784 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1785 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1786 for (auto &Call : CallsWithMetadata) {
1787 // Ignore allocations, already handled.
1788 if (AllocationCallToContextNodeMap.count(Call))
1789 continue;
1790 auto StackIdsWithContextNodes =
1791 getStackIdsWithContextNodesForCall(Call: Call.call());
1792 // If there were no nodes created for MIBs on allocs (maybe this was in
1793 // the unambiguous part of the MIB stack that was pruned), ignore.
1794 if (StackIdsWithContextNodes.empty())
1795 continue;
1796 // Otherwise, record this Call along with the list of ids for the last
1797 // (outermost caller) stack id with a node.
1798 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1799 {Call.call(), StackIdsWithContextNodes, Func, {}});
1800 }
1801 }
1802
1803 // First make a pass through all stack ids that correspond to a call,
1804 // as identified in the above loop. Compute the context ids corresponding to
1805 // each of these calls when they correspond to multiple stack ids due to
1806 // inlining. Perform any duplication of context ids required when
1807 // there is more than one call with the same stack ids. Their (possibly newly
1808 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1809 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1810 // Save a map from each call to any that are found to match it, i.e. those
1811 // located in the same function with the same (possibly pruned) stack ids. We
1812 // use this to avoid creating extra graph nodes as they can be treated the same.
1813 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1814 for (auto &It : StackIdToMatchingCalls) {
1815 auto &Calls = It.getSecond();
1816 // Skip single calls with a single stack id. These don't need a new node.
1817 if (Calls.size() == 1) {
1818 auto &Ids = Calls[0].StackIds;
1819 if (Ids.size() == 1)
1820 continue;
1821 }
1822 // In order to do the best and maximal matching of inlined calls to context
1823 // node sequences we will sort the vectors of stack ids in descending order
1824 // of length, and within each length, lexicographically by stack id. The
1825 // latter is so that we can specially handle calls that have identical stack
1826 // id sequences (either due to cloning or artificially because of the MIB
1827 // context pruning). Those with the same Ids are then sorted by function to
1828 // facilitate efficiently mapping them to the same context node.
1829 // Because the functions are pointers, to ensure a stable sort we first
1830 // assign each function pointer to its first index in the Calls array, and
1831 // then sort by that index.
1832 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1833 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1834 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1835 llvm::stable_sort(
1836 Calls,
1837 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1838 return A.StackIds.size() > B.StackIds.size() ||
1839 (A.StackIds.size() == B.StackIds.size() &&
1840 (A.StackIds < B.StackIds ||
1841 (A.StackIds == B.StackIds &&
1842 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1843 });
1844
1845 // Find the node for the last stack id, which should be the same
1846 // across all calls recorded for this id, and is the id for this
1847 // entry in the StackIdToMatchingCalls map.
1848 uint64_t LastId = It.getFirst();
1849 ContextNode *LastNode = getNodeForStackId(StackId: LastId);
1850 // We should only have kept stack ids that had nodes.
1851 assert(LastNode);
1852
1853 if (LastNode->Recursive)
1854 continue;
1855
1856 // Initialize the context ids with the last node's. We will subsequently
1857 // refine the context ids by computing the intersection along all edges.
1858 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1859 assert(!LastNodeContextIds.empty());
1860
1861#ifndef NDEBUG
1862 // Save the set of functions seen for a particular set of the same stack
1863 // ids. This is used to ensure that they have been correctly sorted to be
1864 // adjacent in the Calls list, since we rely on that to efficiently place
1865 // all such matching calls onto the same context node.
1866 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1867#endif
1868
1869 for (unsigned I = 0; I < Calls.size(); I++) {
1870 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1871 assert(SavedContextIds.empty());
1872 assert(LastId == Ids.back());
1873
1874#ifndef NDEBUG
1875 // If this call has a different set of ids than the last one, clear the
1876 // set used to ensure they are sorted properly.
1877 if (I > 0 && Ids != Calls[I - 1].StackIds)
1878 MatchingIdsFuncSet.clear();
1879#endif
1880
1881 // First compute the context ids for this stack id sequence (the
1882 // intersection of the context ids of the corresponding nodes).
1883 // Start with the remaining saved ids for the last node.
1884 assert(!LastNodeContextIds.empty());
1885 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1886
1887 ContextNode *PrevNode = LastNode;
1888 ContextNode *CurNode = LastNode;
1889 bool Skip = false;
1890
1891 // Iterate backwards through the stack Ids, starting after the last Id
1892 // in the list, which was handled once outside for all Calls.
1893 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1894 auto Id = *IdIter;
1895 CurNode = getNodeForStackId(StackId: Id);
1896 // We should only have kept stack ids that had nodes.
1897 assert(CurNode);
1898
1899 if (CurNode->Recursive) {
1900 Skip = true;
1901 break;
1902 }
1903
1904 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1905 // If there is no edge then the nodes belong to different MIB contexts,
1906 // and we should skip this inlined context sequence. For example, this
1907 // particular inlined context may include stack ids A->B, and we may
1908 // indeed have nodes for both A and B, but it is possible that they were
1909 // never profiled in sequence in a single MIB for any allocation (i.e.
1910 // we might have profiled an allocation that involves the callsite A,
1911 // but through a different one of its callee callsites, and we might
1912 // have profiled an allocation that involves callsite B, but reached
1913 // from a different caller callsite).
1914 if (!Edge) {
1915 Skip = true;
1916 break;
1917 }
1918 PrevNode = CurNode;
1919
1920 // Update the context ids, which is the intersection of the ids along
1921 // all edges in the sequence.
1922 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1923
1924 // If we now have no context ids for clone, skip this call.
1925 if (StackSequenceContextIds.empty()) {
1926 Skip = true;
1927 break;
1928 }
1929 }
1930 if (Skip)
1931 continue;
1932
1933 // If some of this call's stack ids did not have corresponding nodes (due
1934 // to pruning), don't include any context ids for contexts that extend
1935 // beyond these nodes. Otherwise we would be matching part of unrelated /
1936 // not fully matching stack contexts. To do this, subtract any context ids
1937 // found in caller nodes of the last node found above.
1938 if (Ids.back() != getLastStackId(Call)) {
1939 for (const auto &PE : LastNode->CallerEdges) {
1940 set_subtract(StackSequenceContextIds, PE->getContextIds());
1941 if (StackSequenceContextIds.empty())
1942 break;
1943 }
1944 // If we now have no context ids for clone, skip this call.
1945 if (StackSequenceContextIds.empty())
1946 continue;
1947 }
1948
1949#ifndef NDEBUG
1950 // If the prior call had the same stack ids this set would not be empty.
1951 // Check if we already have a call that "matches" because it is located
1952 // in the same function. If the Calls list was sorted properly we should
1953 // not encounter this situation as all such entries should be adjacent
1954 // and processed in bulk further below.
1955 assert(!MatchingIdsFuncSet.contains(Func));
1956
1957 MatchingIdsFuncSet.insert(Func);
1958#endif
1959
1960 // Check if the next set of stack ids is the same (since the Calls vector
1961 // of tuples is sorted by the stack ids, we can just look at the next one).
1962 // If so, save them in the CallToMatchingCall map so that they get
1963 // assigned to the same context node, and skip them.
1964 bool DuplicateContextIds = false;
1965 for (unsigned J = I + 1; J < Calls.size(); J++) {
1966 auto &CallCtxInfo = Calls[J];
1967 auto &NextIds = CallCtxInfo.StackIds;
1968 if (NextIds != Ids)
1969 break;
1970 auto *NextFunc = CallCtxInfo.Func;
1971 if (NextFunc != Func) {
1972 // We have another Call with the same ids but that cannot share this
1973 // node; we must duplicate ids for it.
1974 DuplicateContextIds = true;
1975 break;
1976 }
1977 auto &NextCall = CallCtxInfo.Call;
1978 CallToMatchingCall[NextCall] = Call;
1979 // Update I so that it gets incremented correctly to skip this call.
1980 I = J;
1981 }
1982
1983 // If we don't have duplicate context ids, then we can assign all the
1984 // context ids computed for the original node sequence to this call.
1985 // If there are duplicate calls with the same stack ids then we synthesize
1986 // new context ids that are duplicates of the originals. These are
1987 // assigned to SavedContextIds, which is a reference into the map entry
1988 // for this call, allowing us to access these ids later on.
1989 OldToNewContextIds.reserve(NumEntries: OldToNewContextIds.size() +
1990 StackSequenceContextIds.size());
1991 SavedContextIds =
1992 DuplicateContextIds
1993 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1994 : StackSequenceContextIds;
1995 assert(!SavedContextIds.empty());
1996
1997 if (!DuplicateContextIds) {
1998 // Update saved last node's context ids to remove those that are
1999 // assigned to other calls, so that it is ready for the next call at
2000 // this stack id.
2001 set_subtract(S1&: LastNodeContextIds, S2: StackSequenceContextIds);
2002 if (LastNodeContextIds.empty())
2003 break;
2004 }
2005 }
2006 }
2007
2008 // Propagate the duplicate context ids over the graph.
2009 propagateDuplicateContextIds(OldToNewContextIds);
2010
2011 if (VerifyCCG)
2012 check();
2013
2014 // Now perform a post-order traversal over the graph, starting with the
2015 // allocation nodes, essentially processing nodes from callers to callees.
2016 // For any that contains an id in the map, update the graph to contain new
2017 // nodes representing any inlining at interior callsites. Note we move the
2018 // associated context ids over to the new nodes.
2019 DenseSet<const ContextNode *> Visited;
2020 for (auto &Entry : AllocationCallToContextNodeMap)
2021 assignStackNodesPostOrder(Node: Entry.second, Visited, StackIdToMatchingCalls,
2022 CallToMatchingCall);
2023 if (VerifyCCG)
2024 check();
2025}
2026
2027uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2028 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2029 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2030 return CallsiteContext.back();
2031}
2032
2033uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2034 assert(isa<CallsiteInfo *>(Call));
2035 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2036 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2037 // Need to convert index into stack id.
2038 return Index.getStackIdAtIndex(Index: CallsiteContext.back());
2039}
2040
2041static const std::string MemProfCloneSuffix = ".memprof.";
2042
2043static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2044 // We use CloneNo == 0 to refer to the original version, which doesn't get
2045 // renamed with a suffix.
2046 if (!CloneNo)
2047 return Base.str();
2048 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2049}
2050
2051static bool isMemProfClone(const Function &F) {
2052 return F.getName().contains(Other: MemProfCloneSuffix);
2053}
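// For example, getMemProfFuncName("foo", 0) yields "foo" (the original
// version), while getMemProfFuncName("foo", 2) yields "foo.memprof.2", which
// isMemProfClone would then classify as a clone.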
2054
2055std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2056 const Instruction *Call,
2057 unsigned CloneNo) const {
2058 return (Twine(Call->getFunction()->getName()) + " -> " +
2059 cast<CallBase>(Val: Call)->getCalledFunction()->getName())
2060 .str();
2061}
2062
2063std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2064 const IndexCall &Call,
2065 unsigned CloneNo) const {
2066 auto VI = FSToVIMap.find(x: Func);
2067 assert(VI != FSToVIMap.end());
2068 if (isa<AllocInfo *>(Val: Call))
2069 return (VI->second.name() + " -> alloc").str();
2070 else {
2071 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Val: Call);
2072 return (VI->second.name() + " -> " +
2073 getMemProfFuncName(Base: Callsite->Callee.name(),
2074 CloneNo: Callsite->Clones[CloneNo]))
2075 .str();
2076 }
2077}
2078
2079std::vector<uint64_t>
2080ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2081 Instruction *Call) {
2082 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2083 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2084 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2085 CallsiteContext);
2086}
2087
2088std::vector<uint64_t>
2089IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2090 assert(isa<CallsiteInfo *>(Call));
2091 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2092 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2093 return getStackIdsWithContextNodes<CallsiteInfo,
2094 SmallVector<unsigned>::const_iterator>(
2095 CallsiteContext);
2096}
2097
2098template <typename DerivedCCG, typename FuncTy, typename CallTy>
2099template <class NodeT, class IteratorT>
2100std::vector<uint64_t>
2101CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2102 CallStack<NodeT, IteratorT> &CallsiteContext) {
2103 std::vector<uint64_t> StackIds;
2104 for (auto IdOrIndex : CallsiteContext) {
2105 auto StackId = getStackId(IdOrIndex);
2106 ContextNode *Node = getNodeForStackId(StackId);
2107 if (!Node)
2108 break;
2109 StackIds.push_back(StackId);
2110 }
2111 return StackIds;
2112}
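// E.g. if the callsite context lists stack ids [A, B, C] (innermost frame
// first) but only A and B have ContextNodes from the allocation MIB
// processing, getStackIdsWithContextNodes above returns [A, B]; the trailing
// ids without nodes were presumably in the pruned, unambiguous part of the
// MIB contexts.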
2113
2114ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2115 Module &M,
2116 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2117 : Mod(M), OREGetter(OREGetter) {
2118 for (auto &F : M) {
2119 std::vector<CallInfo> CallsWithMetadata;
2120 for (auto &BB : F) {
2121 for (auto &I : BB) {
2122 if (!isa<CallBase>(Val: I))
2123 continue;
2124 if (auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof)) {
2125 CallsWithMetadata.push_back(x: &I);
2126 auto *AllocNode = addAllocNode(Call: &I, F: &F);
2127 auto *CallsiteMD = I.getMetadata(KindID: LLVMContext::MD_callsite);
2128 assert(CallsiteMD);
2129 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2130 // Add all of the MIBs and their stack nodes.
2131 for (auto &MDOp : MemProfMD->operands()) {
2132 auto *MIBMD = cast<const MDNode>(Val: MDOp);
2133 std::vector<ContextTotalSize> ContextSizeInfo;
2134 // Collect the context size information if it exists.
2135 if (MIBMD->getNumOperands() > 2) {
2136 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2137 MDNode *ContextSizePair =
2138 dyn_cast<MDNode>(Val: MIBMD->getOperand(I));
2139 assert(ContextSizePair->getNumOperands() == 2);
2140 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2141 MD: ContextSizePair->getOperand(I: 0))
2142 ->getZExtValue();
2143 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2144 MD: ContextSizePair->getOperand(I: 1))
2145 ->getZExtValue();
2146 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2147 }
2148 }
2149 MDNode *StackNode = getMIBStackNode(MIB: MIBMD);
2150 assert(StackNode);
2151 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2152 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2153 AllocNode, StackContext, CallsiteContext,
2154 AllocType: getMIBAllocType(MIB: MIBMD), ContextSizeInfo);
2155 }
2156 // If exporting the graph to dot and an allocation id of interest was
2157 // specified, record all the context ids for this allocation node.
2158 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2159 DotAllocContextIds = AllocNode->getContextIds();
2160 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2161 // Memprof and callsite metadata on memory allocations no longer
2162 // needed.
2163 I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
2164 I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2165 }
2166 // For callsite metadata, add to list for this function for later use.
2167 else if (I.getMetadata(KindID: LLVMContext::MD_callsite)) {
2168 CallsWithMetadata.push_back(x: &I);
2169 }
2170 }
2171 }
2172 if (!CallsWithMetadata.empty())
2173 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2174 }
2175
2176 if (DumpCCG) {
2177 dbgs() << "CCG before updating call stack chains:\n";
2178 dbgs() << *this;
2179 }
2180
2181 if (ExportToDot)
2182 exportToDot(Label: "prestackupdate");
2183
2184 updateStackNodes();
2185
2186 if (ExportToDot)
2187 exportToDot(Label: "poststackupdate");
2188
2189 handleCallsitesWithMultipleTargets();
2190
2191 markBackedges();
2192
2193 // Strip off remaining callsite metadata, no longer needed.
2194 for (auto &FuncEntry : FuncToCallsWithMetadata)
2195 for (auto &Call : FuncEntry.second)
2196 Call.call()->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2197}
2198
2199IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2200 ModuleSummaryIndex &Index,
2201 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2202 isPrevailing)
2203 : Index(Index), isPrevailing(isPrevailing) {
2204 for (auto &I : Index) {
2205 auto VI = Index.getValueInfo(R: I);
2206 for (auto &S : VI.getSummaryList()) {
2207 // We should only add the prevailing nodes. Otherwise we may try to clone
2208 // in a weak copy that won't be linked (and may be different than the
2209 // prevailing version).
2210 // We only keep the memprof summary on the prevailing copy now when
2211 // building the combined index, as a space optimization; however, don't
2212 // rely on this optimization. The linker doesn't resolve local linkage
2213 // values, so don't check whether those are prevailing.
2214 if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
2215 !isPrevailing(VI.getGUID(), S.get()))
2216 continue;
2217 auto *FS = dyn_cast<FunctionSummary>(Val: S.get());
2218 if (!FS)
2219 continue;
2220 std::vector<CallInfo> CallsWithMetadata;
2221 if (!FS->allocs().empty()) {
2222 for (auto &AN : FS->mutableAllocs()) {
2223 // This can happen because of recursion elimination handling that
2224 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2225 // We still added them to the summary because we need to be able to
2226 // correlate properly in applyImport in the backends.
2227 if (AN.MIBs.empty())
2228 continue;
2229 IndexCall AllocCall(&AN);
2230 CallsWithMetadata.push_back(x: AllocCall);
2231 auto *AllocNode = addAllocNode(Call: AllocCall, F: FS);
2232 // Pass an empty CallStack to the CallsiteContext (second)
2233 // parameter, since for ThinLTO we already collapsed out the inlined
2234 // stack ids on the allocation call during ModuleSummaryAnalysis.
2235 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2236 EmptyContext;
2237 unsigned I = 0;
2238 assert(!metadataMayIncludeContextSizeInfo() ||
2239 AN.ContextSizeInfos.size() == AN.MIBs.size());
2240 // Now add all of the MIBs and their stack nodes.
2241 for (auto &MIB : AN.MIBs) {
2242 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2243 StackContext(&MIB);
2244 std::vector<ContextTotalSize> ContextSizeInfo;
2245 if (!AN.ContextSizeInfos.empty()) {
2246 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2247 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2248 }
2249 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2250 AllocNode, StackContext, CallsiteContext&: EmptyContext, AllocType: MIB.AllocType,
2251 ContextSizeInfo);
2252 I++;
2253 }
2254 // If exporting the graph to dot and an allocation id of interest was
2255 // specified, record all the context ids for this allocation node.
2256 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2257 DotAllocContextIds = AllocNode->getContextIds();
2258 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2259 // Initialize version 0 on the summary alloc node to the current alloc
2260 // type, unless it has both types, in which case make it default. That
2261 // way, if we aren't able to clone, the original version always ends up
2262 // with the default allocation behavior.
2263 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocTypes: AllocNode->AllocTypes);
2264 }
2265 }
2266 // For callsite metadata, add to list for this function for later use.
2267 if (!FS->callsites().empty())
2268 for (auto &SN : FS->mutableCallsites()) {
2269 IndexCall StackNodeCall(&SN);
2270 CallsWithMetadata.push_back(x: StackNodeCall);
2271 }
2272
2273 if (!CallsWithMetadata.empty())
2274 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2275
2276 if (!FS->allocs().empty() || !FS->callsites().empty())
2277 FSToVIMap[FS] = VI;
2278 }
2279 }
2280
2281 if (DumpCCG) {
2282 dbgs() << "CCG before updating call stack chains:\n";
2283 dbgs() << *this;
2284 }
2285
2286 if (ExportToDot)
2287 exportToDot(Label: "prestackupdate");
2288
2289 updateStackNodes();
2290
2291 if (ExportToDot)
2292 exportToDot(Label: "poststackupdate");
2293
2294 handleCallsitesWithMultipleTargets();
2295
2296 markBackedges();
2297}
2298
2299template <typename DerivedCCG, typename FuncTy, typename CallTy>
2300void CallsiteContextGraph<DerivedCCG, FuncTy,
2301 CallTy>::handleCallsitesWithMultipleTargets() {
2302 // Look for and work around callsites that call multiple functions.
2303 // This can happen for indirect calls, which need better handling, and in
2304 // rarer cases (e.g. macro expansion).
2305 // TODO: To fix this for indirect calls we will want to perform speculative
2306 // devirtualization using either the normal PGO info with ICP, or using the
2307 // information in the profiled MemProf contexts. We can do this prior to
2308 // this transformation for regular LTO, and for ThinLTO we can simulate that
2309 // effect in the summary and perform the actual speculative devirtualization
2310 // while cloning in the ThinLTO backend.
2311
2312 // Keep track of the new nodes synthesized for discovered tail calls missing
2313 // from the profiled contexts.
2314 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2315
2316 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2317 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2318 auto *Node = Entry.second;
2319 assert(Node->Clones.empty());
2320 // Check all node callees and see if in the same function.
2321 // We need to check all of the calls recorded in this Node, because in some
2322 // cases we may have had multiple calls with the same debug info calling
2323 // different callees. This can happen, for example, when an object is
2324 // constructed in the parameter list - the destructor call of the object has
2325 // the same debug info (line/col) as the call the object was passed to.
2326 // Here we will prune any that don't match all callee nodes.
2327 std::vector<CallInfo> AllCalls;
2328 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2329 AllCalls.push_back(Node->Call);
2330 llvm::append_range(AllCalls, Node->MatchingCalls);
2331
2332 // First see if we can partition the calls by callee function, creating new
2333 // nodes to host each set of calls calling the same callees. This is
2334 // necessary for supporting indirect calls with ThinLTO, for which we
2335 // synthesized CallsiteInfo records for each target. They will all have the
2336 // same callsite stack ids and would be sharing a context node at this
2337 // point. We need to perform separate cloning for each, which will be
2338 // applied along with speculative devirtualization in the ThinLTO backends
2339 // as needed. Note this does not currently support looking through tail
2340 // calls, it is unclear if we need that for indirect call targets.
2341 // First partition calls by callee function. The map is indexed by function;
2342 // each value is a struct with the list of matching calls and the assigned node.
2343 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2344 continue;
2345
2346 auto It = AllCalls.begin();
2347 // Iterate through the calls until we find the first that matches.
2348 for (; It != AllCalls.end(); ++It) {
2349 auto ThisCall = *It;
2350 bool Match = true;
2351 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2352 ++EI) {
2353 auto Edge = *EI;
2354 if (!Edge->Callee->hasCall())
2355 continue;
2356 assert(NodeToCallingFunc.count(Edge->Callee));
2357 // Check if the called function matches that of the callee node.
2358 if (!calleesMatch(Call: ThisCall.call(), EI, TailCallToContextNodeMap)) {
2359 Match = false;
2360 break;
2361 }
2362 }
2363 // Found a call that matches the callee nodes, we can quit now.
2364 if (Match) {
2365 // If the first match is not the primary call on the Node, update it
2366 // now. We will update the list of matching calls further below.
2367 if (Node->Call != ThisCall) {
2368 Node->setCall(ThisCall);
2369 // We need to update the NonAllocationCallToContextNodeMap, but don't
2370 // want to do this during iteration over that map, so save the calls
2371 // that need updated entries.
2372 NewCallToNode.push_back({ThisCall, Node});
2373 }
2374 break;
2375 }
2376 }
2377 // We will update this list below (or leave it cleared if there was no
2378 // match found above).
2379 Node->MatchingCalls.clear();
2380 // If we hit the end of the AllCalls vector, no call matching the callee
2381 // nodes was found, clear the call information in the node.
2382 if (It == AllCalls.end()) {
2383 RemovedEdgesWithMismatchedCallees++;
2384 // Work around by setting Node to have a null call, so it gets
2385 // skipped during cloning. Otherwise assignFunctions will assert
2386 // because its data structures are not designed to handle this case.
2387 Node->setCall(CallInfo());
2388 continue;
2389 }
2390 // Now add back any matching calls that call the same function as the
2391 // matching primary call on Node.
2392 for (++It; It != AllCalls.end(); ++It) {
2393 auto ThisCall = *It;
2394 if (!sameCallee(Call1: Node->Call.call(), Call2: ThisCall.call()))
2395 continue;
2396 Node->MatchingCalls.push_back(ThisCall);
2397 }
2398 }
2399
2400 // Remove all mismatched nodes identified in the above loop from the node map
2401 // (checking whether they have a null call which is set above). For a
2402 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2403 // to do the removal via remove_if than by individually erasing entries above.
2404 // Also remove any entries if we updated the node's primary call above.
2405 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2406 return !it.second->hasCall() || it.second->Call != it.first;
2407 });
2408
2409 // Add entries for any new primary calls recorded above.
2410 for (auto &[Call, Node] : NewCallToNode)
2411 NonAllocationCallToContextNodeMap[Call] = Node;
2412
2413 // Add the new nodes after the above loop so that the iteration is not
2414 // invalidated.
2415 for (auto &[Call, Node] : TailCallToContextNodeMap)
2416 NonAllocationCallToContextNodeMap[Call] = Node;
2417}
2418
2419template <typename DerivedCCG, typename FuncTy, typename CallTy>
2420bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2421 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2422 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2423 // Struct to keep track of all the calls having the same callee function,
2424 // along with the context node we will eventually record as assigned to
2425 // this group of calls.
2426 struct CallsWithSameCallee {
2427 std::vector<CallInfo> Calls;
2428 ContextNode *Node = nullptr;
2429 };
2430
2431 // First partition calls by callee function. Build map from each function
2432 // to the list of matching calls.
2433 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2434 for (auto ThisCall : AllCalls) {
2435 auto *F = getCalleeFunc(Call: ThisCall.call());
2436 if (F)
2437 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2438 }
2439
2440 // Next, walk through all callee edges. For each callee node, get its
2441 // containing function and see if it was recorded in the above map (meaning we
2442 // have at least one matching call). Build another map from each callee node
2443 // with a matching call to the structure instance created above containing all
2444 // the calls.
2445 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2446 for (const auto &Edge : Node->CalleeEdges) {
2447 if (!Edge->Callee->hasCall())
2448 continue;
2449 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2450 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2451 CalleeNodeToCallInfo[Edge->Callee] =
2452 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2453 }
2454
2455 // If there are no entries in the second map, then there were no matching
2456 // calls/callees; nothing to do here. Return so we can go to the handling that
2457 // looks through tail calls.
2458 if (CalleeNodeToCallInfo.empty())
2459 return false;
2460
2461 // Walk through all callee edges again. Any and all callee edges that didn't
2462 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2463 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2464 // ignored during cloning. If it is in the map, then we use the node recorded
2465 // in that entry (creating it if needed), and move the callee edge to it.
2466 // The first callee will use the original node instead of creating a new one.
2467 // Note that any of the original calls on this node (in AllCalls) that didn't
2468 // have a callee function automatically get dropped from the node as part of
2469 // this process.
2470 ContextNode *UnmatchedCalleesNode = nullptr;
2471 // Track whether we already assigned original node to a callee.
2472 bool UsedOrigNode = false;
2473 assert(NodeToCallingFunc[Node]);
2474 // Iterate over a copy of Node's callee edges, since we may need to remove
2475 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2476 // makes it less error-prone.
2477 auto CalleeEdges = Node->CalleeEdges;
2478 for (auto &Edge : CalleeEdges) {
2479 if (!Edge->Callee->hasCall())
2480 continue;
2481
2482 // Will be updated below to point to whatever (caller) node this callee edge
2483 // should be moved to.
2484 ContextNode *CallerNodeToUse = nullptr;
2485
2486 // Handle the case where there were no matching calls first. Move this
2487 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2488 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2489 if (!UnmatchedCalleesNode)
2490 UnmatchedCalleesNode =
2491 createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
2492 CallerNodeToUse = UnmatchedCalleesNode;
2493 } else {
2494 // Look up the information recorded for this callee node, and use the
2495 // recorded caller node (creating it if needed).
2496 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2497 if (!Info->Node) {
2498 // If we haven't assigned any callees to the original node use it.
2499 if (!UsedOrigNode) {
2500 Info->Node = Node;
2501 // Clear the set of matching calls which will be updated below.
2502 Node->MatchingCalls.clear();
2503 UsedOrigNode = true;
2504 } else
2505 Info->Node =
2506 createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
2507 assert(!Info->Calls.empty());
2508 // The first call becomes the primary call for this caller node, and the
2509 // rest go in the matching calls list.
2510 Info->Node->setCall(Info->Calls.front());
2511 llvm::append_range(Info->Node->MatchingCalls,
2512 llvm::drop_begin(Info->Calls));
2513 // Save the primary call to node correspondence so that we can update
2514 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2515 // caller of this function.
2516 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2517 }
2518 CallerNodeToUse = Info->Node;
2519 }
2520
2521 // Don't need to move the edge if we are using the original node.
2522 if (CallerNodeToUse == Node)
2523 continue;
2524
2525 moveCalleeEdgeToNewCaller(Edge, NewCaller: CallerNodeToUse);
2526 }
2527 // Now that we are done moving edges, clean up any caller edges that ended
2528 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2529 // caller edges from Node are replicated onto the new callers, and it
2530 // simplifies the handling to leave them until we have moved all
2531 // edges/context ids.
2532 for (auto &I : CalleeNodeToCallInfo)
2533 removeNoneTypeCallerEdges(Node: I.second->Node);
2534 if (UnmatchedCalleesNode)
2535 removeNoneTypeCallerEdges(Node: UnmatchedCalleesNode);
2536 removeNoneTypeCallerEdges(Node);
2537
2538 return true;
2539}
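// Hypothetical example for partitionCallsByCallee above: if Node's calls are
// {c1 calling F, c2 calling G} and its callee edges lead into nodes contained
// in F and G, then one partition reuses Node itself (the first callee edge
// processed), the other gets a new node that receives the edges into its
// function, and any callee edge into a function with no matching call is
// parked on UnmatchedCalleesNode (which gets a null call so cloning ignores
// it).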
2540
2541uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2542 // In the Module (IR) case this is already the Id.
2543 return IdOrIndex;
2544}
2545
2546uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2547 // In the Index case this is an index into the stack id list in the summary
2548 // index, convert it to an Id.
2549 return Index.getStackIdAtIndex(Index: IdOrIndex);
2550}
2551
2552template <typename DerivedCCG, typename FuncTy, typename CallTy>
2553bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2554 CallTy Call, EdgeIter &EI,
2555 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2556 auto Edge = *EI;
2557 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2558 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2559 // Will be populated in order of callee to caller if we find a chain of tail
2560 // calls between the profiled caller and callee.
2561 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2562 if (!calleeMatchesFunc(Call, Func: ProfiledCalleeFunc, CallerFunc,
2563 FoundCalleeChain))
2564 return false;
2565
2566 // The usual case where the profiled callee matches that of the IR/summary.
2567 if (FoundCalleeChain.empty())
2568 return true;
2569
2570 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2571 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2572 // If there is already an edge between these nodes, simply update it and
2573 // return.
2574 if (CurEdge) {
2575 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2576 CurEdge->AllocTypes |= Edge->AllocTypes;
2577 return;
2578 }
2579 // Otherwise, create a new edge and insert it into the caller and callee
2580 // lists.
2581 auto NewEdge = std::make_shared<ContextEdge>(
2582 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2583 Callee->CallerEdges.push_back(NewEdge);
2584 if (Caller == Edge->Caller) {
2585 // If we are inserting the new edge into the current edge's caller, insert
2586 // the new edge before the current iterator position, and then increment
2587 // back to the current edge.
2588 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2589 ++EI;
2590 assert(*EI == Edge &&
2591 "Iterator position not restored after insert and increment");
2592 } else
2593 Caller->CalleeEdges.push_back(NewEdge);
2594 };
2595
2596 // Create new nodes for each found callee and connect in between the profiled
2597 // caller and callee.
2598 auto *CurCalleeNode = Edge->Callee;
2599 for (auto &[NewCall, Func] : FoundCalleeChain) {
2600 ContextNode *NewNode = nullptr;
2601 // First check if we have already synthesized a node for this tail call.
2602 if (TailCallToContextNodeMap.count(NewCall)) {
2603 NewNode = TailCallToContextNodeMap[NewCall];
2604 NewNode->AllocTypes |= Edge->AllocTypes;
2605 } else {
2606 FuncToCallsWithMetadata[Func].push_back({NewCall});
2607 // Create Node and record node info.
2608 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2609 TailCallToContextNodeMap[NewCall] = NewNode;
2610 NewNode->AllocTypes = Edge->AllocTypes;
2611 }
2612
2613 // Hook up node to its callee node
2614 AddEdge(NewNode, CurCalleeNode);
2615
2616 CurCalleeNode = NewNode;
2617 }
2618
2619 // Hook up edge's original caller to new callee node.
2620 AddEdge(Edge->Caller, CurCalleeNode);
2621
2622#ifndef NDEBUG
2623 // Save this because Edge's fields get cleared below when removed.
2624 auto *Caller = Edge->Caller;
2625#endif
2626
2627 // Remove old edge
2628 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2629
2630 // To simplify the increment of EI in the caller, subtract one from EI.
2631 // In the final AddEdge call we would have either added a new callee edge
2632 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2633 // that there is at least one callee edge.
2634 assert(!Caller->CalleeEdges.empty());
2635 --EI;
2636
2637 return true;
2638}
2639
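// Recursively walk tail calls in the IR starting at CurCallee, looking for a
// unique chain that reaches ProfiledCallee within TailCallSearchDepth levels.
// Returns true and appends the chain's callsites to FoundCalleeChain only if
// exactly one such chain exists; otherwise FoundMultipleCalleeChains is set so
// the caller can avoid cloning based on an ambiguous match.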
2640bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2641 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2642 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2643 bool &FoundMultipleCalleeChains) {
2644 // Stop recursive search if we have already explored the maximum specified
2645 // depth.
2646 if (Depth > TailCallSearchDepth)
2647 return false;
2648
2649 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2650 FoundCalleeChain.push_back({Callsite, F});
2651 };
2652
2653 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2654 if (!CalleeFunc) {
2655 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2656 assert(Alias);
2657 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2658 assert(CalleeFunc);
2659 }
2660
2661 // Look for tail calls in this function, and check if they either call the
2662 // profiled callee directly, or indirectly (via a recursive search).
2663 // Only succeed if there is a single unique tail call chain found between the
2664 // profiled caller and callee, otherwise we could perform incorrect cloning.
2665 bool FoundSingleCalleeChain = false;
2666 for (auto &BB : *CalleeFunc) {
2667 for (auto &I : BB) {
2668 auto *CB = dyn_cast<CallBase>(&I);
2669 if (!CB || !CB->isTailCall())
2670 continue;
2671 auto *CalledValue = CB->getCalledOperand();
2672 auto *CalledFunction = CB->getCalledFunction();
2673 if (CalledValue && !CalledFunction) {
2674 CalledValue = CalledValue->stripPointerCasts();
2675 // Stripping pointer casts can reveal a called function.
2676 CalledFunction = dyn_cast<Function>(CalledValue);
2677 }
2678 // Check if this is an alias to a function. If so, get the
2679 // called aliasee for the checks below.
2680 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2681 assert(!CalledFunction &&
2682 "Expected null called function in callsite for alias");
2683 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2684 }
2685 if (!CalledFunction)
2686 continue;
2687 if (CalledFunction == ProfiledCallee) {
2688 if (FoundSingleCalleeChain) {
2689 FoundMultipleCalleeChains = true;
2690 return false;
2691 }
2692 FoundSingleCalleeChain = true;
2693 FoundProfiledCalleeCount++;
2694 FoundProfiledCalleeDepth += Depth;
2695 if (Depth > FoundProfiledCalleeMaxDepth)
2696 FoundProfiledCalleeMaxDepth = Depth;
2697 SaveCallsiteInfo(&I, CalleeFunc);
2698 } else if (findProfiledCalleeThroughTailCalls(
2699 ProfiledCallee, CalledFunction, Depth + 1,
2700 FoundCalleeChain, FoundMultipleCalleeChains)) {
2701 // findProfiledCalleeThroughTailCalls should not have returned
2702 // true if FoundMultipleCalleeChains.
2703 assert(!FoundMultipleCalleeChains);
2704 if (FoundSingleCalleeChain) {
2705 FoundMultipleCalleeChains = true;
2706 return false;
2707 }
2708 FoundSingleCalleeChain = true;
2709 SaveCallsiteInfo(&I, CalleeFunc);
2710 } else if (FoundMultipleCalleeChains)
2711 return false;
2712 }
2713 }
2714
2715 return FoundSingleCalleeChain;
2716}
2717
2718const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2719 auto *CB = dyn_cast<CallBase>(Call);
2720 if (!CB->getCalledOperand() || CB->isIndirectCall())
2721 return nullptr;
2722 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2723 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2724 if (Alias)
2725 return dyn_cast<Function>(Alias->getAliasee());
2726 return dyn_cast<Function>(CalleeVal);
2727}
2728
2729bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2730 Instruction *Call, const Function *Func, const Function *CallerFunc,
2731 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2732 auto *CB = dyn_cast<CallBase>(Call);
2733 if (!CB->getCalledOperand() || CB->isIndirectCall())
2734 return false;
2735 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2736 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2737 if (CalleeFunc == Func)
2738 return true;
2739 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2740 if (Alias && Alias->getAliasee() == Func)
2741 return true;
2742
2743 // Recursively search for the profiled callee through tail calls starting with
2744 // the actual Callee. The discovered tail call chain is saved in
2745 // FoundCalleeChain, and we will fixup the graph to include these callsites
2746 // after returning.
2747 // FIXME: We will currently redo the same recursive walk if we find the same
2748 // mismatched callee from another callsite. We can improve this with more
2749 // bookkeeping of the created chain of new nodes for each mismatch.
2750 unsigned Depth = 1;
2751 bool FoundMultipleCalleeChains = false;
2752 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2753 FoundCalleeChain,
2754 FoundMultipleCalleeChains)) {
2755 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2756 << Func->getName() << " from " << CallerFunc->getName()
2757 << " that actually called " << CalleeVal->getName()
2758 << (FoundMultipleCalleeChains
2759 ? " (found multiple possible chains)"
2760 : "")
2761 << "\n");
2762 if (FoundMultipleCalleeChains)
2763 FoundProfiledCalleeNonUniquelyCount++;
2764 return false;
2765 }
2766
2767 return true;
2768}
2769
2770bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2771 Instruction *Call2) {
2772 auto *CB1 = cast<CallBase>(Call1);
2773 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2774 return false;
2775 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2776 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2777 auto *CB2 = cast<CallBase>(Call2);
2778 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2779 return false;
2780 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2781 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2782 return CalleeFunc1 == CalleeFunc2;
2783}
2784
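// Summary-based (ThinLTO) counterpart of the tail call search above: instead
// of scanning IR instructions, walk the hasTailCall() call edges recorded on
// each local or prevailing copy's FunctionSummary, and synthesize CallsiteInfo
// records (with empty stack ids) for any callsites discovered along the chain.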
2785bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2786 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2787 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2788 bool &FoundMultipleCalleeChains) {
2789 // Stop recursive search if we have already explored the maximum specified
2790 // depth.
2791 if (Depth > TailCallSearchDepth)
2792 return false;
2793
2794 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2795 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2796 // been synthesized.
2797 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2798 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2799 // StackIds is empty (we don't have debug info available in the index for
2800 // these callsites)
2801 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2802 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2803 CallsiteInfo *NewCallsiteInfo =
2804 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2805 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2806 };
2807
2808 // Look for tail calls in this function, and check if they either call the
2809 // profiled callee directly, or indirectly (via a recursive search).
2810 // Only succeed if there is a single unique tail call chain found between the
2811 // profiled caller and callee, otherwise we could perform incorrect cloning.
2812 bool FoundSingleCalleeChain = false;
2813 for (auto &S : CurCallee.getSummaryList()) {
2814 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2815 !isPrevailing(CurCallee.getGUID(), S.get()))
2816 continue;
2817 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2818 if (!FS)
2819 continue;
2820 auto FSVI = CurCallee;
2821 auto *AS = dyn_cast<AliasSummary>(S.get());
2822 if (AS)
2823 FSVI = AS->getAliaseeVI();
2824 for (auto &CallEdge : FS->calls()) {
2825 if (!CallEdge.second.hasTailCall())
2826 continue;
2827 if (CallEdge.first == ProfiledCallee) {
2828 if (FoundSingleCalleeChain) {
2829 FoundMultipleCalleeChains = true;
2830 return false;
2831 }
2832 FoundSingleCalleeChain = true;
2833 FoundProfiledCalleeCount++;
2834 FoundProfiledCalleeDepth += Depth;
2835 if (Depth > FoundProfiledCalleeMaxDepth)
2836 FoundProfiledCalleeMaxDepth = Depth;
2837 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2838 // Add FS to FSToVIMap in case it isn't already there.
2839 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2840 FSToVIMap[FS] = FSVI;
2841 } else if (findProfiledCalleeThroughTailCalls(
2842 ProfiledCallee, CallEdge.first, Depth + 1,
2843 FoundCalleeChain, FoundMultipleCalleeChains)) {
2844 // findProfiledCalleeThroughTailCalls should not have returned
2845 // true if FoundMultipleCalleeChains.
2846 assert(!FoundMultipleCalleeChains);
2847 if (FoundSingleCalleeChain) {
2848 FoundMultipleCalleeChains = true;
2849 return false;
2850 }
2851 FoundSingleCalleeChain = true;
2852 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2853 // Add FS to FSToVIMap in case it isn't already there.
2854 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2855 FSToVIMap[FS] = FSVI;
2856 } else if (FoundMultipleCalleeChains)
2857 return false;
2858 }
2859 }
2860
2861 return FoundSingleCalleeChain;
2862}
2863
2864const FunctionSummary *
2865IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2866 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2867 if (Callee.getSummaryList().empty())
2868 return nullptr;
2869 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2870}
2871
2872bool IndexCallsiteContextGraph::calleeMatchesFunc(
2873 IndexCall &Call, const FunctionSummary *Func,
2874 const FunctionSummary *CallerFunc,
2875 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2876 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2877 // If there is no summary list then this is a call to an externally defined
2878 // symbol.
2879 AliasSummary *Alias =
2880 Callee.getSummaryList().empty()
2881 ? nullptr
2882 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2883 assert(FSToVIMap.count(Func));
2884 auto FuncVI = FSToVIMap[Func];
2885 if (Callee == FuncVI ||
2886 // If callee is an alias, check the aliasee, since only function
2887 // summary base objects will contain the stack node summaries and thus
2888 // get a context node.
2889 (Alias && Alias->getAliaseeVI() == FuncVI))
2890 return true;
2891
2892 // Recursively search for the profiled callee through tail calls starting with
2893 // the actual Callee. The discovered tail call chain is saved in
2894 // FoundCalleeChain, and we will fixup the graph to include these callsites
2895 // after returning.
2896 // FIXME: We will currently redo the same recursive walk if we find the same
2897 // mismatched callee from another callsite. We can improve this with more
2898 // bookkeeping of the created chain of new nodes for each mismatch.
2899 unsigned Depth = 1;
2900 bool FoundMultipleCalleeChains = false;
2901 if (!findProfiledCalleeThroughTailCalls(
2902 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2903 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2904 << " from " << FSToVIMap[CallerFunc]
2905 << " that actually called " << Callee
2906 << (FoundMultipleCalleeChains
2907 ? " (found multiple possible chains)"
2908 : "")
2909 << "\n");
2910 if (FoundMultipleCalleeChains)
2911 FoundProfiledCalleeNonUniquelyCount++;
2912 return false;
2913 }
2914
2915 return true;
2916}
2917
2918bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2919 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2920 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2921 return Callee1 == Callee2;
2922}
2923
2924template <typename DerivedCCG, typename FuncTy, typename CallTy>
2925void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2926 const {
2927 print(dbgs());
2928 dbgs() << "\n";
2929}
2930
2931template <typename DerivedCCG, typename FuncTy, typename CallTy>
2932void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2933 raw_ostream &OS) const {
2934 OS << "Node " << this << "\n";
2935 OS << "\t";
2936 printCall(OS);
2937 if (Recursive)
2938 OS << " (recursive)";
2939 OS << "\n";
2940 if (!MatchingCalls.empty()) {
2941 OS << "\tMatchingCalls:\n";
2942 for (auto &MatchingCall : MatchingCalls) {
2943 OS << "\t";
2944 MatchingCall.print(OS);
2945 OS << "\n";
2946 }
2947 }
2948 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2949 OS << "\tContextIds:";
2950 // Make a copy of the computed context ids that we can sort for stability.
2951 auto ContextIds = getContextIds();
2952 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2953 std::sort(SortedIds.begin(), SortedIds.end());
2954 for (auto Id : SortedIds)
2955 OS << " " << Id;
2956 OS << "\n";
2957 OS << "\tCalleeEdges:\n";
2958 for (auto &Edge : CalleeEdges)
2959 OS << "\t\t" << *Edge << "\n";
2960 OS << "\tCallerEdges:\n";
2961 for (auto &Edge : CallerEdges)
2962 OS << "\t\t" << *Edge << "\n";
2963 if (!Clones.empty()) {
2964 OS << "\tClones: " << llvm::interleaved(Clones) << "\n";
2965 } else if (CloneOf) {
2966 OS << "\tClone of " << CloneOf << "\n";
2967 }
2968}
2969
2970template <typename DerivedCCG, typename FuncTy, typename CallTy>
2971void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2972 const {
2973 print(dbgs());
2974 dbgs() << "\n";
2975}
2976
2977template <typename DerivedCCG, typename FuncTy, typename CallTy>
2978void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2979 raw_ostream &OS) const {
2980 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2981 << (IsBackedge ? " (BE)" : "")
2982 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2983 OS << " ContextIds:";
2984 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2985 std::sort(SortedIds.begin(), SortedIds.end());
2986 for (auto Id : SortedIds)
2987 OS << " " << Id;
2988}
2989
2990template <typename DerivedCCG, typename FuncTy, typename CallTy>
2991void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2992 print(dbgs());
2993}
2994
2995template <typename DerivedCCG, typename FuncTy, typename CallTy>
2996void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2997 raw_ostream &OS) const {
2998 OS << "Callsite Context Graph:\n";
2999 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3000 for (const auto Node : nodes<GraphType>(this)) {
3001 if (Node->isRemoved())
3002 continue;
3003 Node->print(OS);
3004 OS << "\n";
3005 }
3006}
3007
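// For each allocation node, print one "MemProf hinting:" line per recorded
// full context, showing its total size, the allocation type it ended up with
// after cloning, and the internal context id, to aid analysis of the hinting
// decisions.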
3008template <typename DerivedCCG, typename FuncTy, typename CallTy>
3009void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3010 raw_ostream &OS) const {
3011 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3012 for (const auto Node : nodes<GraphType>(this)) {
3013 if (Node->isRemoved())
3014 continue;
3015 if (!Node->IsAllocation)
3016 continue;
3017 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3018 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3019 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3020 std::sort(SortedIds.begin(), SortedIds.end());
3021 for (auto Id : SortedIds) {
3022 auto TypeI = ContextIdToAllocationType.find(Id);
3023 assert(TypeI != ContextIdToAllocationType.end());
3024 auto CSI = ContextIdToContextSizeInfos.find(Id);
3025 if (CSI != ContextIdToContextSizeInfos.end()) {
3026 for (auto &Info : CSI->second) {
3027 OS << "MemProf hinting: "
3028 << getAllocTypeString((uint8_t)TypeI->second)
3029 << " full allocation context " << Info.FullStackId
3030 << " with total size " << Info.TotalSize << " is "
3031 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3032 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3033 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3034 << " due to cold byte percent";
3035 // Print the internal context id to aid debugging and visualization.
3036 OS << " (context id " << Id << ")";
3037 OS << "\n";
3038 }
3039 }
3040 }
3041 }
3042}
3043
3044template <typename DerivedCCG, typename FuncTy, typename CallTy>
3045void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3046 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3047 for (const auto Node : nodes<GraphType>(this)) {
3048 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3049 for (auto &Edge : Node->CallerEdges)
3050 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3051 }
3052}
3053
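// GraphTraits specialization so that generic graph utilities (e.g. the
// nodes<GraphType>() iteration used above and WriteGraph for dot export) can
// traverse the context graph: nodes are owned by NodeOwner and a node's
// children are the callees reachable through its CalleeEdges.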
3054template <typename DerivedCCG, typename FuncTy, typename CallTy>
3055struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3056 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3057 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3058
3059 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3060 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3061
3062 using nodes_iterator =
3063 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3064 decltype(&getNode)>;
3065
3066 static nodes_iterator nodes_begin(GraphType G) {
3067 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3068 }
3069
3070 static nodes_iterator nodes_end(GraphType G) {
3071 return nodes_iterator(G->NodeOwner.end(), &getNode);
3072 }
3073
3074 static NodeRef getEntryNode(GraphType G) {
3075 return G->NodeOwner.begin()->get();
3076 }
3077
3078 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3079 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3080 GetCallee(const EdgePtrTy &P) {
3081 return P->Callee;
3082 }
3083
3084 using ChildIteratorType =
3085 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3086 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3087 decltype(&GetCallee)>;
3088
3089 static ChildIteratorType child_begin(NodeRef N) {
3090 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3091 }
3092
3093 static ChildIteratorType child_end(NodeRef N) {
3094 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3095 }
3096};
3097
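// DOTGraphTraits specialization controlling the dot export: node labels carry
// the original stack/allocation id and call, colors encode the allocation
// types (cold, not cold, or both), and nodes/edges containing the context ids
// selected via the highlighting options are drawn larger and with heavier
// arrows.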
3098template <typename DerivedCCG, typename FuncTy, typename CallTy>
3099struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3100 : public DefaultDOTGraphTraits {
3101 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3102 // If the user requested the full graph to be exported but provided an
3103 // allocation id, or gave a context id while requesting more than just that
3104 // specific context to be exported, enable highlighting of the nodes and
3105 // edges containing the contexts of interest.
3106 DoHighlight =
3107 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3108 (ContextIdForDot.getNumOccurrences() &&
3109 DotGraphScope != DotScope::Context);
3110 }
3111
3112 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3113 using GTraits = GraphTraits<GraphType>;
3114 using NodeRef = typename GTraits::NodeRef;
3115 using ChildIteratorType = typename GTraits::ChildIteratorType;
3116
3117 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3118 std::string LabelString =
3119 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3120 Twine(Node->OrigStackOrAllocId))
3121 .str();
3122 LabelString += "\n";
3123 if (Node->hasCall()) {
3124 auto Func = G->NodeToCallingFunc.find(Node);
3125 assert(Func != G->NodeToCallingFunc.end());
3126 LabelString +=
3127 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3128 } else {
3129 LabelString += "null call";
3130 if (Node->Recursive)
3131 LabelString += " (recursive)";
3132 else
3133 LabelString += " (external)";
3134 }
3135 return LabelString;
3136 }
3137
3138 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3139 auto ContextIds = Node->getContextIds();
3140 // If highlighting enabled, see if this node contains any of the context ids
3141 // of interest. If so, it will use a different color and a larger fontsize
3142 // (which makes the node larger as well).
3143 bool Highlight = false;
3144 if (DoHighlight) {
3145 assert(ContextIdForDot.getNumOccurrences() ||
3146 AllocIdForDot.getNumOccurrences());
3147 if (ContextIdForDot.getNumOccurrences())
3148 Highlight = ContextIds.contains(ContextIdForDot);
3149 else
3150 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3151 }
3152 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3153 getContextIds(ContextIds) + "\"")
3154 .str();
3155 // Default fontsize is 14
3156 if (Highlight)
3157 AttributeString += ",fontsize=\"30\"";
3158 AttributeString +=
3159 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3160 .str();
3161 if (Node->CloneOf) {
3162 AttributeString += ",color=\"blue\"";
3163 AttributeString += ",style=\"filled,bold,dashed\"";
3164 } else
3165 AttributeString += ",style=\"filled\"";
3166 return AttributeString;
3167 }
3168
3169 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3170 GraphType G) {
3171 auto &Edge = *(ChildIter.getCurrent());
3172 // If highlighting enabled, see if this edge contains any of the context ids
3173 // of interest. If so, it will use a different color and a heavier arrow
3174 // size and weight (the larger weight makes the highlighted path
3175 // straighter).
3176 bool Highlight = false;
3177 if (DoHighlight) {
3178 assert(ContextIdForDot.getNumOccurrences() ||
3179 AllocIdForDot.getNumOccurrences());
3180 if (ContextIdForDot.getNumOccurrences())
3181 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3182 else
3183 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3184 }
3185 auto Color = getColor(Edge->AllocTypes, Highlight);
3186 std::string AttributeString =
3187 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3188 // fillcolor is the arrow head and color is the line
3189 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3190 "\"")
3191 .str();
3192 if (Edge->IsBackedge)
3193 AttributeString += ",style=\"dotted\"";
3194 // Default penwidth and weight are both 1.
3195 if (Highlight)
3196 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3197 return AttributeString;
3198 }
3199
3200 // Since the NodeOwners list includes nodes that are no longer connected to
3201 // the graph, skip them here.
3202 static bool isNodeHidden(NodeRef Node, GraphType G) {
3203 if (Node->isRemoved())
3204 return true;
3205 // If a scope smaller than the full graph was requested, see if this node
3206 // contains any of the context ids of interest.
3207 if (DotGraphScope == DotScope::Alloc)
3208 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3209 if (DotGraphScope == DotScope::Context)
3210 return !Node->getContextIds().contains(ContextIdForDot);
3211 return false;
3212 }
3213
3214private:
3215 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3216 std::string IdString = "ContextIds:";
3217 if (ContextIds.size() < 100) {
3218 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3219 std::sort(SortedIds.begin(), SortedIds.end());
3220 for (auto Id : SortedIds)
3221 IdString += (" " + Twine(Id)).str();
3222 } else {
3223 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3224 }
3225 return IdString;
3226 }
3227
3228 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3229 // If DoHighlight is not enabled, we want to use the highlight colors for
3230 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3231 // both compatible with the color scheme before highlighting was supported,
3232 // and for the NotCold+Cold color the non-highlight color is a bit more
3233 // readable.
3234 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3235 // Color "brown1" actually looks like a lighter red.
3236 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3237 if (AllocTypes == (uint8_t)AllocationType::Cold)
3238 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3239 if (AllocTypes ==
3240 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3241 return Highlight ? "magenta" : "mediumorchid1";
3242 return "gray";
3243 }
3244
3245 static std::string getNodeId(NodeRef Node) {
3246 std::stringstream SStream;
3247 SStream << std::hex << "N0x" << (unsigned long long)Node;
3248 std::string Result = SStream.str();
3249 return Result;
3250 }
3251
3252 // True if we should highlight a specific context or allocation's contexts in
3253 // the emitted graph.
3254 static bool DoHighlight;
3255};
3256
3257template <typename DerivedCCG, typename FuncTy, typename CallTy>
3258bool DOTGraphTraits<
3259 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3260 false;
3261
3262template <typename DerivedCCG, typename FuncTy, typename CallTy>
3263void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3264 std::string Label) const {
3265 WriteGraph(this, "", false, Label,
3266 DotFilePathPrefix + "ccg." + Label + ".dot");
3267}
3268
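// Create a new clone of Edge's callee and move the edge, or just the given
// subset of its context ids, onto the clone. Roughly, when all of the edge's
// ids move and C is a hypothetical caller:
//   Before:  C --Edge--> Node        After:  C --Edge--> Clone
// Other callers of Node are untouched; the moved context ids on Node's callee
// edges are transferred to corresponding callee edges out of the clone.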
3269template <typename DerivedCCG, typename FuncTy, typename CallTy>
3270typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3271CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3272 const std::shared_ptr<ContextEdge> &Edge,
3273 DenseSet<uint32_t> ContextIdsToMove) {
3274 ContextNode *Node = Edge->Callee;
3275 assert(NodeToCallingFunc.count(Node));
3276 ContextNode *Clone =
3277 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3278 Node->addClone(Clone);
3279 Clone->MatchingCalls = Node->MatchingCalls;
3280 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3281 ContextIdsToMove);
3282 return Clone;
3283}
3284
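// Move all of Edge's context ids, or the subset in ContextIdsToMove, from its
// current callee onto NewCallee (a clone of that callee). Reuses a parallel
// edge from Edge's caller to NewCallee if one already exists, and transfers
// the moved context ids from the old callee's outgoing callee edges onto
// matching edges out of NewCallee, creating them if needed.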
3285template <typename DerivedCCG, typename FuncTy, typename CallTy>
3286void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3287 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3288 ContextNode *NewCallee, bool NewClone,
3289 DenseSet<uint32_t> ContextIdsToMove) {
3290 // NewCallee and Edge's current callee must be clones of the same original
3291 // node (Edge's current callee may be the original node too).
3292 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3293
3294 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3295
3296 ContextNode *OldCallee = Edge->Callee;
3297
3298 // We might already have an edge to the new callee from earlier cloning for a
3299 // different allocation. If one exists we will reuse it.
3300 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3301
3302 // Callers will pass an empty ContextIdsToMove set when they want to move the
3303 // edge. Copy in Edge's ids for simplicity.
3304 if (ContextIdsToMove.empty())
3305 ContextIdsToMove = Edge->getContextIds();
3306
3307 // If we are moving all of Edge's ids, then just move the whole Edge.
3308 // Otherwise only move the specified subset, to a new edge if needed.
3309 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3310 // First, update the alloc types on New Callee from Edge.
3311 // Do this before we potentially clear Edge's fields below!
3312 NewCallee->AllocTypes |= Edge->AllocTypes;
3313 // Moving the whole Edge.
3314 if (ExistingEdgeToNewCallee) {
3315 // Since we already have an edge to NewCallee, simply move the ids
3316 // onto it, and remove the existing Edge.
3317 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3318 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3319 assert(Edge->ContextIds == ContextIdsToMove);
3320 removeEdgeFromGraph(Edge.get());
3321 } else {
3322 // Otherwise just reconnect Edge to NewCallee.
3323 Edge->Callee = NewCallee;
3324 NewCallee->CallerEdges.push_back(Edge);
3325 // Remove it from callee where it was previously connected.
3326 OldCallee->eraseCallerEdge(Edge.get());
3327 // Don't need to update Edge's context ids since we are simply
3328 // reconnecting it.
3329 }
3330 } else {
3331 // Only moving a subset of Edge's ids.
3332 // Compute the alloc type of the subset of ids being moved.
3333 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3334 if (ExistingEdgeToNewCallee) {
3335 // Since we already have an edge to NewCallee, simply move the ids
3336 // onto it.
3337 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3338 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3339 } else {
3340 // Otherwise, create a new edge to NewCallee for the ids being moved.
3341 auto NewEdge = std::make_shared<ContextEdge>(
3342 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3343 Edge->Caller->CalleeEdges.push_back(NewEdge);
3344 NewCallee->CallerEdges.push_back(NewEdge);
3345 }
3346 // In either case, need to update the alloc types on NewCallee, and remove
3347 // those ids and update the alloc type on the original Edge.
3348 NewCallee->AllocTypes |= CallerEdgeAllocType;
3349 set_subtract(Edge->ContextIds, ContextIdsToMove);
3350 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3351 }
3352 // Now walk the old callee node's callee edges and move Edge's context ids
3353 // over to the corresponding edge into the clone (which is created here if
3354 // this is a newly created clone).
3355 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3356 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3357 // If this is a direct recursion edge, use NewCallee (the clone) as the
3358 // callee as well, so that any edge updated/created here is also direct
3359 // recursive.
3360 if (CalleeToUse == OldCallee) {
3361 // If this is a recursive edge, see if we already moved a recursive edge
3362 // (which would have to have been this one) - if we were only moving a
3363 // subset of context ids it would still be on OldCallee.
3364 if (EdgeIsRecursive) {
3365 assert(OldCalleeEdge == Edge);
3366 continue;
3367 }
3368 CalleeToUse = NewCallee;
3369 }
3370 // The context ids moving to the new callee are the subset of this edge's
3371 // context ids and the context ids on the caller edge being moved.
3372 DenseSet<uint32_t> EdgeContextIdsToMove =
3373 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3374 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3375 OldCalleeEdge->AllocTypes =
3376 computeAllocType(OldCalleeEdge->getContextIds());
3377 if (!NewClone) {
3378 // Update context ids / alloc type on corresponding edge to NewCallee.
3379 // There is a chance this may not exist if we are reusing an existing
3380 // clone, specifically during function assignment, where we would have
3381 // removed none type edges after creating the clone. If we can't find
3382 // a corresponding edge there, fall through to the cloning below.
3383 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3384 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3385 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3386 continue;
3387 }
3388 }
3389 auto NewEdge = std::make_shared<ContextEdge>(
3390 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3391 EdgeContextIdsToMove);
3392 NewCallee->CalleeEdges.push_back(NewEdge);
3393 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3394 }
3395 // Recompute the node alloc type now that its callee edges have been
3396 // updated (since we will compute from those edges).
3397 OldCallee->AllocTypes = OldCallee->computeAllocType();
3398 // OldCallee alloc type should be None iff its context id set is now empty.
3399 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3400 OldCallee->emptyContextIds());
3401 if (VerifyCCG) {
3402 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3403 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3404 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3405 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3406 /*CheckEdges=*/false);
3407 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3408 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3409 /*CheckEdges=*/false);
3410 }
3411}
3412
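// Move the given callee edge so that it originates from NewCaller instead of
// its current caller, merging into an existing parallel edge if NewCaller
// already calls the same callee. Direct recursive edges are redirected so
// they stay recursive on NewCaller. Unless the edge was recursive, the moved
// context ids are also transferred from the old caller's incoming caller
// edges onto corresponding caller edges of NewCaller.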
3413template <typename DerivedCCG, typename FuncTy, typename CallTy>
3414void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3415 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3416 ContextNode *NewCaller) {
3417 auto *OldCallee = Edge->Callee;
3418 auto *NewCallee = OldCallee;
3419 // If this edge was direct recursive, make any new/updated edge also direct
3420 // recursive to NewCaller.
3421 bool Recursive = Edge->Caller == Edge->Callee;
3422 if (Recursive)
3423 NewCallee = NewCaller;
3424
3425 ContextNode *OldCaller = Edge->Caller;
3426 OldCaller->eraseCalleeEdge(Edge.get());
3427
3428 // We might already have an edge to the new caller. If one exists we will
3429 // reuse it.
3430 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3431
3432 if (ExistingEdgeToNewCaller) {
3433 // Since we already have an edge to NewCaller, simply move the ids
3434 // onto it, and remove the existing Edge.
3435 ExistingEdgeToNewCaller->getContextIds().insert_range(
3436 Edge->getContextIds());
3437 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3438 Edge->ContextIds.clear();
3439 Edge->AllocTypes = (uint8_t)AllocationType::None;
3440 OldCallee->eraseCallerEdge(Edge.get());
3441 } else {
3442 // Otherwise just reconnect Edge to NewCaller.
3443 Edge->Caller = NewCaller;
3444 NewCaller->CalleeEdges.push_back(Edge);
3445 if (Recursive) {
3446 assert(NewCallee == NewCaller);
3447 // In the case of (direct) recursive edges, we update the callee as well
3448 // so that it becomes recursive on the new caller.
3449 Edge->Callee = NewCallee;
3450 NewCallee->CallerEdges.push_back(Edge);
3451 OldCallee->eraseCallerEdge(Edge.get());
3452 }
3453 // Don't need to update Edge's context ids since we are simply
3454 // reconnecting it.
3455 }
3456 // In either case, need to update the alloc types on New Caller.
3457 NewCaller->AllocTypes |= Edge->AllocTypes;
3458
3459 // Now walk the old caller node's caller edges and move Edge's context ids
3460 // over to the corresponding edge into the node (which is created here if
3461 // this is a newly created node). We can tell whether this is a newly created
3462 // node by seeing if it has any caller edges yet.
3463#ifndef NDEBUG
3464 bool IsNewNode = NewCaller->CallerEdges.empty();
3465#endif
3466 // If we just moved a direct recursive edge, presumably its context ids should
3467 // also flow out of OldCaller via some other non-recursive callee edge. We
3468 // don't want to remove the recursive context ids from other caller edges yet,
3469 // otherwise the context ids get into an inconsistent state on OldCaller.
3470 // We will update these context ids on the non-recursive caller edge when and
3471 // if they are updated on the non-recursive callee.
3472 if (!Recursive) {
3473 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3474 auto OldCallerCaller = OldCallerEdge->Caller;
3475 // The context ids moving to the new caller are the subset of this edge's
3476 // context ids and the context ids on the callee edge being moved.
3477 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3478 OldCallerEdge->getContextIds(), Edge->getContextIds());
3479 if (OldCaller == OldCallerCaller) {
3480 OldCallerCaller = NewCaller;
3481 // Don't actually move this one. The caller will move it directly via a
3482 // call to this function with this as the Edge if it is appropriate to
3483 // move to a diff node that has a matching callee (itself).
3484 continue;
3485 }
3486 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3487 OldCallerEdge->AllocTypes =
3488 computeAllocType(OldCallerEdge->getContextIds());
3489 // In this function we expect that any pre-existing node already has edges
3490 // from the same callers as the old node. That should be true in the
3491 // current use case, where we will remove None-type edges after copying
3492 // over all caller edges from the callee.
3493 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3494 // Since we would have skipped caller edges when moving a direct recursive
3495 // edge, this may not hold true when recursive handling enabled.
3496 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3497 if (ExistingCallerEdge) {
3498 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3499 ExistingCallerEdge->AllocTypes |=
3500 computeAllocType(EdgeContextIdsToMove);
3501 continue;
3502 }
3503 auto NewEdge = std::make_shared<ContextEdge>(
3504 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3505 EdgeContextIdsToMove);
3506 NewCaller->CallerEdges.push_back(NewEdge);
3507 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3508 }
3509 }
3510 // Recompute the node alloc type now that its caller edges have been
3511 // updated (since we will compute from those edges).
3512 OldCaller->AllocTypes = OldCaller->computeAllocType();
3513 // OldCaller alloc type should be None iff its context id set is now empty.
3514 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3515 OldCaller->emptyContextIds());
3516 if (VerifyCCG) {
3517 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3518 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3519 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3520 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3521 /*CheckEdges=*/false);
3522 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3523 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3524 /*CheckEdges=*/false);
3525 }
3526}
3527
3528template <typename DerivedCCG, typename FuncTy, typename CallTy>
3529void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3530 recursivelyRemoveNoneTypeCalleeEdges(
3531 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3532 auto Inserted = Visited.insert(Node);
3533 if (!Inserted.second)
3534 return;
3535
3536 removeNoneTypeCalleeEdges(Node);
3537
3538 for (auto *Clone : Node->Clones)
3539 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3540
3541 // The recursive call may remove some of this Node's caller edges.
3542 // Iterate over a copy and skip any that were removed.
3543 auto CallerEdges = Node->CallerEdges;
3544 for (auto &Edge : CallerEdges) {
3545 // Skip any that have been removed by an earlier recursive call.
3546 if (Edge->isRemoved()) {
3547 assert(!is_contained(Node->CallerEdges, Edge));
3548 continue;
3549 }
3550 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3551 }
3552}
3553
3554// This is the standard DFS based backedge discovery algorithm.
3555template <typename DerivedCCG, typename FuncTy, typename CallTy>
3556void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3557 // If we are cloning recursive contexts, find and mark backedges from all root
3558 // callers, using the typical DFS based backedge analysis.
3559 if (!CloneRecursiveContexts)
3560 return;
3561 DenseSet<const ContextNode *> Visited;
3562 DenseSet<const ContextNode *> CurrentStack;
3563 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3564 auto *Node = Entry.second;
3565 if (Node->isRemoved())
3566 continue;
3567 // It is a root if it doesn't have callers.
3568 if (!Node->CallerEdges.empty())
3569 continue;
3570 markBackedges(Node, Visited, CurrentStack);
3571 assert(CurrentStack.empty());
3572 }
3573}
3574
3575// Recursive helper for above markBackedges method.
3576template <typename DerivedCCG, typename FuncTy, typename CallTy>
3577void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3578 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3579 DenseSet<const ContextNode *> &CurrentStack) {
3580 auto I = Visited.insert(Node);
3581 // We should only call this for unvisited nodes.
3582 assert(I.second);
3583 (void)I;
3584 for (auto &CalleeEdge : Node->CalleeEdges) {
3585 auto *Callee = CalleeEdge->Callee;
3586 if (Visited.count(Callee)) {
3587 // Since this was already visited we need to check if it is currently on
3588 // the recursive stack in which case it is a backedge.
3589 if (CurrentStack.count(Callee))
3590 CalleeEdge->IsBackedge = true;
3591 continue;
3592 }
3593 CurrentStack.insert(Callee);
3594 markBackedges(Callee, Visited, CurrentStack);
3595 CurrentStack.erase(Callee);
3596 }
3597}
3598
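// Driver for cloning: walk the caller graph upwards from each allocation
// node, cloning callsite nodes where doing so disambiguates the allocation
// types reaching them, then strip any callee edges left with no context ids.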
3599template <typename DerivedCCG, typename FuncTy, typename CallTy>
3600void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3601 DenseSet<const ContextNode *> Visited;
3602 for (auto &Entry : AllocationCallToContextNodeMap) {
3603 Visited.clear();
3604 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3605 }
3606 Visited.clear();
3607 for (auto &Entry : AllocationCallToContextNodeMap)
3608 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3609 if (VerifyCCG)
3610 check();
3611}
3612
3613// Helper function to check whether an AllocType is cold, notcold, or both.
3614bool checkColdOrNotCold(uint8_t AllocType) {
3615 return (AllocType == (uint8_t)AllocationType::Cold) ||
3616 (AllocType == (uint8_t)AllocationType::NotCold) ||
3617 (AllocType ==
3618 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3619}
3620
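// Per-node cloning step. After recursing to callers (so callers are handled
// first), caller edges are sorted so cold contexts are processed before not
// cold ones, and each caller edge whose contexts would disambiguate the
// node's allocation type is moved to an existing or new clone. Hypothetical
// example: if context 1 (cold) reaches Node via caller A and context 2
// (notcold) via caller B, A's edge is moved to a clone of Node whose
// allocation type is then unambiguously cold, while B stays on the original
// notcold Node.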
3621template <typename DerivedCCG, typename FuncTy, typename CallTy>
3622void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3623 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3624 const DenseSet<uint32_t> &AllocContextIds) {
3625 if (VerifyNodes)
3626 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3627 assert(!Node->CloneOf);
3628
3629 // If Node has a null call, then either it wasn't found in the module (regular
3630 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3631 // cloning (e.g. recursion, calls multiple targets, etc).
3632 // Do this here so that we don't try to recursively clone callers below, which
3633 // isn't useful at least for this node.
3634 if (!Node->hasCall())
3635 return;
3636
3637 // No need to look at any callers if allocation type already unambiguous.
3638 if (hasSingleAllocType(Node->AllocTypes))
3639 return;
3640
3641#ifndef NDEBUG
3642 auto Insert =
3643#endif
3644 Visited.insert(Node);
3645 // We should not have visited this node yet.
3646 assert(Insert.second);
3647 // The recursive call to identifyClones may delete the current edge from the
3648 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3649 // in an iterator and having recursive call erase from it. Other edges may
3650 // also get removed during the recursion, which will have null Callee and
3651 // Caller pointers (and are deleted later), so we skip those below.
3652 {
3653 auto CallerEdges = Node->CallerEdges;
3654 for (auto &Edge : CallerEdges) {
3655 // Skip any that have been removed by an earlier recursive call.
3656 if (Edge->isRemoved()) {
3657 assert(!is_contained(Node->CallerEdges, Edge));
3658 continue;
3659 }
3660 // Defer backedges. See comments further below where these edges are
3661 // handled during the cloning of this Node.
3662 if (Edge->IsBackedge) {
3663 // We should only mark these if cloning recursive contexts, where we
3664 // need to do this deferral.
3665 assert(CloneRecursiveContexts);
3666 continue;
3667 }
3668 // Ignore any caller we previously visited via another edge.
3669 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3670 identifyClones(Edge->Caller, Visited, AllocContextIds);
3671 }
3672 }
3673 }
3674
3675 // Check if we reached an unambiguous call or have only a single caller.
3676 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3677 return;
3678
3679 // We need to clone.
3680
3681 // Try to keep the original version as alloc type NotCold. This will make
3682 // cases with indirect calls or any other situation with an unknown call to
3683 // the original function get the default behavior. We do this by sorting the
3684 // CallerEdges of the Node we will clone by alloc type.
3685 //
3686 // Give NotCold edge the lowest sort priority so those edges are at the end of
3687 // the caller edges vector, and stay on the original version (since the below
3688 // code clones greedily until it finds all remaining edges have the same type
3689 // and leaves the remaining ones on the original Node).
3690 //
3691 // We shouldn't actually have any None type edges, so the sorting priority for
3692 // that is arbitrary, and we assert in that case below.
3693 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3694 /*Cold*/ 1,
3695 /*NotColdCold*/ 2};
3696 llvm::stable_sort(Node->CallerEdges,
3697 [&](const std::shared_ptr<ContextEdge> &A,
3698 const std::shared_ptr<ContextEdge> &B) {
3699 // Nodes with non-empty context ids should be sorted
3700 // before those with empty context ids.
3701 if (A->ContextIds.empty())
3702 // Either B ContextIds are non-empty (in which case we
3703 // should return false because B < A), or B ContextIds
3704 // are empty, in which case they are equal, and we
3705 // should maintain the original relative ordering.
3706 return false;
3707 if (B->ContextIds.empty())
3708 return true;
3709
3710 if (A->AllocTypes == B->AllocTypes)
3711 // Use the first context id for each edge as a
3712 // tie-breaker.
3713 return *A->ContextIds.begin() < *B->ContextIds.begin();
3714 return AllocTypeCloningPriority[A->AllocTypes] <
3715 AllocTypeCloningPriority[B->AllocTypes];
3716 });
3717
3718 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3719
3720 DenseSet<uint32_t> RecursiveContextIds;
3721 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
3722 // If we are allowing recursive callsites, but have also disabled recursive
3723 // contexts, look for context ids that show up in multiple caller edges.
3724 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3725 DenseSet<uint32_t> AllCallerContextIds;
3726 for (auto &CE : Node->CallerEdges) {
3727 // Resize to the largest set of caller context ids, since we know the
3728 // final set will be at least that large.
3729 AllCallerContextIds.reserve(CE->getContextIds().size());
3730 for (auto Id : CE->getContextIds())
3731 if (!AllCallerContextIds.insert(Id).second)
3732 RecursiveContextIds.insert(Id);
3733 }
3734 }
3735
3736 // Iterate until we find no more opportunities for disambiguating the alloc
3737 // types via cloning. In most cases this loop will terminate once the Node
3738 // has a single allocation type, in which case no more cloning is needed.
3739 // Iterate over a copy of Node's caller edges, since we may need to remove
3740 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3741 // makes it less error-prone.
3742 auto CallerEdges = Node->CallerEdges;
3743 for (auto &CallerEdge : CallerEdges) {
3744 // Skip any that have been removed by an earlier recursive call.
3745 if (CallerEdge->isRemoved()) {
3746 assert(!is_contained(Node->CallerEdges, CallerEdge));
3747 continue;
3748 }
3749 assert(CallerEdge->Callee == Node);
3750
3751 // See if cloning the prior caller edge left this node with a single alloc
3752 // type or a single caller. In that case no more cloning of Node is needed.
3753 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3754 break;
3755
3756 // If the caller was not successfully matched to a call in the IR/summary,
3757 // there is no point in trying to clone for it as we can't update that call.
3758 if (!CallerEdge->Caller->hasCall())
3759 continue;
3760
3761 // Only need to process the ids along this edge pertaining to the given
3762 // allocation.
3763 auto CallerEdgeContextsForAlloc =
3764 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3765 if (!RecursiveContextIds.empty())
3766 CallerEdgeContextsForAlloc =
3767 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3768 if (CallerEdgeContextsForAlloc.empty())
3769 continue;
3770
3771 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3772
3773 // Compute the node callee edge alloc types corresponding to the context ids
3774 // for this caller edge.
3775 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3776 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3777 for (auto &CalleeEdge : Node->CalleeEdges)
3778 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3779 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3780
3781 // Don't clone if doing so will not disambiguate any alloc types amongst
3782 // caller edges (including the callee edges that would be cloned).
3783 // Otherwise we will simply move all edges to the clone.
3784 //
3785 // First check if by cloning we will disambiguate the caller allocation
3786 // type from node's allocation type. Query allocTypeToUse so that we don't
3787 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3788 // neither of these should be None type.
3789 //
3790 // Then check if by cloning node at least one of the callee edges will be
3791 // disambiguated by splitting out different context ids.
3792 //
3793 // However, always do the cloning if this is a backedge, in which case we
3794 // have not yet cloned along this caller edge.
3795 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3796 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3797 if (!CallerEdge->IsBackedge &&
3798 allocTypeToUse(CallerAllocTypeForAlloc) ==
3799 allocTypeToUse(Node->AllocTypes) &&
3800 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3801 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3802 continue;
3803 }
3804
3805 if (CallerEdge->IsBackedge) {
3806 // We should only mark these if cloning recursive contexts, where we
3807 // need to do this deferral.
3808 assert(CloneRecursiveContexts);
3809 DeferredBackedges++;
3810 }
3811
3812 // If this is a backedge, we now do recursive cloning starting from its
3813 // caller since we may have moved unambiguous caller contexts to a clone
3814 // of this Node in a previous iteration of the current loop, giving more
3815 // opportunity for cloning through the backedge. Because we sorted the
3816 // caller edges earlier so that cold caller edges are first, we would have
3817 // visited and cloned this node for any unambiguously cold non-recursive
3818 // callers before any ambiguous backedge callers. Note that we don't do this
3819 // if the caller is already cloned or visited during cloning (e.g. via a
3820 // different context path from the allocation).
3821 // TODO: Can we do better in the case where the caller was already visited?
3822 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3823 !Visited.count(CallerEdge->Caller)) {
3824 const auto OrigIdCount = CallerEdge->getContextIds().size();
3825 // Now do the recursive cloning of this backedge's caller, which was
3826 // deferred earlier.
3827 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3828 removeNoneTypeCalleeEdges(CallerEdge->Caller);
3829 // See if the recursive call to identifyClones moved the context ids to a
3830 // new edge from this node to a clone of caller, and switch to looking at
3831 // that new edge so that we clone Node for the new caller clone.
3832 bool UpdatedEdge = false;
3833 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3834 for (auto E : Node->CallerEdges) {
3835 // Only interested in clones of the current edge's caller.
3836 if (E->Caller->CloneOf != CallerEdge->Caller)
3837 continue;
3838 // See if this edge contains any of the context ids originally on the
3839 // current caller edge.
3840 auto CallerEdgeContextsForAllocNew =
3841 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3842 if (CallerEdgeContextsForAllocNew.empty())
3843 continue;
3844 // Make sure we don't pick a previously existing caller edge of this
3845 // Node, which would be processed on a different iteration of the
3846 // outer loop over the saved CallerEdges.
3847 if (llvm::is_contained(CallerEdges, E))
3848 continue;
3849 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3850 // are updated further below for all cases where we just invoked
3851 // identifyClones recursively.
3852 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3853 CallerEdge = E;
3854 UpdatedEdge = true;
3855 break;
3856 }
3857 }
3858 // If cloning removed this edge (and we didn't update it to a new edge
3859 // above), we're done with this edge. It's possible we moved all of the
3860 // context ids to an existing clone, in which case there's no need to do
3861 // further processing for them.
3862 if (CallerEdge->isRemoved())
3863 continue;
3864
3865 // Now we need to update the information used for the cloning decisions
3866 // further below, as we may have modified edges and their context ids.
3867
3868 // Note if we changed the CallerEdge above we would have already updated
3869 // the context ids.
3870 if (!UpdatedEdge) {
3871 CallerEdgeContextsForAlloc = set_intersection(
3872 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3873 if (CallerEdgeContextsForAlloc.empty())
3874 continue;
3875 }
3876 // Update the other information that depends on the edges and on the now
3877 // updated CallerEdgeContextsForAlloc.
3878 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3879 CalleeEdgeAllocTypesForCallerEdge.clear();
3880 for (auto &CalleeEdge : Node->CalleeEdges) {
3881 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3882 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3883 }
3884 }
3885
3886 // First see if we can use an existing clone. Check each clone and its
3887 // callee edges for matching alloc types.
3888 ContextNode *Clone = nullptr;
3889 for (auto *CurClone : Node->Clones) {
3890 if (allocTypeToUse(CurClone->AllocTypes) !=
3891 allocTypeToUse(CallerAllocTypeForAlloc))
3892 continue;
3893
3894 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3895 hasSingleAllocType(CallerAllocTypeForAlloc);
3896 // The above check should mean that if both have single alloc types that
3897 // they should be equal.
3898 assert(!BothSingleAlloc ||
3899 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3900
3901 // If either both have a single alloc type (which are the same), or if the
3902 // clone's callee edges have the same alloc types as those for the current
3903 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3904 // then we can reuse this clone.
3905 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3906 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3907 Clone = CurClone;
3908 break;
3909 }
3910 }
3911
3912 // Move the CallerEdge (or the chosen subset of its context ids) to the clone.
3913 if (Clone)
3914 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3915 CallerEdgeContextsForAlloc);
3916 else
3917 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3918
3919 // Sanity check that no alloc types on clone or its edges are None.
3920 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3921 }
3922
3923 // We should still have some context ids on the original Node.
3924 assert(!Node->emptyContextIds());
3925
3926 // Sanity check that no alloc types on node or edges are None.
3927 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3928
3929 if (VerifyNodes)
3930 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3931}
3932
3933void ModuleCallsiteContextGraph::updateAllocationCall(
3934 CallInfo &Call, AllocationType AllocType) {
3935 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3936 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3937 "memprof", AllocTypeString);
3938 cast<CallBase>(Call.call())->addFnAttr(A);
3939 OREGetter(Call.call()->getFunction())
3940 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3941 << ore::NV("AllocationCall", Call.call()) << " in clone "
3942 << ore::NV("Caller", Call.call()->getFunction())
3943 << " marked with memprof allocation attribute "
3944 << ore::NV("Attribute", AllocTypeString));
3945}
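// Illustrative sketch (hypothetical IR, not taken from an actual test): after
// the update above, the allocation call site carries a "memprof" string
// attribute recording the hinted type, along these lines:
//   %call = call ptr @_Znwm(i64 8) #1
//   attributes #1 = { "memprof"="cold" }
// This "cold"/"notcold" string is what getAllocationCallType below reads back.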
3946
3947void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3948 AllocationType AllocType) {
3949 auto *AI = cast<AllocInfo *>(Call.call());
3950 assert(AI);
3951 assert(AI->Versions.size() > Call.cloneNo());
3952 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3953}
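// Illustrative sketch (hypothetical summary contents): if the enclosing
// function ends up with three clones, AI->Versions might read
// {NotCold, Cold, NotCold}, i.e. function clone 1 gets the cold variant of
// this allocation while clone 0 (the original) and clone 2 stay not cold.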
3954
3955AllocationType
3956ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3957 const auto *CB = cast<CallBase>(Call.call());
3958 if (!CB->getAttributes().hasFnAttr("memprof"))
3959 return AllocationType::None;
3960 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3961 ? AllocationType::Cold
3962 : AllocationType::NotCold;
3963}
3964
3965AllocationType
3966IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3967 const auto *AI = cast<AllocInfo *>(Call.call());
3968 assert(AI->Versions.size() > Call.cloneNo());
3969 return (AllocationType)AI->Versions[Call.cloneNo()];
3970}
3971
3972void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3973 FuncInfo CalleeFunc) {
3974 if (CalleeFunc.cloneNo() > 0)
3975 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3976 OREGetter(CallerCall.call()->getFunction())
3977 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3978 << ore::NV("Call", CallerCall.call()) << " in clone "
3979 << ore::NV("Caller", CallerCall.call()->getFunction())
3980 << " assigned to call function clone "
3981 << ore::NV("Callee", CalleeFunc.func()));
3982}
3983
3984void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3985 FuncInfo CalleeFunc) {
3986 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
3987 assert(CI &&
3988 "Caller cannot be an allocation which should not have profiled calls");
3989 assert(CI->Clones.size() > CallerCall.cloneNo());
3990 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3991}
3992
3993CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3994 Instruction *>::FuncInfo
3995ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3996 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3997 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3998 // Use existing LLVM facilities for cloning and obtaining Call in the clone.
3999 ValueToValueMapTy VMap;
4000 auto *NewFunc = CloneFunction(Func.func(), VMap);
4001 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4002 assert(!Func.func()->getParent()->getFunction(Name));
4003 NewFunc->setName(Name);
4004 if (auto *SP = NewFunc->getSubprogram())
4005 SP->replaceLinkageName(
4006 MDString::get(NewFunc->getParent()->getContext(), Name));
4007 for (auto &Inst : CallsWithMetadataInFunc) {
4008 // This map always has the initial version in it.
4009 assert(Inst.cloneNo() == 0);
4010 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4011 }
4012 OREGetter(Func.func())
4013 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4014 << "created clone " << ore::NV("NewFunction", NewFunc));
4015 return {NewFunc, CloneNo};
4016}
4017
4018CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4019 IndexCall>::FuncInfo
4020IndexCallsiteContextGraph::cloneFunctionForCallsite(
4021 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
4022 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4023 // Check how many clones we have of Call (and therefore function).
4024 // The next clone number is the current size of versions array.
4025 // Confirm this matches the CloneNo provided by the caller, which is based on
4026 // the number of function clones we have.
4027 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4028 ? cast<AllocInfo *>(Call.call())->Versions.size()
4029 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4030 // Walk all the instructions in this function. Create a new version for
4031 // each (by adding an entry to the Versions/Clones summary array), and copy
4032 // over the version being called for the function clone being cloned here.
4033 // Additionally, add an entry to the CallMap for the new function clone,
4034 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4035 // to the new call clone.
4036 for (auto &Inst : CallsWithMetadataInFunc) {
4037 // This map always has the initial version in it.
4038 assert(Inst.cloneNo() == 0);
4039 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4040 assert(AI->Versions.size() == CloneNo);
4041 // We assign the allocation type later (in updateAllocationCall); just add
4042 // an entry for it here.
4043 AI->Versions.push_back(0);
4044 } else {
4045 auto *CI = cast<CallsiteInfo *>(Inst.call());
4046 assert(CI && CI->Clones.size() == CloneNo);
4047 // We assign the clone number later (in updateCall); just add an entry for
4048 // it here.
4049 CI->Clones.push_back(0);
4050 }
4051 CallMap[Inst] = {Inst.call(), CloneNo};
4052 }
4053 return {Func.func(), CloneNo};
4054}
4055
4056// We perform cloning for each allocation node separately. However, this
4057// sometimes results in a situation where the same node calls multiple
4058// clones of the same callee, created for different allocations. This
4059// causes issues when assigning functions to these clones, as each node can
4060// in reality only call a single callee clone.
4061//
4062// To address this, before assigning functions, merge callee clone nodes as
4063// needed using a post order traversal from the allocations. We attempt to
4064// use existing clones as the merge node when legal, and to share them
4065// among callers with the same properties (callers calling the same set of
4066// callee clone nodes for the same allocations).
4067//
4068// Without this fix, in some cases incorrect function assignment will lead
4069// to calling the wrong allocation clone.
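// Illustrative example (hypothetical nodes): suppose callsite node C ends up
// with callee edges to B' and B'', two clones of the same callee callsite B
// created while cloning for two different allocations:
//   C ---> B'  ---> alloc A1
//   C ---> B'' ---> alloc A2
// The single real call in C's function can only target one copy of B, so B'
// and B'' are merged below (reusing one of them when legal, otherwise via a
// new merge node) so that C again calls a single callee clone.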
4070template <typename DerivedCCG, typename FuncTy, typename CallTy>
4071void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4072 if (!MergeClones)
4073 return;
4074
4075 // Generate a map from context id to the associated allocation node for use
4076 // when merging clones.
4077 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4078 for (auto &Entry : AllocationCallToContextNodeMap) {
4079 auto *Node = Entry.second;
4080 for (auto Id : Node->getContextIds())
4081 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4082 for (auto *Clone : Node->Clones) {
4083 for (auto Id : Clone->getContextIds())
4084 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4085 }
4086 }
4087
4088 // Post order traversal starting from allocations to ensure each callsite
4089 // calls a single clone of its callee. Callee nodes that are clones of each
4090 // other are merged (via new merge nodes if needed) to achieve this.
4091 DenseSet<const ContextNode *> Visited;
4092 for (auto &Entry : AllocationCallToContextNodeMap) {
4093 auto *Node = Entry.second;
4094
4095 mergeClones(Node, Visited, ContextIdToAllocationNode);
4096
4097 // Make a copy so the recursive post order traversal that may create new
4098 // clones doesn't mess up iteration. Note that the recursive traversal
4099 // itself does not call mergeClones on any of these nodes, which are all
4100 // (clones of) allocations.
4101 auto Clones = Node->Clones;
4102 for (auto *Clone : Clones)
4103 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4104 }
4105
4106 if (DumpCCG) {
4107 dbgs() << "CCG after merging:\n";
4108 dbgs() << *this;
4109 }
4110 if (ExportToDot)
4111 exportToDot("aftermerge");
4112
4113 if (VerifyCCG) {
4114 check();
4115 }
4116}
4117
4118// Recursive helper for above mergeClones method.
4119template <typename DerivedCCG, typename FuncTy, typename CallTy>
4120void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4121 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4122 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4123 auto Inserted = Visited.insert(Node);
4124 if (!Inserted.second)
4125 return;
4126
4127 // Make a copy since the recursive call may move a caller edge to a new
4128 // callee, messing up the iterator.
4129 auto CallerEdges = Node->CallerEdges;
4130 for (auto CallerEdge : CallerEdges) {
4131 // Skip any caller edge moved onto a different callee during recursion.
4132 if (CallerEdge->Callee != Node)
4133 continue;
4134 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4135 }
4136
4137 // Merge for this node after we handle its callers.
4138 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4139}
4140
4141template <typename DerivedCCG, typename FuncTy, typename CallTy>
4142void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4143 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4144 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4145 // Ignore Node if we moved all of its contexts to clones.
4146 if (Node->emptyContextIds())
4147 return;
4148
4149 // First identify groups of clones among Node's callee edges, by building
4150 // a map from each callee base node to the associated callee edges from Node.
4151 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4152 OrigNodeToCloneEdges;
4153 for (const auto &E : Node->CalleeEdges) {
4154 auto *Callee = E->Callee;
4155 if (!Callee->CloneOf && Callee->Clones.empty())
4156 continue;
4157 ContextNode *Base = Callee->getOrigNode();
4158 OrigNodeToCloneEdges[Base].push_back(E);
4159 }
4160
4161 // Helper for callee edge sorting below. Return true if A's callee has fewer
4162 // caller edges than B, or if A is a clone and B is not, or if A's first
4163 // context id is smaller than B's.
4164 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4165 const std::shared_ptr<ContextEdge> &B) {
4166 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4167 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4168 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4169 return true;
4170 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4171 return false;
4172 // Use the first context id for each edge as a
4173 // tie-breaker.
4174 return *A->ContextIds.begin() < *B->ContextIds.begin();
4175 };
4176
4177 // Process each set of callee clones called by Node, performing the needed
4178 // merging.
4179 for (auto Entry : OrigNodeToCloneEdges) {
4180 // CalleeEdges is the set of edges from Node reaching callees that are
4181 // mutual clones of each other.
4182 auto &CalleeEdges = Entry.second;
4183 auto NumCalleeClones = CalleeEdges.size();
4184 // A single edge means there is no merging needed.
4185 if (NumCalleeClones == 1)
4186 continue;
4187 // Sort the CalleeEdges calling this group of clones in ascending order of
4188 // their caller edge counts, putting the original non-clone node first in
4189 // cases of a tie. This simplifies finding an existing node to use as the
4190 // merge node.
4191 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4192
4193 /// Find other callers of the given set of callee edges that can
4194 /// share the same callee merge node. See the comments at this method
4195 /// definition for details.
4196 DenseSet<ContextNode *> OtherCallersToShareMerge;
4197 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4198 OtherCallersToShareMerge);
4199
4200 // Now do the actual merging. Identify existing or create a new MergeNode
4201 // during the first iteration. Move each callee over, along with edges from
4202 // other callers we've determined above can share the same merge node.
4203 ContextNode *MergeNode = nullptr;
4204 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4205 for (auto CalleeEdge : CalleeEdges) {
4206 auto *OrigCallee = CalleeEdge->Callee;
4207 // If we don't have a MergeNode yet (only happens on the first iteration,
4208 // as a new one will be created when we go to move the first callee edge
4209 // over as needed), see if we can use this callee.
4210 if (!MergeNode) {
4211 // If there are no other callers, simply use this callee.
4212 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4213 MergeNode = OrigCallee;
4214 NonNewMergedNodes++;
4215 continue;
4216 }
4217 // Otherwise, if we have identified other caller nodes that can share
4218 // the merge node with Node, see if all of OrigCallee's callers are
4219 // going to share the same merge node. In that case we can use callee
4220 // (since all of its callers would move to the new merge node).
4221 if (!OtherCallersToShareMerge.empty()) {
4222 bool MoveAllCallerEdges = true;
4223 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4224 if (CalleeCallerE == CalleeEdge)
4225 continue;
4226 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4227 MoveAllCallerEdges = false;
4228 break;
4229 }
4230 }
4231 // If we are going to move all callers over, we can use this callee as
4232 // the MergeNode.
4233 if (MoveAllCallerEdges) {
4234 MergeNode = OrigCallee;
4235 NonNewMergedNodes++;
4236 continue;
4237 }
4238 }
4239 }
4240 // Move this callee edge, creating a new merge node if necessary.
4241 if (MergeNode) {
4242 assert(MergeNode != OrigCallee);
4243 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4244 /*NewClone*/ false);
4245 } else {
4246 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4247 NewMergedNodes++;
4248 }
4249 // Now move all identified edges from other callers over to the merge node
4250 // as well.
4251 if (!OtherCallersToShareMerge.empty()) {
4252 // Make and iterate over a copy of OrigCallee's caller edges because
4253 // some of these will be moved off of the OrigCallee and that would mess
4254 // up the iteration from OrigCallee.
4255 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4256 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4257 if (CalleeCallerE == CalleeEdge)
4258 continue;
4259 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4260 continue;
4261 CallerToMoveCount[CalleeCallerE->Caller]++;
4262 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4263 /*NewClone*/ false);
4264 }
4265 }
4266 removeNoneTypeCalleeEdges(OrigCallee);
4267 removeNoneTypeCalleeEdges(MergeNode);
4268 }
4269 }
4270}
4271
4272// Look for other nodes that have edges to the same set of callee
4273// clones as the current Node. Those can share the eventual merge node
4274// (reducing cloning and binary size overhead) iff:
4275// - they have edges to the same set of callee clones
4276// - each callee edge reaches a subset of the same allocations as Node's
4277// corresponding edge to the same callee clone.
4278// The second requirement is to ensure that we don't undo any of the
4279// necessary cloning to distinguish contexts with different allocation
4280// behavior.
4281// FIXME: This is somewhat conservative, as we really just need to ensure
4282// that they don't reach the same allocations as contexts on edges from Node
4283// going to any of the *other* callee clones being merged. However, that
4284// requires more tracking and checking to get right.
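// Illustrative example (hypothetical nodes): if Node has callee edges to
// clones X' and X'', and another caller D also has edges to exactly X' and
// X'', then D may share Node's merge node provided the contexts on D's edge
// to X' reach a subset of the allocations reached by Node's edge to X', and
// likewise for X''. If some context on one of D's edges reaches a different
// allocation, D is not added to OtherCallersToShareMerge.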
4285template <typename DerivedCCG, typename FuncTy, typename CallTy>
4286void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4287 findOtherCallersToShareMerge(
4288 ContextNode *Node,
4289 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4290 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4291 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4292 auto NumCalleeClones = CalleeEdges.size();
4293 // This map counts how many edges to the same callee clone exist for other
4294 // caller nodes of each callee clone.
4295 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4296 // Counts the number of other caller nodes that have edges to all callee
4297 // clones that don't violate the allocation context checking.
4298 unsigned PossibleOtherCallerNodes = 0;
4299
4300 // We only need to look at other Caller nodes if the first callee edge has
4301 // multiple callers (recall they are sorted in ascending order above).
4302 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4303 return;
4304
4305 // For each callee edge:
4306 // - Collect the count of other caller nodes calling the same callees.
4307 // - Collect the alloc nodes reached by contexts on each callee edge.
4308 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4309 for (auto CalleeEdge : CalleeEdges) {
4310 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4311 // For each other caller of the same callee, increment the count of
4312 // edges reaching the same callee clone.
4313 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4314 if (CalleeCallerEdges->Caller == Node) {
4315 assert(CalleeCallerEdges == CalleeEdge);
4316 continue;
4317 }
4318 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4319 // If this caller edge now reaches all of the same callee clones,
4320 // increment the count of candidate other caller nodes.
4321 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4322 NumCalleeClones)
4323 PossibleOtherCallerNodes++;
4324 }
4325 // Collect the alloc nodes reached by contexts on each callee edge, for
4326 // later analysis.
4327 for (auto Id : CalleeEdge->getContextIds()) {
4328 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4329 if (!Alloc) {
4330 // FIXME: unclear why this happens occasionally, presumably
4331 // imperfect graph updates possibly with recursion.
4332 MissingAllocForContextId++;
4333 continue;
4334 }
4335 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4336 }
4337 }
4338
4339 // Now walk the callee edges again, and make sure that for each candidate
4340 // caller node all of its edges to the callees reach the same allocs (or
4341 // a subset) as those along the corresponding callee edge from Node.
4342 for (auto CalleeEdge : CalleeEdges) {
4343 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4344 // Stop if we do not have any (more) candidate other caller nodes.
4345 if (!PossibleOtherCallerNodes)
4346 break;
4347 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4348 // Check each other caller of this callee clone.
4349 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4350 // Not interested in the callee edge from Node itself.
4351 if (CalleeCallerE == CalleeEdge)
4352 continue;
4353 // Skip any callers that didn't have callee edges to all the same
4354 // callee clones.
4355 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4356 NumCalleeClones)
4357 continue;
4358 // Make sure that each context along edge from candidate caller node
4359 // reaches an allocation also reached by this callee edge from Node.
4360 for (auto Id : CalleeCallerE->getContextIds()) {
4361 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4362 if (!Alloc)
4363 continue;
4364 // If not, simply reset the map entry to 0 so caller is ignored, and
4365 // reduce the count of candidate other caller nodes.
4366 if (!CurCalleeAllocNodes.contains(Alloc)) {
4367 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4368 PossibleOtherCallerNodes--;
4369 break;
4370 }
4371 }
4372 }
4373 }
4374
4375 if (!PossibleOtherCallerNodes)
4376 return;
4377
4378 // Build the set of other caller nodes that can use the same callee merge
4379 // node.
4380 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4381 if (Count != NumCalleeClones)
4382 continue;
4383 OtherCallersToShareMerge.insert(OtherCaller);
4384 }
4385}
4386
4387// This method assigns cloned callsites to functions, cloning the functions as
4388// needed. The assignment is greedy and proceeds roughly as follows:
4389//
4390// For each function Func:
4391// For each call with graph Node having clones:
4392// Initialize ClonesWorklist to Node and its clones
4393// Initialize NodeCloneCount to 0
4394// While ClonesWorklist is not empty:
4395// Clone = pop front ClonesWorklist
4396// NodeCloneCount++
4397// If Func has been cloned less than NodeCloneCount times:
4398// If NodeCloneCount is 1:
4399// Assign Clone to original Func
4400// Continue
4401// Create a new function clone
4402// If other callers not assigned to call a function clone yet:
4403// Assign them to call new function clone
4404// Continue
4405// Assign any other caller calling the cloned version to new clone
4406//
4407// For each caller of Clone:
4408// If caller is assigned to call a specific function clone:
4409// If we cannot assign Clone to that function clone:
4410// Create new callsite Clone NewClone
4411// Add NewClone to ClonesWorklist
4412// Continue
4413// Assign Clone to existing caller's called function clone
4414// Else:
4415// If Clone not already assigned to a function clone:
4416// Assign to first function clone without assignment
4417// Assign caller to selected function clone
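// Illustrative example (hypothetical names): if function Foo contains a
// callsite node N that was cloned into N' and N'', then typically N stays
// assigned to the original Foo while N' and N'' trigger creation of function
// clones of Foo (named via getMemProfFuncName, e.g. something along the lines
// of Foo.memprof.1), unless a caller of N' or N'' was already assigned to call
// an existing clone of Foo, in which case that clone is reused or the callsite
// is cloned again to resolve the conflict.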
4418template <typename DerivedCCG, typename FuncTy, typename CallTy>
4419bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4420 bool Changed = false;
4421
4422 mergeClones();
4423
4424 // Keep track of the assignment of nodes (callsites) to function clones they
4425 // call.
4426 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4427
4428 // Update caller node to call function version CalleeFunc, by recording the
4429 // assignment in CallsiteToCalleeFuncCloneMap.
4430 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4431 const FuncInfo &CalleeFunc) {
4432 assert(Caller->hasCall());
4433 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4434 };
4435
4436 // Walk all functions for which we saw calls with memprof metadata, and handle
4437 // cloning for each of their calls.
4438 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4439 FuncInfo OrigFunc(Func);
4440 // Map from each clone of OrigFunc to a map of remappings of each call of
4441 // interest (from original uncloned call to the corresponding cloned call in
4442 // that function clone).
4443 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
4444 for (auto &Call : CallsWithMetadata) {
4445 ContextNode *Node = getNodeForInst(Call);
4446 // Skip call if we do not have a node for it (all uses of its stack ids
4447 // were either on inlined chains or pruned from the MIBs), or if we did
4448 // not create any clones for it.
4449 if (!Node || Node->Clones.empty())
4450 continue;
4451 assert(Node->hasCall() &&
4452 "Not having a call should have prevented cloning");
4453
4454 // Track the assignment of function clones to clones of the current
4455 // callsite Node being handled.
4456 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4457
4458 // Assign callsite version CallsiteClone to function version FuncClone,
4459 // and also assign (possibly cloned) Call to CallsiteClone.
4460 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4461 CallInfo &Call,
4462 ContextNode *CallsiteClone,
4463 bool IsAlloc) {
4464 // Record the clone of callsite node assigned to this function clone.
4465 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4466
4467 assert(FuncClonesToCallMap.count(FuncClone));
4468 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
4469 CallInfo CallClone(Call);
4470 if (auto It = CallMap.find(Call); It != CallMap.end())
4471 CallClone = It->second;
4472 CallsiteClone->setCall(CallClone);
4473 // Need to do the same for all matching calls.
4474 for (auto &MatchingCall : Node->MatchingCalls) {
4475 CallInfo CallClone(MatchingCall);
4476 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4477 CallClone = It->second;
4478 // Updates the call in the list.
4479 MatchingCall = CallClone;
4480 }
4481 };
4482
4483 // Keep track of the clones of callsite Node that need to be assigned to
4484 // function clones. This list may be expanded in the loop body below if we
4485 // find additional cloning is required.
4486 std::deque<ContextNode *> ClonesWorklist;
4487 // Ignore original Node if we moved all of its contexts to clones.
4488 if (!Node->emptyContextIds())
4489 ClonesWorklist.push_back(Node);
4490 llvm::append_range(ClonesWorklist, Node->Clones);
4491
4492 // Now walk through all of the clones of this callsite Node that we need,
4493 // and determine the assignment to a corresponding clone of the current
4494 // function (creating new function clones as needed).
4495 unsigned NodeCloneCount = 0;
4496 while (!ClonesWorklist.empty()) {
4497 ContextNode *Clone = ClonesWorklist.front();
4498 ClonesWorklist.pop_front();
4499 NodeCloneCount++;
4500 if (VerifyNodes)
4501 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4502
4503 // Need to create a new function clone if we have more callsite clones
4504 // than existing function clones, which would have been assigned to an
4505 // earlier clone in the list (we assign callsite clones to function
4506 // clones greedily).
4507 if (FuncClonesToCallMap.size() < NodeCloneCount) {
4508 // If this is the first callsite copy, assign to original function.
4509 if (NodeCloneCount == 1) {
4510 // Since FuncClonesToCallMap is empty in this case, no clones have
4511 // been created for this function yet, and no callers should have
4512 // been assigned a function clone for this callee node yet.
4513 assert(llvm::none_of(
4514 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4515 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4516 }));
4517 // Initialize with empty call map, assign Clone to original function
4518 // and its callers, and skip to the next clone.
4519 FuncClonesToCallMap[OrigFunc] = {};
4520 AssignCallsiteCloneToFuncClone(
4521 OrigFunc, Call, Clone,
4522 AllocationCallToContextNodeMap.count(Call));
4523 for (auto &CE : Clone->CallerEdges) {
4524 // Ignore any caller that does not have a recorded callsite Call.
4525 if (!CE->Caller->hasCall())
4526 continue;
4527 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4528 }
4529 continue;
4530 }
4531
4532 // First locate which copy of OrigFunc to clone again. If a caller
4533 // of this callsite clone was already assigned to call a particular
4534 // function clone, we need to redirect all of those callers to the
4535 // new function clone, and update their other callees within this
4536 // function.
4537 FuncInfo PreviousAssignedFuncClone;
4538 auto EI = llvm::find_if(
4539 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4540 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4541 });
4542 bool CallerAssignedToCloneOfFunc = false;
4543 if (EI != Clone->CallerEdges.end()) {
4544 const std::shared_ptr<ContextEdge> &Edge = *EI;
4545 PreviousAssignedFuncClone =
4546 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4547 CallerAssignedToCloneOfFunc = true;
4548 }
4549
4550 // Clone function and save it along with the CallInfo map created
4551 // during cloning in the FuncClonesToCallMap.
4552 std::map<CallInfo, CallInfo> NewCallMap;
4553 unsigned CloneNo = FuncClonesToCallMap.size();
4554 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4555 "should already exist in the map");
4556 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4557 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4558 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
4559 FunctionClonesAnalysis++;
4560 Changed = true;
4561
4562 // If no caller callsites were already assigned to a clone of this
4563 // function, we can simply assign this clone to the new func clone
4564 // and update all callers to it, then skip to the next clone.
4565 if (!CallerAssignedToCloneOfFunc) {
4566 AssignCallsiteCloneToFuncClone(
4567 NewFuncClone, Call, Clone,
4568 AllocationCallToContextNodeMap.count(Call));
4569 for (auto &CE : Clone->CallerEdges) {
4570 // Ignore any caller that does not have a recorded callsite Call.
4571 if (!CE->Caller->hasCall())
4572 continue;
4573 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4574 }
4575 continue;
4576 }
4577
4578 // We may need to do additional node cloning in this case.
4579 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4580 // that were previously assigned to call PreviousAssignedFuncClone,
4581 // to record that they now call NewFuncClone.
4582 // The none type edge removal may remove some of this Clone's caller
4583 // edges, if it is reached via another of its caller's callees.
4584 // Iterate over a copy and skip any that were removed.
4585 auto CallerEdges = Clone->CallerEdges;
4586 for (auto CE : CallerEdges) {
4587 // Skip any that have been removed on an earlier iteration.
4588 if (CE->isRemoved()) {
4589 assert(!is_contained(Clone->CallerEdges, CE));
4590 continue;
4591 }
4592 assert(CE);
4593 // Ignore any caller that does not have a recorded callsite Call.
4594 if (!CE->Caller->hasCall())
4595 continue;
4596
4597 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4598 // We subsequently fall through to later handling that
4599 // will perform any additional cloning required for
4600 // callers that were calling other function clones.
4601 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4602 PreviousAssignedFuncClone)
4603 continue;
4604
4605 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4606
4607 // If we are cloning a function that was already assigned to some
4608 // callers, then essentially we are creating new callsite clones
4609 // of the other callsites in that function that are reached by those
4610 // callers. Clone the other callees of the current callsite's caller
4611 // that were already assigned to PreviousAssignedFuncClone
4612 // accordingly. This is important since we subsequently update the
4613 // calls from the nodes in the graph and their assignments to callee
4614 // functions recorded in CallsiteToCalleeFuncCloneMap.
4615 // The none type edge removal may remove some of this caller's
4616 // callee edges, if it is reached via another of its callees.
4617 // Iterate over a copy and skip any that were removed.
4618 auto CalleeEdges = CE->Caller->CalleeEdges;
4619 for (auto CalleeEdge : CalleeEdges) {
4620 // Skip any that have been removed on an earlier iteration when
4621 // cleaning up newly None type callee edges.
4622 if (CalleeEdge->isRemoved()) {
4623 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4624 continue;
4625 }
4626 assert(CalleeEdge);
4627 ContextNode *Callee = CalleeEdge->Callee;
4628 // Skip the current callsite; we are looking for other
4629 // callsites Caller calls, as well as any that do not have a
4630 // recorded callsite Call.
4631 if (Callee == Clone || !Callee->hasCall())
4632 continue;
4633 // Skip direct recursive calls. We don't need/want to clone the
4634 // caller node again, and this loop will not behave as expected if
4635 // we tried.
4636 if (Callee == CalleeEdge->Caller)
4637 continue;
4638 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
4639 removeNoneTypeCalleeEdges(NewClone);
4640 // Moving the edge may have resulted in some none type
4641 // callee edges on the original Callee.
4642 removeNoneTypeCalleeEdges(Callee);
4643 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4644 // If the Callee node was already assigned to call a specific
4645 // function version, make sure its new clone is assigned to call
4646 // that same function clone.
4647 if (CallsiteToCalleeFuncCloneMap.count(Callee))
4648 RecordCalleeFuncOfCallsite(
4649 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
4650 // Update NewClone with the new Call clone of this callsite's Call
4651 // created for the new function clone created earlier.
4652 // Recall that we have already ensured when building the graph
4653 // that each caller can only call callsites within the same
4654 // function, so we are guaranteed that Callee Call is in the
4655 // current OrigFunc.
4656 // CallMap is set up as indexed by original Call at clone 0.
4657 CallInfo OrigCall(Callee->getOrigNode()->Call);
4658 OrigCall.setCloneNo(0);
4659 std::map<CallInfo, CallInfo> &CallMap =
4660 FuncClonesToCallMap[NewFuncClone];
4661 assert(CallMap.count(OrigCall));
4662 CallInfo NewCall(CallMap[OrigCall]);
4663 assert(NewCall);
4664 NewClone->setCall(NewCall);
4665 // Need to do the same for all matching calls.
4666 for (auto &MatchingCall : NewClone->MatchingCalls) {
4667 CallInfo OrigMatchingCall(MatchingCall);
4668 OrigMatchingCall.setCloneNo(0);
4669 assert(CallMap.count(OrigMatchingCall));
4670 CallInfo NewCall(CallMap[OrigMatchingCall]);
4671 assert(NewCall);
4672 // Updates the call in the list.
4673 MatchingCall = NewCall;
4674 }
4675 }
4676 }
4677 // Fall through to handling below to perform the recording of the
4678 // function for this callsite clone. This enables handling of cases
4679 // where the callers were assigned to different clones of a function.
4680 }
4681
4682 // See if we can use existing function clone. Walk through
4683 // all caller edges to see if any have already been assigned to
4684 // a clone of this callsite's function. If we can use it, do so. If not,
4685 // because that function clone is already assigned to a different clone
4686 // of this callsite, then we need to clone again.
4687 // Basically, this checking is needed to handle the case where different
4688 // caller functions/callsites may need versions of this function
4689 // containing different mixes of callsite clones across the different
4690 // callsites within the function. If that happens, we need to create
4691 // additional function clones to handle the various combinations.
4692 //
4693 // Keep track of any new clones of this callsite created by the
4694 // following loop, as well as any existing clone that we decided to
4695 // assign this clone to.
4696 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4697 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4698 // Iterate over a copy of Clone's caller edges, since we may need to
4699 // remove edges in the moveEdgeTo* methods, and this simplifies the
4700 // handling and makes it less error-prone.
4701 auto CloneCallerEdges = Clone->CallerEdges;
4702 for (auto &Edge : CloneCallerEdges) {
4703 // Skip removed edges (due to direct recursive edges updated when
4704 // updating callee edges when moving an edge and subsequently
4705 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4706 if (Edge->isRemoved())
4707 continue;
4708 // Ignore any caller that does not have a recorded callsite Call.
4709 if (!Edge->Caller->hasCall())
4710 continue;
4711 // If this caller was already assigned to call a version of OrigFunc, we need
4712 // to ensure we can assign this callsite clone to that function clone.
4713 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4714 FuncInfo FuncCloneCalledByCaller =
4715 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4716 // First we need to confirm that this function clone is available
4717 // for use by this callsite node clone.
4718 //
4719 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4720 // its callsite clones, one of those callsite clones X could have
4721 // been assigned to the same function clone called by Edge's caller
4722 // - if Edge's caller calls another callsite within Node's original
4723 // function, and that callsite has another caller reaching clone X.
4724 // We need to clone Node again in this case.
4725 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4726 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4727 Clone) ||
4728 // Detect when we have multiple callers of this callsite that
4729 // have already been assigned to specific, and different, clones
4730 // of OrigFunc (due to other unrelated callsites in Func they
4731 // reach via call contexts). Is this Clone of callsite Node
4732 // assigned to a different clone of OrigFunc? If so, clone Node
4733 // again.
4734 (FuncCloneAssignedToCurCallsiteClone &&
4735 FuncCloneAssignedToCurCallsiteClone !=
4736 FuncCloneCalledByCaller)) {
4737 // We need to use a different newly created callsite clone, in
4738 // order to assign it to another new function clone on a
4739 // subsequent iteration over the Clones array (adjusted below).
4740 // Note we specifically do not reset the
4741 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4742 // when this new clone is processed later we know which version of
4743 // the function to copy (so that other callsite clones we have
4744 // assigned to that function clone are properly cloned over). See
4745 // comments in the function cloning handling earlier.
4746
4747 // Check if we already have cloned this callsite again while
4748 // walking through caller edges, for a caller calling the same
4749 // function clone. If so, we can move this edge to that new clone
4750 // rather than creating yet another new clone.
4751 if (FuncCloneToNewCallsiteCloneMap.count(
4752 FuncCloneCalledByCaller)) {
4753 ContextNode *NewClone =
4754 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4755 moveEdgeToExistingCalleeClone(Edge, NewClone);
4756 // Cleanup any none type edges cloned over.
4757 removeNoneTypeCalleeEdges(NewClone);
4758 } else {
4759 // Create a new callsite clone.
4760 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4761 removeNoneTypeCalleeEdges(NewClone);
4762 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4763 NewClone;
4764 // Add to list of clones and process later.
4765 ClonesWorklist.push_back(NewClone);
4766 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4767 }
4768 // Moving the caller edge may have resulted in some none type
4769 // callee edges.
4770 removeNoneTypeCalleeEdges(Clone);
4771 // We will handle the newly created callsite clone in a subsequent
4772 // iteration over this Node's Clones.
4773 continue;
4774 }
4775
4776 // Otherwise, we can use the function clone already assigned to this
4777 // caller.
4778 if (!FuncCloneAssignedToCurCallsiteClone) {
4779 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4780 // Assign Clone to FuncCloneCalledByCaller
4781 AssignCallsiteCloneToFuncClone(
4782 FuncCloneCalledByCaller, Call, Clone,
4783 AllocationCallToContextNodeMap.count(Call));
4784 } else
4785 // Don't need to do anything - callsite is already calling this
4786 // function clone.
4787 assert(FuncCloneAssignedToCurCallsiteClone ==
4788 FuncCloneCalledByCaller);
4789
4790 } else {
4791 // We have not already assigned this caller to a version of
4792 // OrigFunc. Do the assignment now.
4793
4794 // First check if we have already assigned this callsite clone to a
4795 // clone of OrigFunc for another caller during this iteration over
4796 // its caller edges.
4797 if (!FuncCloneAssignedToCurCallsiteClone) {
4798 // Find first function in FuncClonesToCallMap without an assigned
4799 // clone of this callsite Node. We should always have one
4800 // available at this point due to the earlier cloning when the
4801 // FuncClonesToCallMap size was smaller than the clone number.
4802 for (auto &CF : FuncClonesToCallMap) {
4803 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4804 FuncCloneAssignedToCurCallsiteClone = CF.first;
4805 break;
4806 }
4807 }
4808 assert(FuncCloneAssignedToCurCallsiteClone);
4809 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4810 AssignCallsiteCloneToFuncClone(
4811 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4812 AllocationCallToContextNodeMap.count(Call));
4813 } else
4814 assert(FuncCloneToCurNodeCloneMap
4815 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4816 // Update callers to record function version called.
4817 RecordCalleeFuncOfCallsite(Edge->Caller,
4818 FuncCloneAssignedToCurCallsiteClone);
4819 }
4820 }
4821 }
4822 if (VerifyCCG) {
4823 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4824 for (const auto &PE : Node->CalleeEdges)
4825 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4826 for (const auto &CE : Node->CallerEdges)
4827 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4828 for (auto *Clone : Node->Clones) {
4829 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4830 for (const auto &PE : Clone->CalleeEdges)
4831 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4832 for (const auto &CE : Clone->CallerEdges)
4833 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4834 }
4835 }
4836 }
4837 }
4838
4839 uint8_t BothTypes =
4840 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4841
4842 auto UpdateCalls = [&](ContextNode *Node,
4843 DenseSet<const ContextNode *> &Visited,
4844 auto &&UpdateCalls) {
4845 auto Inserted = Visited.insert(Node);
4846 if (!Inserted.second)
4847 return;
4848
4849 for (auto *Clone : Node->Clones)
4850 UpdateCalls(Clone, Visited, UpdateCalls);
4851
4852 for (auto &Edge : Node->CallerEdges)
4853 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4854
4855 // Skip if either no call to update, or if we ended up with no context ids
4856 // (we moved all edges onto other clones).
4857 if (!Node->hasCall() || Node->emptyContextIds())
4858 return;
4859
4860 if (Node->IsAllocation) {
4861 auto AT = allocTypeToUse(Node->AllocTypes);
4862 // If the allocation type is ambiguous, and more aggressive hinting
4863 // has been enabled via the MinClonedColdBytePercent flag, see if this
4864 // allocation should be hinted cold anyway because the fraction of its bytes
4865 // allocated cold is at least the given threshold.
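// For example, with MinClonedColdBytePercent at 80, an ambiguous allocation
// whose contexts cover 900 cold bytes out of 1000 total bytes (90% cold)
// gets hinted cold here, while one with only 700 cold bytes (70%) keeps the
// default type returned by allocTypeToUse.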
4866 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4867 !ContextIdToContextSizeInfos.empty()) {
4868 uint64_t TotalCold = 0;
4869 uint64_t Total = 0;
4870 for (auto Id : Node->getContextIds()) {
4871 auto TypeI = ContextIdToAllocationType.find(Id);
4872 assert(TypeI != ContextIdToAllocationType.end());
4873 auto CSI = ContextIdToContextSizeInfos.find(Id);
4874 if (CSI != ContextIdToContextSizeInfos.end()) {
4875 for (auto &Info : CSI->second) {
4876 Total += Info.TotalSize;
4877 if (TypeI->second == AllocationType::Cold)
4878 TotalCold += Info.TotalSize;
4879 }
4880 }
4881 }
4882 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4883 AT = AllocationType::Cold;
4884 }
4885 updateAllocationCall(Node->Call, AT);
4886 assert(Node->MatchingCalls.empty());
4887 return;
4888 }
4889
4890 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4891 return;
4892
4893 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4894 updateCall(Node->Call, CalleeFunc);
4895 // Update all the matching calls as well.
4896 for (auto &Call : Node->MatchingCalls)
4897 updateCall(Call, CalleeFunc);
4898 };
4899
4900 // Performs DFS traversal starting from allocation nodes to update calls to
4901 // reflect cloning decisions recorded earlier. For regular LTO this will
4902 // update the actual calls in the IR to call the appropriate function clone
4903 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4904 // are recorded in the summary entries.
4905 DenseSet<const ContextNode *> Visited;
4906 for (auto &Entry : AllocationCallToContextNodeMap)
4907 UpdateCalls(Entry.second, Visited, UpdateCalls);
4908
4909 return Changed;
4910}
4911
4912static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4913 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4914 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4915 &FuncToAliasMap) {
4916 // The first "clone" is the original copy; we should only call this if we
4917 // needed to create new clones.
4918 assert(NumClones > 1);
4919 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4920 VMaps.reserve(NumClones - 1);
4921 FunctionsClonedThinBackend++;
4922 for (unsigned I = 1; I < NumClones; I++) {
4923 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4924 auto *NewF = CloneFunction(&F, *VMaps.back());
4925 FunctionClonesThinBackend++;
4926 // Strip memprof and callsite metadata from clone as they are no longer
4927 // needed.
4928 for (auto &BB : *NewF) {
4929 for (auto &Inst : BB) {
4930 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4931 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4932 }
4933 }
4934 std::string Name = getMemProfFuncName(F.getName(), I);
4935 auto *PrevF = M.getFunction(Name);
4936 if (PrevF) {
4937 // We might have created this when adjusting a callsite in another
4938 // function. It should be a declaration.
4939 assert(PrevF->isDeclaration());
4940 NewF->takeName(PrevF);
4941 PrevF->replaceAllUsesWith(NewF);
4942 PrevF->eraseFromParent();
4943 } else
4944 NewF->setName(Name);
4945 if (auto *SP = NewF->getSubprogram())
4946 SP->replaceLinkageName(
4947 MDString::get(NewF->getParent()->getContext(), Name));
4948 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4949 << "created clone " << ore::NV("NewFunction", NewF));
4950
4951 // Now handle aliases to this function, and clone those as well.
4952 if (!FuncToAliasMap.count(&F))
4953 continue;
4954 for (auto *A : FuncToAliasMap[&F]) {
4955 std::string Name = getMemProfFuncName(A->getName(), I);
4956 auto *PrevA = M.getNamedAlias(Name);
4957 auto *NewA = GlobalAlias::create(A->getValueType(),
4958 A->getType()->getPointerAddressSpace(),
4959 A->getLinkage(), Name, NewF);
4960 NewA->copyAttributesFrom(A);
4961 if (PrevA) {
4962 // We might have created this when adjusting a callsite in another
4963 // function. It should be a declaration.
4964 assert(PrevA->isDeclaration());
4965 NewA->takeName(PrevA);
4966 PrevA->replaceAllUsesWith(NewA);
4967 PrevA->eraseFromParent();
4968 }
4969 }
4970 }
4971 return VMaps;
4972}
4973
4974// Locate the summary for F. This is complicated by the fact that it might
4975// have been internalized or promoted.
4976static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4977 const ModuleSummaryIndex *ImportSummary,
4978 const Function *CallingFunc = nullptr) {
4979 // FIXME: Ideally we would retain the original GUID in some fashion on the
4980 // function (e.g. as metadata), but for now do our best to locate the
4981 // summary without that information.
4982 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4983 if (!TheFnVI)
4984 // See if theFn was internalized, by checking index directly with
4985 // original name (this avoids the name adjustment done by getGUID() for
4986 // internal symbols).
4987 TheFnVI = ImportSummary->getValueInfo(
4988 GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
4989 if (TheFnVI)
4990 return TheFnVI;
4991 // Now query with the original name before any promotion was performed.
4992 StringRef OrigName =
4993 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
4994 // When this pass is enabled, we always add thinlto_src_file provenance
4995 // metadata to imported function definitions, which allows us to recreate the
4996 // original internal symbol's GUID.
4997 auto SrcFileMD = F.getMetadata("thinlto_src_file");
4998 // If this is a call to an imported/promoted local for which we didn't import
4999 // the definition, the metadata will not exist on the declaration. However,
5000 // since we are doing this early, before any inlining in the LTO backend, we
5001 // can simply look at the metadata on the calling function which must have
5002 // been from the same module if F was an internal symbol originally.
5003 if (!SrcFileMD && F.isDeclaration()) {
5004 // We would only call this for a declaration for a direct callsite, in which
5005 // case the caller would have provided the calling function pointer.
5006 assert(CallingFunc);
5007 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5008 // If this is a promoted local (OrigName != F.getName()), since this is a
5009 // declaration, it must be imported from a different module and therefore we
5010 // should always find the metadata on its calling function. Any call to a
5011 // promoted local that came from this module should still be a definition.
5012 assert(SrcFileMD || OrigName == F.getName());
5013 }
5014 StringRef SrcFile = M.getSourceFileName();
5015 if (SrcFileMD)
5016 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5017 std::string OrigId = GlobalValue::getGlobalIdentifier(
5018 OrigName, GlobalValue::InternalLinkage, SrcFile);
5019 TheFnVI = ImportSummary->getValueInfo(
5020 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5021 // Internal func in original module may have gotten a numbered suffix if we
5022 // imported an external function with the same name. This happens
5023 // automatically during IR linking for naming conflicts. It would have to
5024 // still be internal in that case (otherwise it would have been renamed on
5025 // promotion in which case we wouldn't have a naming conflict).
5026 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5027 F.getName().contains('.')) {
5028 OrigName = F.getName().rsplit('.').first;
5029 OrigId = GlobalValue::getGlobalIdentifier(
5030 OrigName, GlobalValue::InternalLinkage, SrcFile);
5031 TheFnVI = ImportSummary->getValueInfo(
5032 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5033 }
5034 // The only way we may not have a VI is if this is a declaration created for
5035 // an imported reference. For distributed ThinLTO we may not have a VI for
5036 // such declarations in the distributed summary.
5037 assert(TheFnVI || F.isDeclaration());
5038 return TheFnVI;
5039}
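// Illustrative sketch (hypothetical names): a local function "bar" originally
// defined in "a.cpp" may appear here as a promoted symbol such as
// "bar.llvm.<hash>" after importing. The lookups above recover its summary by
// recomputing the internal-linkage global identifier from the original name
// "bar" and the source file name (from the module, or from thinlto_src_file
// metadata when bar was imported), which is the GUID recorded in the index.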
5040
5041bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5042 Module &M) {
5043 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5044 Symtab = std::make_unique<InstrProfSymtab>();
5045 // Don't add canonical names, to avoid adding multiple functions to the symtab
5046 // when they have the same root name once "." suffixes are stripped.
5047 // If we pick the wrong one then this could lead to incorrect ICP and calling
5048 // a memprof clone that we don't actually create (resulting in linker unsats).
5049 // What this means is that the GUID of the function (or its PGOFuncName
5050 // metadata) *must* match that in the VP metadata to allow promotion.
5051 // In practice this should not be a limitation, since local functions should
5052 // have PGOFuncName metadata and global function names shouldn't need any
5053 // special handling (they should not get the ".llvm.*" suffix that the
5054 // canonicalization handling is attempting to strip).
5055 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5056 std::string SymtabFailure = toString(std::move(E));
5057 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5058 return false;
5059 }
5060 return true;
5061}
5062
5063#ifndef NDEBUG
5064// Sanity check that the MIB stack ids match between the summary and
5065// instruction metadata.
5066static void checkAllocContextIds(
5067 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5068 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5069 const ModuleSummaryIndex *ImportSummary) {
5070 auto MIBIter = AllocNode.MIBs.begin();
5071 for (auto &MDOp : MemProfMD->operands()) {
5072 assert(MIBIter != AllocNode.MIBs.end());
5073 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5074 auto *MIBMD = cast<const MDNode>(MDOp);
5075 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5076 assert(StackMDNode);
5077 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5078 auto ContextIterBegin =
5079 StackContext.beginAfterSharedPrefix(CallsiteContext);
5080 // Skip the checking on the first iteration.
5081 uint64_t LastStackContextId =
5082 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5083 : 0;
5084 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5085 ++ContextIter) {
5086 // If this is a direct recursion, simply skip the duplicate
5087 // entries, to be consistent with how the summary ids were
5088 // generated during ModuleSummaryAnalysis.
5089 if (LastStackContextId == *ContextIter)
5090 continue;
5091 LastStackContextId = *ContextIter;
5092 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5093 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5094 *ContextIter);
5095 StackIdIndexIter++;
5096 }
5097 MIBIter++;
5098 }
5099}
5100#endif
5101
5102bool MemProfContextDisambiguation::applyImport(Module &M) {
5103 assert(ImportSummary);
5104 bool Changed = false;
5105
5106 // We also need to clone any aliases that reference cloned functions, because
5107 // the modified callsites may invoke via the alias. Keep track of the aliases
5108 // for each function.
5109 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5110 FuncToAliasMap;
5111 for (auto &A : M.aliases()) {
5112 auto *Aliasee = A.getAliaseeObject();
5113 if (auto *F = dyn_cast<Function>(Aliasee))
5114 FuncToAliasMap[F].insert(&A);
5115 }
5116
5117 if (!initializeIndirectCallPromotionInfo(M))
5118 return false;
5119
5120 for (auto &F : M) {
5121 if (F.isDeclaration() || isMemProfClone(F))
5122 continue;
5123
5124 OptimizationRemarkEmitter ORE(&F);
5125
5126 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5127 bool ClonesCreated = false;
5128 unsigned NumClonesCreated = 0;
5129 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
5130 // We should at least have version 0 which is the original copy.
5131 assert(NumClones > 0);
5132 // If only one copy needed use original.
5133 if (NumClones == 1)
5134 return;
5135 // If we already performed cloning of this function, confirm that the
5136 // requested number of clones matches (the thin link should ensure the
5137 // number of clones for each constituent callsite is consistent within
5138 // each function), before returning.
5139 if (ClonesCreated) {
5140 assert(NumClonesCreated == NumClones);
5141 return;
5142 }
5143 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
5144 // The first "clone" is the original copy, which doesn't have a VMap.
5145 assert(VMaps.size() == NumClones - 1);
5146 Changed = true;
5147 ClonesCreated = true;
5148 NumClonesCreated = NumClones;
5149 };
5150
5151 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5152 Function *CalledFunction) {
5153 // Perform cloning if not yet done.
5154 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
5155
5156 assert(!isMemProfClone(*CalledFunction));
5157
5158 // Update the calls per the summary info.
5159 // Save orig name since it gets updated in the first iteration
5160 // below.
5161 auto CalleeOrigName = CalledFunction->getName();
5162 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5163 // Do nothing if this version calls the original version of its
5164 // callee.
5165 if (!StackNode.Clones[J])
5166 continue;
5167 auto NewF = M.getOrInsertFunction(
5168 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5169 CalledFunction->getFunctionType());
5170 CallBase *CBClone;
5171 // Copy 0 is the original function.
5172 if (!J)
5173 CBClone = CB;
5174 else
5175 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5176 CBClone->setCalledFunction(NewF);
5177 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5178 << ore::NV("Call", CBClone) << " in clone "
5179 << ore::NV("Caller", CBClone->getFunction())
5180 << " assigned to call function clone "
5181 << ore::NV("Callee", NewF.getCallee()));
5182 }
5183 };
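    // Illustrative example: for a summary record with Clones = {0, 2}, version
    // 0 of the callsite keeps calling the original callee, while version 1 is
    // redirected to the callee clone named by getMemProfFuncName (typically of
    // the form "<callee>.memprof.2").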

    // Locate the summary for F.
    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
    // If not found, this could be an imported local (see comment in
    // findValueInfoForFunc). Skip for now as it will be cloned in its original
    // module (where it would have been promoted to global scope so should
    // satisfy any reference in this module).
    if (!TheFnVI)
      continue;

    auto *GVSummary =
        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
    if (!GVSummary) {
      // Must have been imported, use the summary which matches the definition
      // (might be multiple if this was a linkonce_odr).
      auto SrcModuleMD = F.getMetadata("thinlto_src_module");
      assert(SrcModuleMD &&
             "enable-import-metadata is needed to emit thinlto_src_module");
      StringRef SrcModule =
          dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
      for (auto &GVS : TheFnVI.getSummaryList()) {
        if (GVS->modulePath() == SrcModule) {
          GVSummary = GVS.get();
          break;
        }
      }
      assert(GVSummary && GVSummary->modulePath() == SrcModule);
    }

    // If this was an imported alias skip it as we won't have the function
    // summary, and it should be cloned in the original module.
    if (isa<AliasSummary>(GVSummary))
      continue;

    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());

    if (FS->allocs().empty() && FS->callsites().empty())
      continue;

    auto SI = FS->callsites().begin();
    auto AI = FS->allocs().begin();

    // To handle callsite infos synthesized for tail calls which have missing
    // frames in the profiled context, map the callee VI to the synthesized
    // callsite info.
    DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
    // Iterate the callsites for this function in reverse, since we place all
    // those synthesized for tail calls at the end.
    for (auto CallsiteIt = FS->callsites().rbegin();
         CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
      auto &Callsite = *CallsiteIt;
      // Stop as soon as we see a non-synthesized callsite info (see comment
      // above loop). All the entries added for discovered tail calls have
      // empty stack ids.
      if (!Callsite.StackIdIndices.empty())
        break;
      MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
    }
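    // Rough example of what this handles (hypothetical functions): if the
    // profiled context recorded a caller of this function invoking C, but in
    // the IR this function reaches C through a tail call, the thin link
    // synthesized a CallsiteInfo (with empty StackIdIndices) for the
    // discovered tail call; keying it by callee lets the tail-call case below
    // pick up its cloning decisions.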

    // Keeps track of needed ICP for the function.
    SmallVector<ICallAnalysisData> ICallAnalysisInfo;

    // Assume for now that the instructions are in the exact same order
    // as when the summary was created, but confirm this is correct by
    // matching the stack ids.
    for (auto &BB : F) {
      for (auto &I : BB) {
        auto *CB = dyn_cast<CallBase>(&I);
        // Same handling as when creating module summary.
        if (!mayHaveMemprofSummary(CB))
          continue;

        auto *CalledValue = CB->getCalledOperand();
        auto *CalledFunction = CB->getCalledFunction();
        if (CalledValue && !CalledFunction) {
          CalledValue = CalledValue->stripPointerCasts();
          // Stripping pointer casts can reveal a called function.
          CalledFunction = dyn_cast<Function>(CalledValue);
        }
        // Check if this is an alias to a function. If so, get the
        // called aliasee for the checks below.
        if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
          assert(!CalledFunction &&
                 "Expected null called function in callsite for alias");
          CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
        }

        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
            I.getMetadata(LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics.
        if (CB->getAttributes().hasFnAttr("memprof")) {
          assert(!MemProfMD);
          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

#ifndef NDEBUG
          checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
                               ImportSummary);
#endif

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();

          // If there is only one version, that means we didn't end up
          // considering this function for cloning; in that case the alloc
          // will still have None type or should have gotten the default
          // NotCold. Skip it, but only after calling the clone helper above,
          // since that performs some sanity checks confirming we haven't yet
          // decided that cloning is needed. Note that we might have a single
          // version that is cold due to the MinClonedColdBytePercent
          // heuristic, so make sure we don't skip in that case.
          if (AllocNode.Versions.size() == 1 &&
              (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));

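          // Illustrative example: an AllocNode with Versions = {NotCold, Cold}
          // results in the allocation call in the original function getting
          // attribute "memprof"="notcold" and the corresponding call in clone
          // 1 (reached via VMaps[0]) getting "memprof"="cold".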
          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
            auto A = llvm::Attribute::get(F.getContext(), "memprof",
                                          AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a
              // VMaps entry).
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(A);
            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
        } else if (!CallsiteContext.empty()) {
          if (!CalledFunction) {
#ifndef NDEBUG
            // We should have skipped inline assembly calls.
            auto *CI = dyn_cast<CallInst>(CB);
            assert(!CI || !CI->isInlineAsm());
#endif
            // We should have skipped direct calls via a Constant.
            assert(CalledValue && !isa<Constant>(CalledValue));

            // This is an indirect call, see if we have profile information
            // and whether any clones were recorded for the profiled targets
            // (that we synthesized CallsiteInfo summary records for when
            // building the index).
            auto NumClones =
                recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);

            // Perform cloning if not yet done. This is done here in case
            // we don't need to do ICP, but might need to clone this
            // function as it is the target of other cloned calls.
            if (NumClones)
              CloneFuncIfNeeded(NumClones);
          } else {
            // Consult the next callsite node.
            assert(SI != FS->callsites().end());
            auto &StackNode = *(SI++);

#ifndef NDEBUG
            // Sanity check that the stack ids match between the summary and
            // instruction metadata.
            auto StackIdIndexIter = StackNode.StackIdIndices.begin();
            for (auto StackId : CallsiteContext) {
              assert(StackIdIndexIter != StackNode.StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     StackId);
              StackIdIndexIter++;
            }
#endif

            CloneCallsite(StackNode, CB, CalledFunction);
          }
        } else if (CB->isTailCall() && CalledFunction) {
          // Locate the synthesized callsite info for the callee VI, if any
          // was created, and use that for cloning.
          ValueInfo CalleeVI =
              findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
      }
    }

    // Now do any promotion required for cloning.
    performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
  }

  // We skip some of the functions and instructions above, so remove all the
  // metadata in a single sweep here.
  for (auto &F : M) {
    // We can skip memprof clones because createFunctionClones already strips
    // the metadata from the newly created clones.
    if (F.isDeclaration() || isMemProfClone(F))
      continue;
    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(I))
          continue;
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
  }

  return Changed;
}

unsigned MemProfContextDisambiguation::recordICPInfo(
    CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<CallsiteInfo>::iterator &SI,
    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
  // First see if we have profile information for this indirect call.
  uint32_t NumCandidates;
  uint64_t TotalCount;
  auto CandidateProfileData =
      ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
                                                          NumCandidates);
  if (CandidateProfileData.empty())
    return 0;

  // Iterate through all of the candidate profiled targets along with the
  // CallsiteInfo summary records synthesized for them when building the index,
  // and see if any are cloned and/or refer to clones.
  bool ICPNeeded = false;
  unsigned NumClones = 0;
  size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
  for (const auto &Candidate : CandidateProfileData) {
#ifndef NDEBUG
    auto CalleeValueInfo =
#endif
        ImportSummary->getValueInfo(Candidate.Value);
    // We might not have a ValueInfo if this is a distributed
    // ThinLTO backend and decided not to import that function.
    assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
    assert(SI != AllCallsites.end());
    auto &StackNode = *(SI++);
    // See if any of the clones of the indirect callsite for this
    // profiled target should call a cloned version of the profiled
    // target. We only need to do the ICP here if so.
    ICPNeeded |= llvm::any_of(StackNode.Clones,
                              [](unsigned CloneNo) { return CloneNo != 0; });
    // Every callsite in the same function should have been cloned the same
    // number of times.
    assert(!NumClones || NumClones == StackNode.Clones.size());
    NumClones = StackNode.Clones.size();
  }
  if (!ICPNeeded)
    return NumClones;
  // Save information for ICP, which is performed later to avoid messing up the
  // current function traversal.
  ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
                               TotalCount, CallsiteInfoStartIndex});
  return NumClones;
}

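// Illustrative note (hypothetical targets): for an indirect call whose profile
// lists targets T1 and T2, the index holds one synthesized CallsiteInfo per
// profiled target at this callsite's position, which is why recordICPInfo
// advances SI once per candidate. If, say, T1's record has Clones = {0, 1},
// clone 1 of the caller must be promoted so it can directly call T1's clone;
// that work is queued in ICallAnalysisInfo and carried out by performICP below.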
void MemProfContextDisambiguation::performICP(
    Module &M, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
    ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
    OptimizationRemarkEmitter &ORE) {
  // Now do any promotion required for cloning. Specifically, for each
  // recorded ICP candidate (which was only recorded because one clone of that
  // candidate should call a cloned target), we perform ICP (speculative
  // devirtualization) for each clone of the callsite, and update its callee
  // to the appropriate clone. Note that the ICP compares against the original
  // version of the target, which is what is in the vtable.
  for (auto &Info : ICallAnalysisInfo) {
    auto *CB = Info.CB;
    auto CallsiteIndex = Info.CallsiteInfoStartIndex;
    auto TotalCount = Info.TotalCount;
    unsigned NumPromoted = 0;
    unsigned NumClones = 0;

    for (auto &Candidate : Info.CandidateProfileData) {
      auto &StackNode = AllCallsites[CallsiteIndex++];

      // All calls in the same function must have the same number of clones.
      assert(!NumClones || NumClones == StackNode.Clones.size());
      NumClones = StackNode.Clones.size();

      // See if the target is in the module. If it wasn't imported, it is
      // possible that this profile could have been collected on a different
      // target (or version of the code), and we need to be conservative
      // (similar to what is done in the ICP pass).
      Function *TargetFunction = Symtab->getFunction(Candidate.Value);
      if (TargetFunction == nullptr ||
          // Any ThinLTO global dead symbol removal should have already
          // occurred, so it should be safe to promote when the target is a
          // declaration.
          // TODO: Remove internal option once more fully tested.
          (MemProfRequireDefinitionForPromotion &&
           TargetFunction->isDeclaration())) {
        ORE.emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
                 << "Memprof cannot promote indirect call: target with md5sum "
                 << ore::NV("target md5sum", Candidate.Value) << " not found";
        });
        // FIXME: See if we can use the new declaration importing support to
        // at least get the declarations imported for this case. Hot indirect
        // targets should have been imported normally, however.
        continue;
      }

      // Check if it is legal to promote.
      const char *Reason = nullptr;
      if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
        ORE.emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
                 << "Memprof cannot promote indirect call to "
                 << ore::NV("TargetFunction", TargetFunction)
                 << " with count of " << ore::NV("TotalCount", TotalCount)
                 << ": " << Reason;
        });
        continue;
      }

      assert(!isMemProfClone(*TargetFunction));

      // Handle each call clone, applying ICP so that each clone directly
      // calls the specified callee clone, guarded by the appropriate ICP
      // check.
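      // For example (illustrative), after promotion each clone's callsite is
      // roughly of the form:
      //   if (callee == @Target)      ; compare against the original symbol
      //     call @Target.memprof.N()  ; direct call, retargeted just below
      //   else
      //     call %callee()            ; fallback indirect call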
      CallBase *CBClone = CB;
      for (unsigned J = 0; J < NumClones; J++) {
        // Copy 0 is the original function.
        if (J > 0)
          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
        // We do the promotion using the original name, so that the comparison
        // is against the name in the vtable. Then just below, change the new
        // direct call to call the cloned function.
        auto &DirectCall =
            pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
                                     TotalCount, isSamplePGO, &ORE);
        auto *TargetToUse = TargetFunction;
        // Call the original callee if this version calls the original version
        // of its callee (Clones[J] == 0); otherwise retarget the new direct
        // call to the specified callee clone.
        if (StackNode.Clones[J]) {
          TargetToUse =
              cast<Function>(M.getOrInsertFunction(
                                  getMemProfFuncName(TargetFunction->getName(),
                                                     StackNode.Clones[J]),
                                  TargetFunction->getFunctionType())
                                 .getCallee());
        }
        DirectCall.setCalledFunction(TargetToUse);
        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " promoted and assigned to call function clone "
                 << ore::NV("Callee", TargetToUse));
      }

      // Update TotalCount (all clones should get the same count above).
      TotalCount -= Candidate.Count;
      NumPromoted++;
    }
    // Adjust the MD.prof metadata for all clones, now that we have the new
    // TotalCount and the number promoted.
    CallBase *CBClone = CB;
    for (unsigned J = 0; J < NumClones; J++) {
      // Copy 0 is the original function.
      if (J > 0)
        CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
      // First delete the old metadata.
      CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
      // If everything was promoted, we don't need the MD.prof metadata.
      // Otherwise we need to write back the unpromoted records.
      if (TotalCount != 0)
        annotateValueSite(
            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
            TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
    }
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
  if (DumpCCG) {
    dbgs() << "CCG before cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("postbuild");

  if (VerifyCCG) {
    check();
  }

  identifyClones();

  if (VerifyCCG) {
    check();
  }

  if (DumpCCG) {
    dbgs() << "CCG after cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("cloned");

  bool Changed = assignFunctions();

  if (DumpCCG) {
    dbgs() << "CCG after assigning function clones:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("clonefuncassign");

  if (MemProfReportHintedSizes)
    printTotalSizes(errs());

  return Changed;
}

bool MemProfContextDisambiguation::processModule(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {

  // If we have an import summary, then the cloning decisions were made during
  // the thin link on the index. Apply them and return.
  if (ImportSummary)
    return applyImport(M);

  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // Note that we specifically check this after applying imports above, so
  // that the option doesn't need to be passed to distributed ThinLTO backend
  // clang processes, which won't necessarily have visibility into the linker
  // dependences. Instead the information is communicated from the LTO link to
  // the backends via the combined summary index.
  if (!SupportsHotColdNew)
    return false;

  ModuleCallsiteContextGraph CCG(M, OREGetter);
  return CCG.process();
}

MemProfContextDisambiguation::MemProfContextDisambiguation(
    const ModuleSummaryIndex *Summary, bool isSamplePGO)
    : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
  // Check the dot graph printing options once here, to make sure we have valid
  // and expected combinations.
  if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
    llvm::report_fatal_error(
        "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
  if (DotGraphScope == DotScope::Context &&
      !ContextIdForDot.getNumOccurrences())
    llvm::report_fatal_error(
        "-memprof-dot-scope=context requires -memprof-dot-context-id");
  if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
      ContextIdForDot.getNumOccurrences())
    llvm::report_fatal_error(
        "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
        "-memprof-dot-context-id");
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
    assert(MemProfImportSummary.empty());
    return;
  }
  if (MemProfImportSummary.empty())
    return;

  auto ReadSummaryFile =
      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}

PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  if (!processModule(M, OREGetter))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

void MemProfContextDisambiguation::run(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing) {
  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // The index was set from the option, so these should be in sync.
  assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
  if (!SupportsHotColdNew)
    return;

  IndexCallsiteContextGraph CCG(Index, isPrevailing);
  CCG.process();
}