//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//
22
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/SHA1.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Instrumentation.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114STATISTIC(AliaseesPrevailingInDiffModuleFromAlias,
115 "Number of aliasees prevailing in a different module than its alias");
116
117static cl::opt<std::string> DotFilePathPrefix(
118 "memprof-dot-file-path-prefix", cl::init(Val: ""), cl::Hidden,
119 cl::value_desc("filename"),
120 cl::desc("Specify the path prefix of the MemProf dot files."));
121
122static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(Val: false),
123 cl::Hidden,
124 cl::desc("Export graph to dot files."));
125
126// TODO: Remove this option once new handling is validated more widely.
127static cl::opt<bool> DoMergeIteration(
128 "memprof-merge-iteration", cl::init(Val: true), cl::Hidden,
129 cl::desc("Iteratively apply merging on a node to catch new callers"));
130
131// How much of the graph to export to dot.
132enum DotScope {
133 All, // The full CCG graph.
134 Alloc, // Only contexts for the specified allocation.
135 Context, // Only the specified context.
136};
137
138static cl::opt<DotScope> DotGraphScope(
139 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
140 cl::Hidden, cl::init(Val: DotScope::All),
141 cl::values(
142 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
143 clEnumValN(DotScope::Alloc, "alloc",
144 "Export only nodes with contexts feeding given "
145 "-memprof-dot-alloc-id"),
146 clEnumValN(DotScope::Context, "context",
147 "Export only nodes with given -memprof-dot-context-id")));
148
149static cl::opt<unsigned>
150 AllocIdForDot("memprof-dot-alloc-id", cl::init(Val: 0), cl::Hidden,
151 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
152 "or to highlight if -memprof-dot-scope=all"));
153
154static cl::opt<unsigned> ContextIdForDot(
155 "memprof-dot-context-id", cl::init(Val: 0), cl::Hidden,
156 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
157 "highlight otherwise"));
158
159static cl::opt<bool>
160 DumpCCG("memprof-dump-ccg", cl::init(Val: false), cl::Hidden,
161 cl::desc("Dump CallingContextGraph to stdout after each stage."));
162
163static cl::opt<bool>
164 VerifyCCG("memprof-verify-ccg", cl::init(Val: false), cl::Hidden,
165 cl::desc("Perform verification checks on CallingContextGraph."));
166
167static cl::opt<bool>
168 VerifyNodes("memprof-verify-nodes", cl::init(Val: false), cl::Hidden,
169 cl::desc("Perform frequent verification checks on nodes."));
170
171static cl::opt<std::string> MemProfImportSummary(
172 "memprof-import-summary",
173 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
174 cl::Hidden);
175
176static cl::opt<unsigned>
177 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(Val: 5),
178 cl::Hidden,
179 cl::desc("Max depth to recursively search for missing "
180 "frames through tail calls."));
181
182// Optionally enable cloning of callsites involved with recursive cycles
183static cl::opt<bool> AllowRecursiveCallsites(
184 "memprof-allow-recursive-callsites", cl::init(Val: true), cl::Hidden,
185 cl::desc("Allow cloning of callsites involved in recursive cycles"));
186
187static cl::opt<bool> CloneRecursiveContexts(
188 "memprof-clone-recursive-contexts", cl::init(Val: true), cl::Hidden,
189 cl::desc("Allow cloning of contexts through recursive cycles"));
190
191// Generally this is needed for correct assignment of allocation clones to
192// function clones, however, allow it to be disabled for debugging while the
193// functionality is new and being tested more widely.
194static cl::opt<bool>
195 MergeClones("memprof-merge-clones", cl::init(Val: true), cl::Hidden,
196 cl::desc("Merge clones before assigning functions"));
197
198// When disabled, try to detect and prevent cloning of recursive contexts.
199// This is only necessary until we support cloning through recursive cycles.
200// Leave on by default for now, as disabling requires a little bit of compile
201// time overhead and doesn't affect correctness, it will just inflate the cold
202// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
203static cl::opt<bool> AllowRecursiveContexts(
204 "memprof-allow-recursive-contexts", cl::init(Val: true), cl::Hidden,
205 cl::desc("Allow cloning of contexts having recursive cycles"));
206
207// Set the minimum absolute count threshold for allowing inlining of indirect
208// calls promoted during cloning.
209static cl::opt<unsigned> MemProfICPNoInlineThreshold(
210 "memprof-icp-noinline-threshold", cl::init(Val: 2), cl::Hidden,
211 cl::desc("Minimum absolute count for promoted target to be inlinable"));
212
213namespace llvm {
214cl::opt<bool> EnableMemProfContextDisambiguation(
215 "enable-memprof-context-disambiguation", cl::init(Val: false), cl::Hidden,
216 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
217
218// Indicate we are linking with an allocator that supports hot/cold operator
219// new interfaces.
220cl::opt<bool> SupportsHotColdNew(
221 "supports-hot-cold-new", cl::init(Val: false), cl::Hidden,
222 cl::desc("Linking with hot/cold operator new interfaces"));
223
224static cl::opt<bool> MemProfRequireDefinitionForPromotion(
225 "memprof-require-definition-for-promotion", cl::init(Val: false), cl::Hidden,
226 cl::desc(
227 "Require target function definition when promoting indirect calls"));
228
229extern cl::opt<bool> MemProfReportHintedSizes;
230extern cl::opt<unsigned> MinClonedColdBytePercent;
231
232cl::opt<unsigned> MemProfTopNImportant(
233 "memprof-top-n-important", cl::init(Val: 10), cl::Hidden,
234 cl::desc("Number of largest cold contexts to consider important"));
235
236cl::opt<bool> MemProfFixupImportant(
237 "memprof-fixup-important", cl::init(Val: true), cl::Hidden,
238 cl::desc("Enables edge fixup for important contexts"));
239
240extern cl::opt<unsigned> MaxSummaryIndirectEdges;
241
242} // namespace llvm
243
244namespace {
245
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
260template <typename DerivedCCG, typename FuncTy, typename CallTy>
261class CallsiteContextGraph {
262public:
263 CallsiteContextGraph() = default;
264 CallsiteContextGraph(const CallsiteContextGraph &) = default;
265 CallsiteContextGraph(CallsiteContextGraph &&) = default;
266
267 /// Main entry point to perform analysis and transformations on graph.
268 bool process(function_ref<void(StringRef, StringRef, const Twine &)>
269 EmitRemark = nullptr);
270
271 /// Perform cloning on the graph necessary to uniquely identify the allocation
272 /// behavior of an allocation based on its context.
273 void identifyClones();
274
275 /// Assign callsite clones to functions, cloning functions as needed to
276 /// accommodate the combinations of their callsite clones reached by callers.
277 /// For regular LTO this clones functions and callsites in the IR, but for
278 /// ThinLTO the cloning decisions are noted in the summaries and later applied
279 /// in applyImport.
280 bool assignFunctions();
281
282 void dump() const;
283 void print(raw_ostream &OS) const;
284 void printTotalSizes(raw_ostream &OS,
285 function_ref<void(StringRef, StringRef, const Twine &)>
286 EmitRemark = nullptr) const;
287
288 friend raw_ostream &operator<<(raw_ostream &OS,
289 const CallsiteContextGraph &CCG) {
290 CCG.print(OS);
291 return OS;
292 }
293
294 friend struct GraphTraits<
295 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
296 friend struct DOTGraphTraits<
297 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
298
299 void exportToDot(std::string Label) const;
300
301 /// Represents a function clone via FuncTy pointer and clone number pair.
302 struct FuncInfo final
303 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
304 using Base = std::pair<FuncTy *, unsigned>;
305 FuncInfo(const Base &B) : Base(B) {}
306 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
307 explicit operator bool() const { return this->first != nullptr; }
308 FuncTy *func() const { return this->first; }
309 unsigned cloneNo() const { return this->second; }
310 };
311
312 /// Represents a callsite clone via CallTy and clone number pair.
313 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
314 using Base = std::pair<CallTy, unsigned>;
315 CallInfo(const Base &B) : Base(B) {}
316 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
317 : Base(Call, CloneNo) {}
318 explicit operator bool() const { return (bool)this->first; }
319 CallTy call() const { return this->first; }
320 unsigned cloneNo() const { return this->second; }
321 void setCloneNo(unsigned N) { this->second = N; }
322 void print(raw_ostream &OS) const {
323 if (!operator bool()) {
324 assert(!cloneNo());
325 OS << "null Call";
326 return;
327 }
328 call()->print(OS);
329 OS << "\t(clone " << cloneNo() << ")";
330 }
331 void dump() const {
332 print(OS&: dbgs());
333 dbgs() << "\n";
334 }
335 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
336 Call.print(OS);
337 return OS;
338 }
339 };
340
341 struct ContextEdge;
342
343 /// Node in the Callsite Context Graph
344 struct ContextNode {
345 // Assigned to nodes as they are created, useful for debugging.
346 unsigned NodeId = 0;
347
348 // Keep this for now since in the IR case where we have an Instruction* it
349 // is not as immediately discoverable. Used for printing richer information
350 // when dumping graph.
351 bool IsAllocation;
352
353 // Keeps track of when the Call was reset to null because there was
354 // recursion.
355 bool Recursive = false;
356
357 // This will be formed by ORing together the AllocationType enum values
358 // for contexts including this node.
359 uint8_t AllocTypes = 0;
360
361 // The corresponding allocation or interior call. This is the primary call
362 // for which we have created this node.
363 CallInfo Call;
364
365 // List of other calls that can be treated the same as the primary call
366 // through cloning. I.e. located in the same function and have the same
367 // (possibly pruned) stack ids. They will be updated the same way as the
368 // primary call when assigning to function clones.
369 SmallVector<CallInfo, 0> MatchingCalls;
370
371 // For alloc nodes this is a unique id assigned when constructed, and for
372 // callsite stack nodes it is the original stack id when the node is
373 // constructed from the memprof MIB metadata on the alloc nodes. Note that
374 // this is only used when matching callsite metadata onto the stack nodes
375 // created when processing the allocation memprof MIBs, and for labeling
376 // nodes in the dot graph. Therefore we don't bother to assign a value for
377 // clones.
378 uint64_t OrigStackOrAllocId = 0;
379
380 // Edges to all callees in the profiled call stacks.
381 // TODO: Should this be a map (from Callee node) for more efficient lookup?
382 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
383
384 // Edges to all callers in the profiled call stacks.
385 // TODO: Should this be a map (from Caller node) for more efficient lookup?
386 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
387
388 // Returns true if we need to look at the callee edges for determining the
389 // node context ids and allocation type.
390 bool useCallerEdgesForContextInfo() const {
391 // Typically if the callee edges are empty either the caller edges are
392 // also empty, or this is an allocation (leaf node). However, if we are
393 // allowing recursive callsites and contexts this will be violated for
394 // incompletely cloned recursive cycles.
395 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
396 (AllowRecursiveCallsites && AllowRecursiveContexts));
397 // When cloning for a recursive context, during cloning we might be in the
398 // midst of cloning for a recurrence and have moved context ids off of a
399 // caller edge onto the clone but not yet off of the incoming caller
400 // (back) edge. If we don't look at those we miss the fact that this node
401 // still has context ids of interest.
402 return IsAllocation || CloneRecursiveContexts;
403 }
404
405 // Compute the context ids for this node from the union of its edge context
406 // ids.
407 DenseSet<uint32_t> getContextIds() const {
408 unsigned Count = 0;
409 // Compute the number of ids for reserve below. In general we only need to
410 // look at one set of edges, typically the callee edges, since other than
411 // allocations and in some cases during recursion cloning, all the context
412 // ids on the callers should also flow out via callee edges.
413 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
414 Count += Edge->getContextIds().size();
415 DenseSet<uint32_t> ContextIds;
416 ContextIds.reserve(Size: Count);
417 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
418 CalleeEdges, useCallerEdgesForContextInfo()
419 ? CallerEdges
420 : std::vector<std::shared_ptr<ContextEdge>>());
421 for (const auto &Edge : Edges)
422 ContextIds.insert_range(Edge->getContextIds());
423 return ContextIds;
424 }
425
426 // Compute the allocation type for this node from the OR of its edge
427 // allocation types.
428 uint8_t computeAllocType() const {
429 uint8_t BothTypes =
430 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
431 uint8_t AllocType = (uint8_t)AllocationType::None;
432 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
433 CalleeEdges, useCallerEdgesForContextInfo()
434 ? CallerEdges
435 : std::vector<std::shared_ptr<ContextEdge>>());
436 for (const auto &Edge : Edges) {
437 AllocType |= Edge->AllocTypes;
438 // Bail early if alloc type reached both, no further refinement.
439 if (AllocType == BothTypes)
440 return AllocType;
441 }
442 return AllocType;
443 }
444
445 // The context ids set for this node is empty if its edge context ids are
446 // also all empty.
447 bool emptyContextIds() const {
448 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
449 CalleeEdges, useCallerEdgesForContextInfo()
450 ? CallerEdges
451 : std::vector<std::shared_ptr<ContextEdge>>());
452 for (const auto &Edge : Edges) {
453 if (!Edge->getContextIds().empty())
454 return false;
455 }
456 return true;
457 }
458
459 // List of clones of this ContextNode, initially empty.
460 std::vector<ContextNode *> Clones;
461
462 // If a clone, points to the original uncloned node.
463 ContextNode *CloneOf = nullptr;
464
465 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
466
467 ContextNode(bool IsAllocation, CallInfo C)
468 : IsAllocation(IsAllocation), Call(C) {}
469
470 void addClone(ContextNode *Clone) {
471 if (CloneOf) {
472 CloneOf->Clones.push_back(Clone);
473 Clone->CloneOf = CloneOf;
474 } else {
475 Clones.push_back(Clone);
476 assert(!Clone->CloneOf);
477 Clone->CloneOf = this;
478 }
479 }
480
481 ContextNode *getOrigNode() {
482 if (!CloneOf)
483 return this;
484 return CloneOf;
485 }
486
487 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
488 unsigned int ContextId);
489
490 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
491 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
492 void eraseCalleeEdge(const ContextEdge *Edge);
493 void eraseCallerEdge(const ContextEdge *Edge);
494
495 void setCall(CallInfo C) { Call = std::move(C); }
496
497 bool hasCall() const { return (bool)Call.call(); }
498
499 void printCall(raw_ostream &OS) const { Call.print(OS); }
500
501 // True if this node was effectively removed from the graph, in which case
502 // it should have an allocation type of None and empty context ids.
503 bool isRemoved() const {
504 // Typically if the callee edges are empty either the caller edges are
505 // also empty, or this is an allocation (leaf node). However, if we are
506 // allowing recursive callsites and contexts this will be violated for
507 // incompletely cloned recursive cycles.
508 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
509 (AllocTypes == (uint8_t)AllocationType::None) ==
510 emptyContextIds());
511 return AllocTypes == (uint8_t)AllocationType::None;
512 }
513
514 void dump() const;
515 void print(raw_ostream &OS) const;
516
517 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
518 Node.print(OS);
519 return OS;
520 }
521 };
522
523 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
524 /// callee.
525 struct ContextEdge {
526 ContextNode *Callee;
527 ContextNode *Caller;
528
529 // This will be formed by ORing together the AllocationType enum values
530 // for contexts including this edge.
531 uint8_t AllocTypes = 0;
532
533 // Set just before initiating cloning when cloning of recursive contexts is
534 // enabled. Used to defer cloning of backedges until we have done cloning of
535 // the callee node for non-backedge caller edges. This exposes cloning
536 // opportunities through the backedge of the cycle.
537 // TODO: Note that this is not updated during cloning, and it is unclear
538 // whether that would be needed.
539 bool IsBackedge = false;
540
541 // The set of IDs for contexts including this edge.
542 DenseSet<uint32_t> ContextIds;
543
544 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
545 DenseSet<uint32_t> ContextIds)
546 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
547 ContextIds(std::move(ContextIds)) {}
548
549 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
550
551 // Helper to clear the fields of this edge when we are removing it from the
552 // graph.
553 inline void clear() {
554 ContextIds.clear();
555 AllocTypes = (uint8_t)AllocationType::None;
556 Caller = nullptr;
557 Callee = nullptr;
558 }
559
560 // Check if edge was removed from the graph. This is useful while iterating
561 // over a copy of edge lists when performing operations that mutate the
562 // graph in ways that might remove one of the edges.
563 inline bool isRemoved() const {
564 if (Callee || Caller)
565 return false;
566 // Any edges that have been removed from the graph but are still in a
567 // shared_ptr somewhere should have all fields null'ed out by clear()
568 // above.
569 assert(AllocTypes == (uint8_t)AllocationType::None);
570 assert(ContextIds.empty());
571 return true;
572 }
573
574 void dump() const;
575 void print(raw_ostream &OS) const;
576
577 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
578 Edge.print(OS);
579 return OS;
580 }
581 };
582
583 /// Helpers to remove edges that have allocation type None (due to not
584 /// carrying any context ids) after transformations.
585 void removeNoneTypeCalleeEdges(ContextNode *Node);
586 void removeNoneTypeCallerEdges(ContextNode *Node);
587 void
588 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
589 DenseSet<const ContextNode *> &Visited);
590
591protected:
592 /// Get a list of nodes corresponding to the stack ids in the given callsite
593 /// context.
594 template <class NodeT, class IteratorT>
595 std::vector<uint64_t>
596 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
597
598 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
599 /// metadata (or summary).
600 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
601
602 /// Adds nodes for the given MIB stack ids.
603 template <class NodeT, class IteratorT>
604 void addStackNodesForMIB(
605 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
606 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
607 ArrayRef<ContextTotalSize> ContextSizeInfo,
608 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
609
610 /// Matches all callsite metadata (or summary) to the nodes created for
611 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
612 /// inlining performed on those callsite instructions.
613 void updateStackNodes();
614
615 /// Optionally fixup edges for the N largest cold contexts to better enable
616 /// cloning. This is particularly helpful if the context includes recursion
617 /// as well as inlining, resulting in a single stack node for multiple stack
618 /// ids in the context. With recursion it is particularly difficult to get the
619 /// edge updates correct as in the general case we have lost the original
620 /// stack id ordering for the context. Do more expensive fixup for the largest
621 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
622 void fixupImportantContexts();
623
624 /// Update graph to conservatively handle any callsite stack nodes that target
625 /// multiple different callee target functions.
626 void handleCallsitesWithMultipleTargets();
627
628 /// Mark backedges via the standard DFS based backedge algorithm.
629 void markBackedges();
630
631 /// Merge clones generated during cloning for different allocations but that
632 /// are called by the same caller node, to ensure proper function assignment.
633 void mergeClones();
634
635 // Try to partition calls on the given node (already placed into the AllCalls
636 // array) by callee function, creating new copies of Node as needed to hold
637 // calls with different callees, and moving the callee edges appropriately.
638 // Returns true if partitioning was successful.
639 bool partitionCallsByCallee(
640 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
641 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
642
643 /// Save lists of calls with MemProf metadata in each function, for faster
644 /// iteration.
645 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
646
647 /// Map from callsite node to the enclosing caller function.
648 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
649
650 // When exporting to dot, and an allocation id is specified, contains the
651 // context ids on that allocation.
652 DenseSet<uint32_t> DotAllocContextIds;
653
654private:
655 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
656
657 // Structure to keep track of information for each call as we are matching
658 // non-allocation callsites onto context nodes created from the allocation
659 // call metadata / summary contexts.
660 struct CallContextInfo {
661 // The callsite we're trying to match.
662 CallTy Call;
663 // The callsites stack ids that have a context node in the graph.
664 std::vector<uint64_t> StackIds;
665 // The function containing this callsite.
666 const FuncTy *Func;
667 // Initially empty, if needed this will be updated to contain the context
668 // ids for use in a new context node created for this callsite.
669 DenseSet<uint32_t> ContextIds;
670 };
671
672 /// Helper to remove edge from graph, updating edge iterator if it is provided
673 /// (in which case CalleeIter indicates which edge list is being iterated).
674 /// This will also perform the necessary clearing of the ContextEdge members
675 /// to enable later checking if the edge has been removed (since we may have
676 /// other copies of the shared_ptr in existence, and in fact rely on this to
677 /// enable removal while iterating over a copy of a node's edge list).
678 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
679 bool CalleeIter = true);
680
681 /// Assigns the given Node to calls at or inlined into the location with
682 /// the Node's stack id, after post order traversing and processing its
683 /// caller nodes. Uses the call information recorded in the given
684 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
685 /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
      DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
      const DenseSet<uint32_t> &ImportantContextIds);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee,
                      DenseSet<uint32_t> RemainingContextIds);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  /// Dispatches to the derived class implementation via CRTP.
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
  /// the updated edges and to prepare it for an increment in the caller.
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  // Return the callee function of the given call, or nullptr if it can't be
  // determined. Dispatches to the derived class implementation via CRTP.
  const FuncTy *getCalleeFunc(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
  }

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  /// Dispatches to the derived class implementation via CRTP.
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Returns true if both call instructions have the same callee.
  bool sameCallee(CallTy Call1, CallTy Call2) {
    return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context. Dispatches to the derived class implementation.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  /// Dispatches to the derived class implementation via CRTP.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }
761 /// Update the allocation call to record type of allocated memory.
762 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
763 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
764 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
765 }
766
  /// Get the AllocationType assigned to the given allocation instruction clone.
  /// Dispatches to the derived class implementation via CRTP.
  AllocationType getAllocationCallType(const CallInfo &Call) const {
    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc. Dispatches to the derived class implementation via CRTP.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording mapping of all
  /// of the functions tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  /// Dispatches to the derived class implementation via CRTP.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function. Dispatches to the derived class implementation via CRTP.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  // Create and return a new ContextNode, owned by NodeOwner. If F is non-null
  // the new node is recorded as belonging to that function.
  ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
                             CallInfo C = CallInfo()) {
    NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
    auto *NewNode = NodeOwner.back().get();
    if (F)
      NodeToCallingFunc[NewNode] = F;
    // Use the 1-based position in NodeOwner as a stable unique node id.
    NewNode->NodeId = NodeOwner.size();
    return NewNode;
  }
805
  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursive helper for marking backedges via DFS.
  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                     DenseSet<const ContextNode *> &CurrentStack);

  /// Recursive helper for merging clones.
  void
  mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
              DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Main worker for merging callee clones for a given node.
  void mergeNodeCalleeClones(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Helper to find other callers of the given set of callee edges that can
  /// share the same callee merge node.
  void findOtherCallersToShareMerge(
      ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
      DenseSet<ContextNode *> &OtherCallersToShareMerge);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);
876
  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Saves information for the contexts identified as important (the largest
  /// cold contexts up to MemProfTopNImportant).
  struct ImportantContextInfo {
    // The original list of leaf first stack ids corresponding to this context.
    std::vector<uint64_t> StackIds;
    // Max length of stack ids corresponding to a single stack ContextNode for
    // this context (i.e. the max length of a key in StackIdsToNode below).
    unsigned MaxLength = 0;
    // Mapping of slices of the stack ids to the corresponding ContextNode
    // (there can be multiple stack ids due to inlining). Populated when
    // updating stack nodes while matching them to the IR or summary.
    // std::map is used so keys (stack id vectors) are ordered.
    std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
  };

  // Map of important full context ids to information about each.
  DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
908
909 // For each important context id found in Node (if any), records the list of
910 // stack ids that corresponded to the given callsite Node. There can be more
911 // than one in the case of inlining.
912 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
913 // We pass in the Node's context ids to avoid the
914 // overhead of computing them as the caller already has
915 // them in some cases.
916 const DenseSet<uint32_t> &NodeContextIds,
917 const DenseSet<uint32_t> &ImportantContextIds) {
918 if (!MemProfTopNImportant) {
919 assert(ImportantContextIds.empty());
920 return;
921 }
922 DenseSet<uint32_t> Ids =
923 set_intersection(S1: NodeContextIds, S2: ImportantContextIds);
924 if (Ids.empty())
925 return;
926 auto Size = StackIds.size();
927 for (auto Id : Ids) {
928 auto &Entry = ImportantContextIdInfo[Id];
929 Entry.StackIdsToNode[StackIds] = Node;
930 // Keep track of the max to simplify later analysis.
931 if (Size > Entry.MaxLength)
932 Entry.MaxLength = Size;
933 }
934 }
935
  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs. Nodes are only ever appended, so
  /// raw pointers into this vector's elements remain stable.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
948};
949
// Convenience alias templates so free functions below can name the nested
// graph types without spelling out the full CallsiteContextGraph scope.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
962
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  // The CRTP base needs access to the private implementations below.
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  // Implementations of the CRTP interface dispatched to by the base class.
  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           DenseMap<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
1003
1004/// Represents a call in the summary index graph, which can either be an
1005/// allocation or an interior callsite node in an allocation's context.
1006/// Holds a pointer to the corresponding data structure in the index.
1007struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1008 IndexCall() : PointerUnion() {}
1009 IndexCall(std::nullptr_t) : IndexCall() {}
1010 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1011 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1012 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1013
1014 IndexCall *operator->() { return this; }
1015
1016 void print(raw_ostream &OS) const {
1017 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1018 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Val&: Base)) {
1019 OS << *AI;
1020 } else {
1021 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Val&: Base);
1022 assert(CI);
1023 OS << *CI;
1024 }
1025 }
1026};
1027} // namespace
1028
namespace llvm {
// Teach the llvm casting machinery (dyn_cast et al) to see through IndexCall
// to its underlying PointerUnion.
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm
1039
1040namespace {
1041/// CRTP derived class for graphs built from summary index (ThinLTO).
1042class IndexCallsiteContextGraph
1043 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1044 IndexCall> {
1045public:
1046 IndexCallsiteContextGraph(
1047 ModuleSummaryIndex &Index,
1048 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1049 isPrevailing);
1050
1051 ~IndexCallsiteContextGraph() {
1052 // Now that we are done with the graph it is safe to add the new
1053 // CallsiteInfo structs to the function summary vectors. The graph nodes
1054 // point into locations within these vectors, so we don't want to add them
1055 // any earlier.
1056 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1057 auto *FS = I.first;
1058 for (auto &Callsite : I.second)
1059 FS->addCallsite(Callsite&: *Callsite.second);
1060 }
1061 }
1062
1063private:
1064 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1065 IndexCall>;
1066
1067 uint64_t getStackId(uint64_t IdOrIndex) const;
1068 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1069 bool calleeMatchesFunc(
1070 IndexCall &Call, const FunctionSummary *Func,
1071 const FunctionSummary *CallerFunc,
1072 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1073 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1074 bool findProfiledCalleeThroughTailCalls(
1075 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1076 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1077 bool &FoundMultipleCalleeChains);
1078 uint64_t getLastStackId(IndexCall &Call);
1079 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1080 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1081 AllocationType getAllocationCallType(const CallInfo &Call) const;
1082 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1083 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1084 IndexCall>::FuncInfo
1085 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1086 DenseMap<CallInfo, CallInfo> &CallMap,
1087 std::vector<CallInfo> &CallsWithMetadataInFunc,
1088 unsigned CloneNo);
1089 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1090 unsigned CloneNo) const;
1091 DenseSet<GlobalValue::GUID> findAliaseeGUIDsPrevailingInDifferentModule();
1092
1093 // Saves mapping from function summaries containing memprof records back to
1094 // its VI, for use in checking and debugging.
1095 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1096
1097 const ModuleSummaryIndex &Index;
1098 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1099 isPrevailing;
1100
1101 // Saves/owns the callsite info structures synthesized for missing tail call
1102 // frames that we discover while building the graph.
1103 // It maps from the summary of the function making the tail call, to a map
1104 // of callee ValueInfo to corresponding synthesized callsite info.
1105 std::unordered_map<FunctionSummary *,
1106 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1107 FunctionCalleesToSynthesizedCallsiteInfos;
1108};
1109} // namespace
1110
// DenseMapInfo specializations so CallInfo (a pair of call and clone number)
// and IndexCall can be used as DenseMap keys; each simply reuses the info for
// its underlying representation.
template <>
struct llvm::DenseMapInfo<CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct llvm::DenseMapInfo<CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct llvm::DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1122
1123namespace {
1124
1125// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1126// type we should actually use on the corresponding allocation.
1127// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1128// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1129// from NotCold.
1130AllocationType allocTypeToUse(uint8_t AllocTypes) {
1131 assert(AllocTypes != (uint8_t)AllocationType::None);
1132 if (AllocTypes ==
1133 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1134 return AllocationType::NotCold;
1135 else
1136 return (AllocationType)AllocTypes;
1137}
1138
1139// Helper to check if the alloc types for all edges recorded in the
1140// InAllocTypes vector match the alloc types for all edges in the Edges
1141// vector.
1142template <typename DerivedCCG, typename FuncTy, typename CallTy>
1143bool allocTypesMatch(
1144 const std::vector<uint8_t> &InAllocTypes,
1145 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1146 &Edges) {
1147 // This should be called only when the InAllocTypes vector was computed for
1148 // this set of Edges. Make sure the sizes are the same.
1149 assert(InAllocTypes.size() == Edges.size());
1150 return std::equal(
1151 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1152 [](const uint8_t &l,
1153 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1154 // Can share if one of the edges is None type - don't
1155 // care about the type along that edge as it doesn't
1156 // exist for those context ids.
1157 if (l == (uint8_t)AllocationType::None ||
1158 r->AllocTypes == (uint8_t)AllocationType::None)
1159 return true;
1160 return allocTypeToUse(AllocTypes: l) == allocTypeToUse(r->AllocTypes);
1161 });
1162}
1163
1164// Helper to check if the alloc types for all edges recorded in the
1165// InAllocTypes vector match the alloc types for callee edges in the given
1166// clone. Because the InAllocTypes were computed from the original node's callee
1167// edges, and other cloning could have happened after this clone was created, we
1168// need to find the matching clone callee edge, which may or may not exist.
1169template <typename DerivedCCG, typename FuncTy, typename CallTy>
1170bool allocTypesMatchClone(
1171 const std::vector<uint8_t> &InAllocTypes,
1172 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1173 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1174 assert(Node);
1175 // InAllocTypes should have been computed for the original node's callee
1176 // edges.
1177 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1178 // First create a map of the clone callee edge callees to the edge alloc type.
1179 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1180 EdgeCalleeMap;
1181 for (const auto &E : Clone->CalleeEdges) {
1182 assert(!EdgeCalleeMap.contains(E->Callee));
1183 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1184 }
1185 // Next, walk the original node's callees, and look for the corresponding
1186 // clone edge to that callee.
1187 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1188 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1189 // Not found is ok, we will simply add an edge if we use this clone.
1190 if (Iter == EdgeCalleeMap.end())
1191 continue;
1192 // Can share if one of the edges is None type - don't
1193 // care about the type along that edge as it doesn't
1194 // exist for those context ids.
1195 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1196 Iter->second == (uint8_t)AllocationType::None)
1197 continue;
1198 if (allocTypeToUse(Iter->second) != allocTypeToUse(AllocTypes: InAllocTypes[I]))
1199 return false;
1200 }
1201 return true;
1202}
1203
1204} // end anonymous namespace
1205
1206template <typename DerivedCCG, typename FuncTy, typename CallTy>
1207typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1208CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1209 const CallInfo &C) {
1210 ContextNode *Node = getNodeForAlloc(C);
1211 if (Node)
1212 return Node;
1213
1214 return NonAllocationCallToContextNodeMap.lookup(C);
1215}
1216
1217template <typename DerivedCCG, typename FuncTy, typename CallTy>
1218typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1219CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1220 const CallInfo &C) {
1221 return AllocationCallToContextNodeMap.lookup(C);
1222}
1223
1224template <typename DerivedCCG, typename FuncTy, typename CallTy>
1225typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1226CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1227 uint64_t StackId) {
1228 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1229 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1230 return StackEntryNode->second;
1231 return nullptr;
1232}
1233
1234template <typename DerivedCCG, typename FuncTy, typename CallTy>
1235void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1236 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1237 unsigned int ContextId) {
1238 for (auto &Edge : CallerEdges) {
1239 if (Edge->Caller == Caller) {
1240 Edge->AllocTypes |= (uint8_t)AllocType;
1241 Edge->getContextIds().insert(ContextId);
1242 return;
1243 }
1244 }
1245 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1246 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1247 CallerEdges.push_back(Edge);
1248 Caller->CalleeEdges.push_back(Edge);
1249}
1250
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  // If an iterator is supplied it must currently reference Edge, and
  // CalleeIter tells us which endpoint's list it iterates.
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  // Record edge counts so we can assert below that exactly one edge was
  // removed from each endpoint.
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    // No iterator: erase from both endpoints directly.
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    // Iterating the caller's callee list: erase via the iterator there so the
    // caller of this function can continue iterating safely.
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    // Iterating the callee's caller list: mirror image of the above.
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
1285
1286template <typename DerivedCCG, typename FuncTy, typename CallTy>
1287void CallsiteContextGraph<
1288 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1289 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1290 auto Edge = *EI;
1291 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1292 assert(Edge->ContextIds.empty());
1293 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);
1294 } else
1295 ++EI;
1296 }
1297}
1298
1299template <typename DerivedCCG, typename FuncTy, typename CallTy>
1300void CallsiteContextGraph<
1301 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1302 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1303 auto Edge = *EI;
1304 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1305 assert(Edge->ContextIds.empty());
1306 Edge->Caller->eraseCalleeEdge(Edge.get());
1307 EI = Node->CallerEdges.erase(EI);
1308 } else
1309 ++EI;
1310 }
1311}
1312
1313template <typename DerivedCCG, typename FuncTy, typename CallTy>
1314typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1315CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1316 findEdgeFromCallee(const ContextNode *Callee) {
1317 for (const auto &Edge : CalleeEdges)
1318 if (Edge->Callee == Callee)
1319 return Edge.get();
1320 return nullptr;
1321}
1322
1323template <typename DerivedCCG, typename FuncTy, typename CallTy>
1324typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1325CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1326 findEdgeFromCaller(const ContextNode *Caller) {
1327 for (const auto &Edge : CallerEdges)
1328 if (Edge->Caller == Caller)
1329 return Edge.get();
1330 return nullptr;
1331}
1332
1333template <typename DerivedCCG, typename FuncTy, typename CallTy>
1334void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1335 eraseCalleeEdge(const ContextEdge *Edge) {
1336 auto EI = llvm::find_if(
1337 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1338 return CalleeEdge.get() == Edge;
1339 });
1340 assert(EI != CalleeEdges.end());
1341 CalleeEdges.erase(EI);
1342}
1343
1344template <typename DerivedCCG, typename FuncTy, typename CallTy>
1345void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1346 eraseCallerEdge(const ContextEdge *Edge) {
1347 auto EI = llvm::find_if(
1348 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1349 return CallerEdge.get() == Edge;
1350 });
1351 assert(EI != CallerEdges.end());
1352 CallerEdges.erase(EI);
1353}
1354
1355template <typename DerivedCCG, typename FuncTy, typename CallTy>
1356uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1357 DenseSet<uint32_t> &ContextIds) const {
1358 uint8_t BothTypes =
1359 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1360 uint8_t AllocType = (uint8_t)AllocationType::None;
1361 for (auto Id : ContextIds) {
1362 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1363 // Bail early if alloc type reached both, no further refinement.
1364 if (AllocType == BothTypes)
1365 return AllocType;
1366 }
1367 return AllocType;
1368}
1369
1370template <typename DerivedCCG, typename FuncTy, typename CallTy>
1371uint8_t
1372CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1373 const DenseSet<uint32_t> &Node1Ids,
1374 const DenseSet<uint32_t> &Node2Ids) const {
1375 uint8_t BothTypes =
1376 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1377 uint8_t AllocType = (uint8_t)AllocationType::None;
1378 for (auto Id : Node1Ids) {
1379 if (!Node2Ids.count(V: Id))
1380 continue;
1381 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1382 // Bail early if alloc type reached both, no further refinement.
1383 if (AllocType == BothTypes)
1384 return AllocType;
1385 }
1386 return AllocType;
1387}
1388
1389template <typename DerivedCCG, typename FuncTy, typename CallTy>
1390uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1391 const DenseSet<uint32_t> &Node1Ids,
1392 const DenseSet<uint32_t> &Node2Ids) const {
1393 if (Node1Ids.size() < Node2Ids.size())
1394 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1395 else
1396 return intersectAllocTypesImpl(Node1Ids: Node2Ids, Node2Ids: Node1Ids);
1397}
1398
1399template <typename DerivedCCG, typename FuncTy, typename CallTy>
1400typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1401CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1402 CallInfo Call, const FuncTy *F) {
1403 assert(!getNodeForAlloc(Call));
1404 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, C: Call);
1405 AllocationCallToContextNodeMap[Call] = AllocNode;
1406 // Use LastContextId as a uniq id for MIB allocation nodes.
1407 AllocNode->OrigStackOrAllocId = LastContextId;
1408 // Alloc type should be updated as we add in the MIBs. We should assert
1409 // afterwards that it is not still None.
1410 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1411
1412 return AllocNode;
1413}
1414
1415static std::string getAllocTypeString(uint8_t AllocTypes) {
1416 if (!AllocTypes)
1417 return "None";
1418 std::string Str;
1419 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1420 Str += "NotCold";
1421 if (AllocTypes & (uint8_t)AllocationType::Cold)
1422 Str += "Cold";
1423 return Str;
1424}
1425
// Add a single MIB (profiled allocation context) to the graph under
// AllocNode: assigns the next context id, records its allocation type and
// size info, and creates/updates a stack ContextNode plus caller edge for
// each frame of StackContext not shared with the allocation call's own
// inlined callsite context (CallsiteContext). TotalSizeToContextIdTopNCold
// tracks the largest cold contexts by total size, used to mark "important"
// contexts for later fixup.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
    ArrayRef<ContextTotalSize> ContextSizeInfo,
    std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
  // Treating the hot alloc type as NotCold before the disambiguation for "hot"
  // is done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  // Each MIB gets a fresh context id.
  ContextIdToAllocationType[++LastContextId] = AllocType;

  bool IsImportant = false;
  if (!ContextSizeInfo.empty()) {
    auto &Entry = ContextIdToContextSizeInfos[LastContextId];
    // If this is a cold allocation, and we are collecting non-zero largest
    // contexts, see if this is a candidate.
    if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
      uint64_t TotalCold = 0;
      for (auto &CSI : ContextSizeInfo)
        TotalCold += CSI.TotalSize;
      // Record this context if either we haven't found the first top-n largest
      // yet, or if it is larger than the smallest already recorded.
      if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
          // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
          // sorted in ascending size of its key which is the size.
          TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
        if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
          // Remove old one and its associated entries.
          auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
          TotalSizeToContextIdTopNCold.erase(
              position: TotalSizeToContextIdTopNCold.begin());
          assert(ImportantContextIdInfo.count(IdToRemove));
          ImportantContextIdInfo.erase(IdToRemove);
        }
        // NOTE(review): contexts with identical TotalCold share one map key,
        // so a later same-size context overwrites the earlier id here without
        // erasing its ImportantContextIdInfo entry — confirm that's intended.
        TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
        IsImportant = true;
      }
    }
    Entry.insert(position: Entry.begin(), first: ContextSizeInfo.begin(), last: ContextSizeInfo.end());
  }

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(IdOrIndex: *ContextIter);
    // For important contexts, remember the ordered stack ids so edges can be
    // repaired later in fixupImportantContexts.
    if (IsImportant)
      ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      StackNode = createNewNode(/*IsAllocation=*/false);
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    // Marking a node recursive will prevent its cloning completely, even for
    // non-recursive contexts flowing through it.
    if (!AllowRecursiveCallsites) {
      auto Ins = StackIdSet.insert(StackId);
      if (!Ins.second)
        StackNode->Recursive = true;
    }
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}
1505
1506template <typename DerivedCCG, typename FuncTy, typename CallTy>
1507DenseSet<uint32_t>
1508CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1509 const DenseSet<uint32_t> &StackSequenceContextIds,
1510 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1511 DenseSet<uint32_t> NewContextIds;
1512 for (auto OldId : StackSequenceContextIds) {
1513 NewContextIds.insert(V: ++LastContextId);
1514 OldToNewContextIds[OldId].insert(V: LastContextId);
1515 assert(ContextIdToAllocationType.count(OldId));
1516 // The new context has the same allocation type and size info as original.
1517 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1518 auto CSI = ContextIdToContextSizeInfos.find(Val: OldId);
1519 if (CSI != ContextIdToContextSizeInfos.end())
1520 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1521 if (DotAllocContextIds.contains(V: OldId))
1522 DotAllocContextIds.insert(V: LastContextId);
1523 }
1524 return NewContextIds;
1525}
1526
// Propagate newly duplicated context ids (per OldToNewContextIds) up the
// graph: wherever a caller edge carries an old id, the corresponding new
// ids are added to that edge, walking transitively upwards from each
// allocation node. Each edge is visited at most once across all walks.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Val: Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert_range(R: NewId->second);
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
  auto UpdateCallers = [&](ContextNode *Node,
                           DenseSet<const ContextEdge *> &Visited,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert_range(NewIdsToAdd);
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  // Start an upward walk from every allocation (leaf) node. The Visited set
  // is shared so edges reachable from multiple allocations are processed once.
  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}
1566
// Attach NewNode to the callee (if TowardsCallee) or caller edges of
// OrigNode that carry any of RemainingContextIds, moving those ids off
// OrigNode's edges onto newly created edges incident to NewNode. OrigNode
// edges left with no context ids afterwards are removed from the graph.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
    // This must be passed by value to make a copy since it will be adjusted
    // as ids are moved.
    DenseSet<uint32_t> RemainingContextIds) {
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  DenseSet<uint32_t> RecursiveContextIds;
  DenseSet<uint32_t> AllCallerContextIds;
  if (AllowRecursiveCallsites) {
    // Identify which context ids are recursive which is needed to properly
    // update the RemainingContextIds set. The relevant recursive context ids
    // are those that are in multiple edges.
    for (auto &CE : OrigEdges) {
      AllCallerContextIds.reserve(Size: CE->getContextIds().size());
      for (auto Id : CE->getContextIds())
        if (!AllCallerContextIds.insert(Id).second)
          RecursiveContextIds.insert(Id);
    }
  }
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    DenseSet<uint32_t> NewEdgeContextIds;
    DenseSet<uint32_t> NotFoundContextIds;
    // Remove any matching context ids from Edge, return set that were found and
    // removed, these are the new edge's context ids. Also update the remaining
    // (not found ids).
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    // Update the remaining context ids set for the later edges. This is a
    // compile time optimization.
    if (RecursiveContextIds.empty()) {
      // No recursive ids, so all of the previously remaining context ids that
      // were not seen on this edge are the new remaining set.
      RemainingContextIds.swap(RHS&: NotFoundContextIds);
    } else {
      // Keep the recursive ids in the remaining set as we expect to see those
      // on another edge. We can remove the non-recursive remaining ids that
      // were seen on this edge, however. We already have the set of remaining
      // ids that were on this edge (in NewEdgeContextIds). Figure out which are
      // non-recursive and only remove those. Note that despite the higher
      // overhead of updating the remaining context ids set when recursion
      // handling is enabled, it was found to be at worst performance neutral
      // and in one case a clear win.
      DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
          set_difference(S1: NewEdgeContextIds, S2: RecursiveContextIds);
      set_subtract(S1&: RemainingContextIds, S2: NonRecursiveRemainingCurEdgeIds);
    }
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    // Create the new edge between NewNode and this edge's far endpoint,
    // carrying exactly the moved ids, and register it on both endpoints.
    if (TowardsCallee) {
      uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, CalleeIter: TowardsCallee);
      continue;
    }
    ++EI;
  }
}
1643
1644template <typename DerivedCCG, typename FuncTy, typename CallTy>
1645static void checkEdge(
1646 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1647 // Confirm that alloc type is not None and that we have at least one context
1648 // id.
1649 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1650 assert(!Edge->ContextIds.empty());
1651}
1652
// Validate the invariants of a single (non-removed) node: the node's context
// ids versus the unions of its caller-edge and callee-edge context ids
// (modulo the documented recursion exceptions), each edge via checkEdge when
// CheckEdges is set, and that there is at most one callee edge per unique
// callee node.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    // Seed the union with the first edge's ids, then fold in the rest.
    DenseSet<uint32_t> CallerEdgeContextIds(
        Node->CallerEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer. If we are allowing recursive callsites and
    // contexts this will be violated for incompletely cloned recursive cycles,
    // so skip the checking in that case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, NodeContextIds));
  }
  if (Node->CalleeEdges.size()) {
    DenseSet<uint32_t> CalleeEdgeContextIds(
        Node->CalleeEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->getContextIds());
    }
    // If we are allowing recursive callsites and contexts this will be violated
    // for incompletely cloned recursive cycles, so skip the checking in that
    // case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CalleeEdgeContextIds);
  }
  // FIXME: Since this checking is only invoked under an option, we should
  // change the error checking from using assert to something that will trigger
  // an error on a release build.
#ifndef NDEBUG
  // Make sure we don't end up with duplicate edges between the same caller and
  // callee.
  DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
  for (const auto &E : Node->CalleeEdges)
    NodeSet.insert(E->Callee);
  assert(NodeSet.size() == Node->CalleeEdges.size());
#endif
}
1706
// Post-order DFS from Node over caller edges that assigns the calls recorded
// in StackIdToMatchingCalls to context nodes. When a call covers a sequence
// of stack ids (due to inlining), a new node is created for it and the
// matched context ids are moved onto that node; calls with an identical
// matching call share the earlier-created node via MatchingCalls.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    assignStackNodesPostOrder(ContextNode *Node,
                              DenseSet<const ContextNode *> &Visited,
                              DenseMap<uint64_t, std::vector<CallContextInfo>>
                                  &StackIdToMatchingCalls,
                              DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
                              const DenseSet<uint32_t> &ImportantContextIds) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;
  // Post order traversal. Iterate over a copy since we may add nodes and
  // therefore new callers during the recursive call, invalidating any
  // iterator over the original edge vector. We don't need to process these
  // new nodes as they were already processed on creation.
  auto CallerEdges = Node->CallerEdges;
  for (auto &Edge : CallerEdges) {
    // Skip any that have been removed during the recursion.
    if (Edge->isRemoved()) {
      assert(!is_contained(Node->CallerEdges, Edge));
      continue;
    }
    assignStackNodesPostOrder(Node: Edge->Caller, Visited, StackIdToMatchingCalls,
                              CallToMatchingCall, ImportantContextIds);
  }

  // If this node's stack id is in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.

  // Ignore this node if it is for an allocation or we didn't record any
  // stack id lists ending at it.
  if (Node->IsAllocation ||
      !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
    return;

  auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
  // Handle the simple case first. A single call with a single stack id.
  // In this case there is no need to create any new context nodes, simply
  // assign the context node for stack id to this Call.
  if (Calls.size() == 1) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
    if (Ids.size() == 1) {
      assert(SavedContextIds.empty());
      // It should be this Node
      assert(Node == getNodeForStackId(Ids[0]));
      if (Node->Recursive)
        return;
      Node->setCall(Call);
      NonAllocationCallToContextNodeMap[Call] = Node;
      NodeToCallingFunc[Node] = Func;
      recordStackNode(StackIds&: Ids, Node, NodeContextIds: Node->getContextIds(), ImportantContextIds);
      return;
    }
  }

#ifndef NDEBUG
  // Find the node for the last stack id, which should be the same
  // across all calls recorded for this id, and is this node's id.
  uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
  // We should only have kept stack ids that had nodes.
  assert(LastNode);
  assert(LastNode == Node);
#else
  ContextNode *LastNode = Node;
#endif

  // Compute the last node's context ids once, as it is shared by all calls in
  // this entry.
  DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();

  [[maybe_unused]] bool PrevIterCreatedNode = false;
  bool CreatedNode = false;
  for (unsigned I = 0; I < Calls.size();
       I++, PrevIterCreatedNode = CreatedNode) {
    CreatedNode = false;
    auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
    // Skip any for which we didn't assign any ids, these don't get a node in
    // the graph.
    if (SavedContextIds.empty()) {
      // If this call has a matching call (located in the same function and
      // having the same stack ids), simply add it to the context node created
      // for its matching call earlier. These can be treated the same through
      // cloning and get updated at the same time.
      if (!CallToMatchingCall.contains(Call))
        continue;
      auto MatchingCall = CallToMatchingCall[Call];
      if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
        // This should only happen if we had a prior iteration, and it didn't
        // create a node because of the below recomputation of context ids
        // finding none remaining and continuing early.
        assert(I > 0 && !PrevIterCreatedNode);
        continue;
      }
      NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
          Call);
      continue;
    }

    assert(LastId == Ids.back());

    // Recompute the context ids for this stack id sequence (the
    // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and
    // removed them already during the post order traversal.
    set_intersect(SavedContextIds, LastNodeContextIds);
    ContextNode *PrevNode = LastNode;
    bool Skip = false;
    // Iterate backwards through the stack Ids, starting after the last Id
    // in the list, which was handled once outside for all Calls.
    for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
      auto Id = *IdIter;
      ContextNode *CurNode = getNodeForStackId(StackId: Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);

      auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
      if (!Edge) {
        Skip = true;
        break;
      }
      PrevNode = CurNode;

      // Update the context ids, which is the intersection of the ids along
      // all edges in the sequence.
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for clone, skip this call.
      if (SavedContextIds.empty()) {
        Skip = true;
        break;
      }
    }
    if (Skip)
      continue;

    // Create new context node.
    ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: Call);
    NonAllocationCallToContextNodeMap[Call] = NewNode;
    CreatedNode = true;
    NewNode->AllocTypes = computeAllocType(ContextIds&: SavedContextIds);

    ContextNode *FirstNode = getNodeForStackId(StackId: Ids[0]);
    assert(FirstNode);

    // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callee's to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, OrigNode: FirstNode, /*TowardsCallee=*/true, RemainingContextIds: SavedContextIds);

    // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's caller's to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, OrigNode: LastNode, /*TowardsCallee=*/false, RemainingContextIds: SavedContextIds);

    // Now we need to remove context ids from edges/nodes between First and
    // Last Node.
    PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(StackId: Id);
      // We should only have kept stack ids that had nodes.
      assert(CurNode);

      // Remove the context ids moved to NewNode from CurNode, and the
      // edge from the prior node.
      if (PrevNode) {
        auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
        // If the sequence contained recursion, we might have already removed
        // some edges during the connectNewNode calls above.
        if (!PrevEdge) {
          PrevNode = CurNode;
          continue;
        }
        set_subtract(PrevEdge->getContextIds(), SavedContextIds);
        if (PrevEdge->getContextIds().empty())
          removeEdgeFromGraph(Edge: PrevEdge);
      }
      // Since we update the edges from leaf to tail, only look at the callee
      // edges. This isn't an alloc node, so if there are no callee edges, the
      // alloc type is None.
      CurNode->AllocTypes = CurNode->CalleeEdges.empty()
                                ? (uint8_t)AllocationType::None
                                : CurNode->computeAllocType();
      PrevNode = CurNode;
    }

    recordStackNode(StackIds&: Ids, Node: NewNode, NodeContextIds: SavedContextIds, ImportantContextIds);

    if (VerifyNodes) {
      checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
      for (auto Id : Ids) {
        ContextNode *CurNode = getNodeForStackId(StackId: Id);
        // We should only have kept stack ids that had nodes.
        assert(CurNode);
        checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
      }
    }
  }
}
1911
// Repair the edges along each recorded "important" (top-N cold) context:
// walk its saved stack id sequence from the leaf upwards, greedily match
// slices of the sequence to recorded context nodes, and add the context id
// to (or create) the edge between adjacent matched nodes while removing it
// from stale caller edges. Needed because inlining-related node merging in
// the presence of recursion can leave this context's edges incorrect.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
                          CallTy>::fixupImportantContexts() {
  if (ImportantContextIdInfo.empty())
    return;

  // Update statistics as we are done building this map at this point.
  NumImportantContextIds = ImportantContextIdInfo.size();

  if (!MemProfFixupImportant)
    return;

  if (ExportToDot)
    exportToDot(Label: "beforestackfixup");

  // For each context we identified as important, walk through the saved context
  // stack ids in order from leaf upwards, and make sure all edges are correct.
  // These can be difficult to get right when updating the graph while mapping
  // nodes onto summary or IR, especially when there is recursion. In
  // particular, when we have created new nodes to reflect inlining, it is
  // sometimes impossible to know exactly how to update the edges in the face of
  // recursion, as we have lost the original ordering of the stack ids in the
  // contexts.
  // TODO: Consider only doing this if we detect the context has recursive
  // cycles.
  //
  // I.e. assume we have a context with stack ids like: {A B A C A D E}
  // and let's say A was inlined into B, C, and D. The original graph will have
  // multiple recursive cycles through A. When we match the original context
  // nodes onto the IR or summary, we will merge {A B} into one context node,
  // {A C} onto another, and {A D} onto another. Looking at the stack sequence
  // above, we should end up with a non-cyclic set of edges like:
  // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
  // original ordering, we won't get the edges correct initially (it's
  // impossible without the original ordering). Here we do the fixup (add and
  // removing edges where necessary) for this context. In the
  // ImportantContextInfo struct in this case we should have a MaxLength = 2,
  // and map entries for {A B}, {A C}, {A D}, and {E}.
  for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
    if (Info.StackIdsToNode.empty())
      continue;
    bool Changed = false;
    ContextNode *PrevNode = nullptr;
    ContextNode *CurNode = nullptr;
    DenseSet<const ContextEdge *> VisitedEdges;
    ArrayRef<uint64_t> AllStackIds(Info.StackIds);
    // Try to identify what callsite ContextNode maps to which slice of the
    // context's ordered stack ids.
    for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
      // We will do this greedily, trying up to MaxLength stack ids in a row, to
      // see if we recorded a context node for that sequence.
      auto Len = Info.MaxLength;
      auto LenToEnd = AllStackIds.size() - I;
      if (Len > LenToEnd)
        Len = LenToEnd;
      CurNode = nullptr;
      // Try to find a recorded context node starting with the longest length
      // recorded, and on down until we check for just a single stack node.
      for (; Len > 0; Len--) {
        // Get the slice of the original stack id sequence to check.
        auto CheckStackIds = AllStackIds.slice(I, Len);
        auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
        if (EntryIt == Info.StackIdsToNode.end())
          continue;
        CurNode = EntryIt->second;
        // Skip forward so we don't try to look for the ones we just matched.
        // We increment by Len - 1, because the outer for loop will increment I.
        I += Len - 1;
        break;
      }
      // Give up if we couldn't find a node. Since we need to clone from the
      // leaf allocation upwards, no sense in doing anymore fixup further up
      // the context if we couldn't match part of the original stack context
      // onto a callsite node.
      if (!CurNode)
        break;
      // No edges to fix up until we have a pair of nodes that should be
      // adjacent in the graph.
      if (!PrevNode)
        continue;
      // See if we already have a call edge from CurNode to PrevNode.
      auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
      if (CurEdge) {
        // We already have an edge. Make sure it contains this context id.
        if (CurEdge->getContextIds().insert(CurContextId).second) {
          NumFixupEdgeIdsInserted++;
          Changed = true;
        }
      } else {
        // No edge exists - add one.
        NumFixupEdgesAdded++;
        DenseSet<uint32_t> ContextIds({CurContextId});
        auto AllocType = computeAllocType(ContextIds);
        auto NewEdge = std::make_shared<ContextEdge>(
            PrevNode, CurNode, AllocType, std::move(ContextIds));
        PrevNode->CallerEdges.push_back(NewEdge);
        CurNode->CalleeEdges.push_back(NewEdge);
        // Save the new edge for the below handling.
        CurEdge = NewEdge.get();
        Changed = true;
      }
      VisitedEdges.insert(CurEdge);
      // Now remove this context id from any other caller edges calling
      // PrevNode.
      for (auto &Edge : PrevNode->CallerEdges) {
        // Skip the edge updating/created above and edges we have already
        // visited (due to recursion).
        if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
          Edge->getContextIds().erase(CurContextId);
      }
    }
    if (Changed)
      NumFixedContexts++;
  }
}
2027
2028template <typename DerivedCCG, typename FuncTy, typename CallTy>
2029void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2030 // Map of stack id to all calls with that as the last (outermost caller)
2031 // callsite id that has a context node (some might not due to pruning
2032 // performed during matching of the allocation profile contexts).
2033 // The CallContextInfo contains the Call and a list of its stack ids with
2034 // ContextNodes, the function containing Call, and the set of context ids
2035 // the analysis will eventually identify for use in any new node created
2036 // for that callsite.
2037 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2038 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2039 for (auto &Call : CallsWithMetadata) {
2040 // Ignore allocations, already handled.
2041 if (AllocationCallToContextNodeMap.count(Call))
2042 continue;
2043 auto StackIdsWithContextNodes =
2044 getStackIdsWithContextNodesForCall(Call: Call.call());
2045 // If there were no nodes created for MIBs on allocs (maybe this was in
2046 // the unambiguous part of the MIB stack that was pruned), ignore.
2047 if (StackIdsWithContextNodes.empty())
2048 continue;
2049 // Otherwise, record this Call along with the list of ids for the last
2050 // (outermost caller) stack id with a node.
2051 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2052 {Call.call(), StackIdsWithContextNodes, Func, {}});
2053 }
2054 }
2055
2056 // First make a pass through all stack ids that correspond to a call,
2057 // as identified in the above loop. Compute the context ids corresponding to
2058 // each of these calls when they correspond to multiple stack ids due to
2059 // due to inlining. Perform any duplication of context ids required when
2060 // there is more than one call with the same stack ids. Their (possibly newly
2061 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2062 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2063 // Save a map from each call to any that are found to match it. I.e. located
2064 // in the same function and have the same (possibly pruned) stack ids. We use
2065 // this to avoid creating extra graph nodes as they can be treated the same.
2066 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2067 for (auto &It : StackIdToMatchingCalls) {
2068 auto &Calls = It.getSecond();
2069 // Skip single calls with a single stack id. These don't need a new node.
2070 if (Calls.size() == 1) {
2071 auto &Ids = Calls[0].StackIds;
2072 if (Ids.size() == 1)
2073 continue;
2074 }
2075 // In order to do the best and maximal matching of inlined calls to context
2076 // node sequences we will sort the vectors of stack ids in descending order
2077 // of length, and within each length, lexicographically by stack id. The
2078 // latter is so that we can specially handle calls that have identical stack
2079 // id sequences (either due to cloning or artificially because of the MIB
2080 // context pruning). Those with the same Ids are then sorted by function to
2081 // facilitate efficiently mapping them to the same context node.
2082 // Because the functions are pointers, to ensure a stable sort first assign
2083 // each function pointer to its first index in the Calls array, and then use
2084 // that to sort by.
2085 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2086 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2087 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2088 llvm::stable_sort(
2089 Calls,
2090 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2091 return A.StackIds.size() > B.StackIds.size() ||
2092 (A.StackIds.size() == B.StackIds.size() &&
2093 (A.StackIds < B.StackIds ||
2094 (A.StackIds == B.StackIds &&
2095 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2096 });
2097
2098 // Find the node for the last stack id, which should be the same
2099 // across all calls recorded for this id, and is the id for this
2100 // entry in the StackIdToMatchingCalls map.
2101 uint64_t LastId = It.getFirst();
2102 ContextNode *LastNode = getNodeForStackId(StackId: LastId);
2103 // We should only have kept stack ids that had nodes.
2104 assert(LastNode);
2105
2106 if (LastNode->Recursive)
2107 continue;
2108
2109 // Initialize the context ids with the last node's. We will subsequently
2110 // refine the context ids by computing the intersection along all edges.
2111 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2112 assert(!LastNodeContextIds.empty());
2113
2114#ifndef NDEBUG
2115 // Save the set of functions seen for a particular set of the same stack
2116 // ids. This is used to ensure that they have been correctly sorted to be
2117 // adjacent in the Calls list, since we rely on that to efficiently place
2118 // all such matching calls onto the same context node.
2119 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2120#endif
2121
2122 for (unsigned I = 0; I < Calls.size(); I++) {
2123 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2124 assert(SavedContextIds.empty());
2125 assert(LastId == Ids.back());
2126
2127#ifndef NDEBUG
2128 // If this call has a different set of ids than the last one, clear the
2129 // set used to ensure they are sorted properly.
2130 if (I > 0 && Ids != Calls[I - 1].StackIds)
2131 MatchingIdsFuncSet.clear();
2132#endif
2133
2134 // First compute the context ids for this stack id sequence (the
2135 // intersection of the context ids of the corresponding nodes).
2136 // Start with the remaining saved ids for the last node.
2137 assert(!LastNodeContextIds.empty());
2138 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2139
2140 ContextNode *PrevNode = LastNode;
2141 ContextNode *CurNode = LastNode;
2142 bool Skip = false;
2143
2144 // Iterate backwards through the stack Ids, starting after the last Id
2145 // in the list, which was handled once outside for all Calls.
2146 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2147 auto Id = *IdIter;
2148 CurNode = getNodeForStackId(StackId: Id);
2149 // We should only have kept stack ids that had nodes.
2150 assert(CurNode);
2151
2152 if (CurNode->Recursive) {
2153 Skip = true;
2154 break;
2155 }
2156
2157 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2158 // If there is no edge then the nodes belong to different MIB contexts,
2159 // and we should skip this inlined context sequence. For example, this
2160 // particular inlined context may include stack ids A->B, and we may
2161 // indeed have nodes for both A and B, but it is possible that they were
2162 // never profiled in sequence in a single MIB for any allocation (i.e.
2163 // we might have profiled an allocation that involves the callsite A,
2164 // but through a different one of its callee callsites, and we might
2165 // have profiled an allocation that involves callsite B, but reached
2166 // from a different caller callsite).
2167 if (!Edge) {
2168 Skip = true;
2169 break;
2170 }
2171 PrevNode = CurNode;
2172
2173 // Update the context ids, which is the intersection of the ids along
2174 // all edges in the sequence.
2175 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2176
2177 // If we now have no context ids for clone, skip this call.
2178 if (StackSequenceContextIds.empty()) {
2179 Skip = true;
2180 break;
2181 }
2182 }
2183 if (Skip)
2184 continue;
2185
2186 // If some of this call's stack ids did not have corresponding nodes (due
2187 // to pruning), don't include any context ids for contexts that extend
2188 // beyond these nodes. Otherwise we would be matching part of unrelated /
2189 // not fully matching stack contexts. To do this, subtract any context ids
2190 // found in caller nodes of the last node found above.
2191 if (Ids.back() != getLastStackId(Call)) {
2192 for (const auto &PE : LastNode->CallerEdges) {
2193 set_subtract(StackSequenceContextIds, PE->getContextIds());
2194 if (StackSequenceContextIds.empty())
2195 break;
2196 }
2197 // If we now have no context ids for clone, skip this call.
2198 if (StackSequenceContextIds.empty())
2199 continue;
2200 }
2201
2202#ifndef NDEBUG
2203 // If the prior call had the same stack ids this set would not be empty.
2204 // Check if we already have a call that "matches" because it is located
2205 // in the same function. If the Calls list was sorted properly we should
2206 // not encounter this situation as all such entries should be adjacent
2207 // and processed in bulk further below.
2208 assert(!MatchingIdsFuncSet.contains(Func));
2209
2210 MatchingIdsFuncSet.insert(Func);
2211#endif
2212
2213 // Check if the next set of stack ids is the same (since the Calls vector
2214 // of tuples is sorted by the stack ids we can just look at the next one).
2215 // If so, save them in the CallToMatchingCall map so that they get
2216 // assigned to the same context node, and skip them.
2217 bool DuplicateContextIds = false;
2218 for (unsigned J = I + 1; J < Calls.size(); J++) {
2219 auto &CallCtxInfo = Calls[J];
2220 auto &NextIds = CallCtxInfo.StackIds;
2221 if (NextIds != Ids)
2222 break;
2223 auto *NextFunc = CallCtxInfo.Func;
2224 if (NextFunc != Func) {
2225 // We have another Call with the same ids but that cannot share this
2226 // node, must duplicate ids for it.
2227 DuplicateContextIds = true;
2228 break;
2229 }
2230 auto &NextCall = CallCtxInfo.Call;
2231 CallToMatchingCall[NextCall] = Call;
2232 // Update I so that it gets incremented correctly to skip this call.
2233 I = J;
2234 }
2235
2236 // If we don't have duplicate context ids, then we can assign all the
2237 // context ids computed for the original node sequence to this call.
2238 // If there are duplicate calls with the same stack ids then we synthesize
2239 // new context ids that are duplicates of the originals. These are
2240 // assigned to SavedContextIds, which is a reference into the map entry
2241 // for this call, allowing us to access these ids later on.
2242 OldToNewContextIds.reserve(NumEntries: OldToNewContextIds.size() +
2243 StackSequenceContextIds.size());
2244 SavedContextIds =
2245 DuplicateContextIds
2246 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2247 : StackSequenceContextIds;
2248 assert(!SavedContextIds.empty());
2249
2250 if (!DuplicateContextIds) {
2251 // Update saved last node's context ids to remove those that are
2252 // assigned to other calls, so that it is ready for the next call at
2253 // this stack id.
2254 set_subtract(S1&: LastNodeContextIds, S2: StackSequenceContextIds);
2255 if (LastNodeContextIds.empty())
2256 break;
2257 }
2258 }
2259 }
2260
2261 // Propagate the duplicate context ids over the graph.
2262 propagateDuplicateContextIds(OldToNewContextIds);
2263
2264 if (VerifyCCG)
2265 check();
2266
2267 // Now perform a post-order traversal over the graph, starting with the
2268 // allocation nodes, essentially processing nodes from callers to callees.
2269 // For any that contains an id in the map, update the graph to contain new
2270 // nodes representing any inlining at interior callsites. Note we move the
2271 // associated context ids over to the new nodes.
2272 DenseSet<const ContextNode *> Visited;
2273 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2274 ImportantContextIdInfo.keys());
2275 for (auto &Entry : AllocationCallToContextNodeMap)
2276 assignStackNodesPostOrder(Node: Entry.second, Visited, StackIdToMatchingCalls,
2277 CallToMatchingCall, ImportantContextIds);
2278
2279 fixupImportantContexts();
2280
2281 if (VerifyCCG)
2282 check();
2283}
2284
2285uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2286 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2287 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2288 return CallsiteContext.back();
2289}
2290
2291uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2292 assert(isa<CallsiteInfo *>(Call));
2293 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2294 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2295 // Need to convert index into stack id.
2296 return Index.getStackIdAtIndex(Index: CallsiteContext.back());
2297}
2298
// Suffix (followed by the clone number) appended to the names of function
// clones produced by this pass; clone 0 (the original) keeps its name.
static const std::string MemProfCloneSuffix = ".memprof.";
2300
2301static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2302 // We use CloneNo == 0 to refer to the original version, which doesn't get
2303 // renamed with a suffix.
2304 if (!CloneNo)
2305 return Base.str();
2306 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2307}
2308
2309static bool isMemProfClone(const Function &F) {
2310 return F.getName().contains(Other: MemProfCloneSuffix);
2311}
2312
2313// Return the clone number of the given function by extracting it from the
2314// memprof suffix. Assumes the caller has already confirmed it is a memprof
2315// clone.
2316static unsigned getMemProfCloneNum(const Function &F) {
2317 assert(isMemProfClone(F));
2318 auto Pos = F.getName().find_last_of(C: '.');
2319 assert(Pos > 0);
2320 unsigned CloneNo;
2321 bool Err = F.getName().drop_front(N: Pos + 1).getAsInteger(Radix: 10, Result&: CloneNo);
2322 assert(!Err);
2323 (void)Err;
2324 return CloneNo;
2325}
2326
2327std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2328 const Instruction *Call,
2329 unsigned CloneNo) const {
2330 return (Twine(Call->getFunction()->getName()) + " -> " +
2331 cast<CallBase>(Val: Call)->getCalledFunction()->getName())
2332 .str();
2333}
2334
2335std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2336 const IndexCall &Call,
2337 unsigned CloneNo) const {
2338 auto VI = FSToVIMap.find(x: Func);
2339 assert(VI != FSToVIMap.end());
2340 std::string CallerName = getMemProfFuncName(Base: VI->second.name(), CloneNo);
2341 if (isa<AllocInfo *>(Val: Call))
2342 return CallerName + " -> alloc";
2343 else {
2344 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Val: Call);
2345 return CallerName + " -> " +
2346 getMemProfFuncName(Base: Callsite->Callee.name(),
2347 CloneNo: Callsite->Clones[CloneNo]);
2348 }
2349}
2350
2351std::vector<uint64_t>
2352ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2353 Instruction *Call) {
2354 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2355 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2356 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2357 CallsiteContext);
2358}
2359
2360std::vector<uint64_t>
2361IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2362 assert(isa<CallsiteInfo *>(Call));
2363 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2364 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2365 return getStackIdsWithContextNodes<CallsiteInfo,
2366 SmallVector<unsigned>::const_iterator>(
2367 CallsiteContext);
2368}
2369
2370template <typename DerivedCCG, typename FuncTy, typename CallTy>
2371template <class NodeT, class IteratorT>
2372std::vector<uint64_t>
2373CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2374 CallStack<NodeT, IteratorT> &CallsiteContext) {
2375 std::vector<uint64_t> StackIds;
2376 for (auto IdOrIndex : CallsiteContext) {
2377 auto StackId = getStackId(IdOrIndex);
2378 ContextNode *Node = getNodeForStackId(StackId);
2379 if (!Node)
2380 break;
2381 StackIds.push_back(StackId);
2382 }
2383 return StackIds;
2384}
2385
// Build the callsite context graph for regular (non-Thin) LTO directly from
// the module IR. For each call carrying !memprof metadata an allocation node
// is created and stack nodes are added for every MIB context; calls carrying
// only !callsite metadata are collected per function for later matching.
// Consumed metadata is stripped from the IR as the graph is built.
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
    : Mod(M), OREGetter(OREGetter) {
  // Map for keeping track of the largest cold contexts up to the number given
  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
  // must be sorted.
  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
  for (auto &F : M) {
    std::vector<CallInfo> CallsWithMetadata;
    for (auto &BB : F) {
      for (auto &I : BB) {
        // Only call instructions can carry memprof/callsite metadata.
        if (!isa<CallBase>(Val: I))
          continue;
        if (auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof)) {
          CallsWithMetadata.push_back(x: &I);
          auto *AllocNode = addAllocNode(Call: &I, F: &F);
          // Allocations with !memprof metadata always carry !callsite
          // metadata describing any inlined frames at the allocation call.
          auto *CallsiteMD = I.getMetadata(KindID: LLVMContext::MD_callsite);
          assert(CallsiteMD);
          CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
          // Add all of the MIBs and their stack nodes.
          for (auto &MDOp : MemProfMD->operands()) {
            auto *MIBMD = cast<const MDNode>(Val: MDOp);
            std::vector<ContextTotalSize> ContextSizeInfo;
            // Collect the context size information if it exists. The MIB node
            // has the stack node and alloc type as its first two operands;
            // any additional operands are (full stack id, total size) pairs.
            if (MIBMD->getNumOperands() > 2) {
              for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
                MDNode *ContextSizePair =
                    dyn_cast<MDNode>(Val: MIBMD->getOperand(I));
                assert(ContextSizePair->getNumOperands() == 2);
                uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
                                           MD: ContextSizePair->getOperand(I: 0))
                                           ->getZExtValue();
                uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
                                         MD: ContextSizePair->getOperand(I: 1))
                                         ->getZExtValue();
                ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
              }
            }
            MDNode *StackNode = getMIBStackNode(MIB: MIBMD);
            assert(StackNode);
            CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
            addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                AllocNode, StackContext, CallsiteContext,
                AllocType: getMIBAllocType(MIB: MIBMD), ContextSizeInfo,
                TotalSizeToContextIdTopNCold);
          }
          // If exporting the graph to dot and an allocation id of interest was
          // specified, record all the context ids for this allocation node.
          if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
            DotAllocContextIds = AllocNode->getContextIds();
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Memprof and callsite metadata on memory allocations no longer
          // needed.
          I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
          I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
        }
        // For callsite metadata, add to list for this function for later use.
        else if (I.getMetadata(KindID: LLVMContext::MD_callsite)) {
          CallsWithMetadata.push_back(x: &I);
        }
      }
    }
    if (!CallsWithMetadata.empty())
      FuncToCallsWithMetadata[&F] = CallsWithMetadata;
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot(Label: "prestackupdate");

  // Refine the initial graph: match interior stack nodes to the recorded
  // calls, then fix up callsites whose calls target multiple functions, and
  // mark backedges introduced by recursion.
  updateStackNodes();

  if (ExportToDot)
    exportToDot(Label: "poststackupdate");

  handleCallsitesWithMultipleTargets();

  markBackedges();

  // Strip off remaining callsite metadata, no longer needed.
  for (auto &FuncEntry : FuncToCallsWithMetadata)
    for (auto &Call : FuncEntry.second)
      Call.call()->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
}
2475
// Finds the set of GUIDs for weak aliasees that are prevailing in different
// modules than any of their aliases. We need to handle these specially.
// Returns the set of such aliasee GUIDs and bumps the
// AliaseesPrevailingInDiffModuleFromAlias statistic by its size.
DenseSet<GlobalValue::GUID>
IndexCallsiteContextGraph::findAliaseeGUIDsPrevailingInDifferentModule() {
  DenseSet<GlobalValue::GUID> AliaseeGUIDs;
  for (auto &I : Index) {
    auto VI = Index.getValueInfo(R: I);
    for (auto &S : VI.getSummaryList()) {
      // We only care about aliases to functions.
      auto *AS = dyn_cast<AliasSummary>(Val: S.get());
      if (!AS)
        continue;
      auto *AliaseeSummary = &AS->getAliasee();
      auto *AliaseeFS = dyn_cast<FunctionSummary>(Val: AliaseeSummary);
      if (!AliaseeFS)
        continue;
      // Skip this summary if it is not for the prevailing symbol for this GUID.
      // The linker doesn't resolve local linkage values so don't check whether
      // those are prevailing.
      if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
          !isPrevailing(VI.getGUID(), S.get()))
        continue;
      // Prevailing aliasee could be in a different module only if it is weak.
      if (!GlobalValue::isWeakForLinker(Linkage: AliaseeSummary->linkage()))
        continue;
      auto AliaseeGUID = AS->getAliaseeGUID();
      // If the aliasee copy in this module is not prevailing, record it.
      // (The alias in this module is prevailing, per the earlier check, so
      // the prevailing aliasee copy must live in some other module.)
      if (!isPrevailing(AliaseeGUID, AliaseeSummary))
        AliaseeGUIDs.insert(V: AliaseeGUID);
    }
  }
  AliaseesPrevailingInDiffModuleFromAlias += AliaseeGUIDs.size();
  return AliaseeGUIDs;
}
2510
// Build the callsite context graph for ThinLTO from the combined summary
// index. Only prevailing function summaries contribute nodes: allocation
// nodes are created from each summary's alloc records (with stack nodes per
// MIB), and callsite records are collected per function for later matching.
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
    : Index(Index), isPrevailing(isPrevailing) {
  // Since we use the aliasee summary info to create the necessary clones for
  // its aliases, conservatively skip recording the aliasee function's callsites
  // in the CCG for any that are prevailing in a different module than one of
  // its aliases. We could record the necessary information to do this in the
  // summary, but this case should not be common.
  DenseSet<GlobalValue::GUID> GUIDsToSkip =
      findAliaseeGUIDsPrevailingInDifferentModule();
  // Map for keeping track of the largest cold contexts up to the number given
  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
  // must be sorted.
  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
  for (auto &I : Index) {
    auto VI = Index.getValueInfo(R: I);
    if (GUIDsToSkip.contains(V: VI.getGUID()))
      continue;
    for (auto &S : VI.getSummaryList()) {
      // We should only add the prevailing nodes. Otherwise we may try to clone
      // in a weak copy that won't be linked (and may be different than the
      // prevailing version).
      // We only keep the memprof summary on the prevailing copy now when
      // building the combined index, as a space optimization, however don't
      // rely on this optimization. The linker doesn't resolve local linkage
      // values so don't check whether those are prevailing.
      if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
          !isPrevailing(VI.getGUID(), S.get()))
        continue;
      auto *FS = dyn_cast<FunctionSummary>(Val: S.get());
      if (!FS)
        continue;
      std::vector<CallInfo> CallsWithMetadata;
      if (!FS->allocs().empty()) {
        for (auto &AN : FS->mutableAllocs()) {
          // This can happen because of recursion elimination handling that
          // currently exists in ModuleSummaryAnalysis. Skip these for now.
          // We still added them to the summary because we need to be able to
          // correlate properly in applyImport in the backends.
          if (AN.MIBs.empty())
            continue;
          IndexCall AllocCall(&AN);
          CallsWithMetadata.push_back(x: AllocCall);
          auto *AllocNode = addAllocNode(Call: AllocCall, F: FS);
          // Pass an empty CallStack to the CallsiteContext (second)
          // parameter, since for ThinLTO we already collapsed out the inlined
          // stack ids on the allocation call during ModuleSummaryAnalysis.
          CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
              EmptyContext;
          // Index of the current MIB, used to look up its parallel entry in
          // AN.ContextSizeInfos (when context size info was recorded).
          unsigned I = 0;
          assert(!metadataMayIncludeContextSizeInfo() ||
                 AN.ContextSizeInfos.size() == AN.MIBs.size());
          // Now add all of the MIBs and their stack nodes.
          for (auto &MIB : AN.MIBs) {
            CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                StackContext(&MIB);
            std::vector<ContextTotalSize> ContextSizeInfo;
            if (!AN.ContextSizeInfos.empty()) {
              for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
                ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
            }
            addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                AllocNode, StackContext, CallsiteContext&: EmptyContext, AllocType: MIB.AllocType,
                ContextSizeInfo, TotalSizeToContextIdTopNCold);
            I++;
          }
          // If exporting the graph to dot and an allocation id of interest was
          // specified, record all the context ids for this allocation node.
          if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
            DotAllocContextIds = AllocNode->getContextIds();
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Initialize version 0 on the summary alloc node to the current alloc
          // type, unless it has both types in which case make it default, so
          // that in the case where we aren't able to clone the original version
          // always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocTypes: AllocNode->AllocTypes);
        }
      }
      // For callsite metadata, add to list for this function for later use.
      if (!FS->callsites().empty())
        for (auto &SN : FS->mutableCallsites()) {
          IndexCall StackNodeCall(&SN);
          CallsWithMetadata.push_back(x: StackNodeCall);
        }

      if (!CallsWithMetadata.empty())
        FuncToCallsWithMetadata[FS] = CallsWithMetadata;

      // Record the summary-to-ValueInfo mapping for any function with memprof
      // related records, needed later (e.g. for labeling and name lookup).
      if (!FS->allocs().empty() || !FS->callsites().empty())
        FSToVIMap[FS] = VI;
    }
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot(Label: "prestackupdate");

  // Refine the initial graph: match interior stack nodes to the recorded
  // calls, then fix up callsites whose calls target multiple functions, and
  // mark backedges introduced by recursion.
  updateStackNodes();

  if (ExportToDot)
    exportToDot(Label: "poststackupdate");

  handleCallsitesWithMultipleTargets();

  markBackedges();
}
2623
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
                          CallTy>::handleCallsitesWithMultipleTargets() {
  // Look for and workaround callsites that call multiple functions.
  // This can happen for indirect calls, which needs better handling, and in
  // more rare cases (e.g. macro expansion).
  // TODO: To fix this for indirect calls we will want to perform speculative
  // devirtualization using either the normal PGO info with ICP, or using the
  // information in the profiled MemProf contexts. We can do this prior to
  // this transformation for regular LTO, and for ThinLTO we can simulate that
  // effect in the summary and perform the actual speculative devirtualization
  // while cloning in the ThinLTO backend.

  // Keep track of the new nodes synthesized for discovered tail calls missing
  // from the profiled contexts.
  MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;

  // Calls that become a node's new primary call below; their map entries are
  // added after the loop to avoid mutating the map during iteration.
  std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
  for (auto &Entry : NonAllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    assert(Node->Clones.empty());
    // Check all node callees and see if in the same function.
    // We need to check all of the calls recorded in this Node, because in some
    // cases we may have had multiple calls with the same debug info calling
    // different callees. This can happen, for example, when an object is
    // constructed in the parameter list - the destructor call of the object has
    // the same debug info (line/col) as the call the object was passed to.
    // Here we will prune any that don't match all callee nodes.
    std::vector<CallInfo> AllCalls;
    AllCalls.reserve(Node->MatchingCalls.size() + 1);
    AllCalls.push_back(Node->Call);
    llvm::append_range(AllCalls, Node->MatchingCalls);

    // First see if we can partition the calls by callee function, creating new
    // nodes to host each set of calls calling the same callees. This is
    // necessary for supporting indirect calls with ThinLTO, for which we
    // synthesized CallsiteInfo records for each target. They will all have the
    // same callsite stack ids and would be sharing a context node at this
    // point. We need to perform separate cloning for each, which will be
    // applied along with speculative devirtualization in the ThinLTO backends
    // as needed. Note this does not currently support looking through tail
    // calls, it is unclear if we need that for indirect call targets.
    // First partition calls by callee func. Map indexed by func, value is
    // struct with list of matching calls, assigned node.
    if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
      continue;

    auto It = AllCalls.begin();
    // Iterate through the calls until we find the first that matches.
    for (; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      bool Match = true;
      for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
           ++EI) {
        auto Edge = *EI;
        if (!Edge->Callee->hasCall())
          continue;
        assert(NodeToCallingFunc.count(Edge->Callee));
        // Check if the called function matches that of the callee node.
        if (!calleesMatch(Call: ThisCall.call(), EI, TailCallToContextNodeMap)) {
          Match = false;
          break;
        }
      }
      // Found a call that matches the callee nodes, we can quit now.
      if (Match) {
        // If the first match is not the primary call on the Node, update it
        // now. We will update the list of matching calls further below.
        if (Node->Call != ThisCall) {
          Node->setCall(ThisCall);
          // We need to update the NonAllocationCallToContextNodeMap, but don't
          // want to do this during iteration over that map, so save the calls
          // that need updated entries.
          NewCallToNode.push_back({ThisCall, Node});
        }
        break;
      }
    }
    // We will update this list below (or leave it cleared if there was no
    // match found above).
    Node->MatchingCalls.clear();
    // If we hit the end of the AllCalls vector, no call matching the callee
    // nodes was found, clear the call information in the node.
    if (It == AllCalls.end()) {
      RemovedEdgesWithMismatchedCallees++;
      // Work around by setting Node to have a null call, so it gets
      // skipped during cloning. Otherwise assignFunctions will assert
      // because its data structures are not designed to handle this case.
      Node->setCall(CallInfo());
      continue;
    }
    // Now add back any matching calls that call the same function as the
    // matching primary call on Node.
    for (++It; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      if (!sameCallee(Call1: Node->Call.call(), Call2: ThisCall.call()))
        continue;
      Node->MatchingCalls.push_back(ThisCall);
    }
  }

  // Remove all mismatched nodes identified in the above loop from the node map
  // (checking whether they have a null call which is set above). For a
  // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
  // to do the removal via remove_if than by individually erasing entries above.
  // Also remove any entries if we updated the node's primary call above.
  NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
    return !it.second->hasCall() || it.second->Call != it.first;
  });

  // Add entries for any new primary calls recorded above.
  for (auto &[Call, Node] : NewCallToNode)
    NonAllocationCallToContextNodeMap[Call] = Node;

  // Add the new nodes after the above loop so that the iteration is not
  // invalidated.
  for (auto &[Call, Node] : TailCallToContextNodeMap)
    NonAllocationCallToContextNodeMap[Call] = Node;
}
2743
// Partition Node's recorded calls by their callee function, assigning each
// group its own context node (reusing Node for the first group). Returns
// false (did nothing) when no call's callee matched any callee edge, so the
// caller can fall back to the tail-call-aware matching. New primary
// call-to-node pairs are appended to NewCallToNode for the caller to record.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
    ContextNode *Node, ArrayRef<CallInfo> AllCalls,
    std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
  // Struct to keep track of all the calls having the same callee function,
  // and the context node we eventually assign to this group of calls.
  struct CallsWithSameCallee {
    std::vector<CallInfo> Calls;
    ContextNode *Node = nullptr;
  };

  // First partition calls by callee function. Build map from each function
  // to the list of matching calls.
  DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
  for (auto ThisCall : AllCalls) {
    auto *F = getCalleeFunc(Call: ThisCall.call());
    if (F)
      CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
  }

  // Next, walk through all callee edges. For each callee node, get its
  // containing function and see if it was recorded in the above map (meaning we
  // have at least one matching call). Build another map from each callee node
  // with a matching call to the structure instance created above containing all
  // the calls.
  DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
  for (const auto &Edge : Node->CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;
    const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
    if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
      CalleeNodeToCallInfo[Edge->Callee] =
          &CalleeFuncToCallInfo[ProfiledCalleeFunc];
  }

  // If there are no entries in the second map, then there were no matching
  // calls/callees, nothing to do here. Return so we can go to the handling that
  // looks through tail calls.
  if (CalleeNodeToCallInfo.empty())
    return false;

  // Walk through all callee edges again. Any and all callee edges that didn't
  // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
  // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
  // ignored during cloning. If it is in the map, then we use the node recorded
  // in that entry (creating it if needed), and move the callee edge to it.
  // The first callee will use the original node instead of creating a new one.
  // Note that any of the original calls on this node (in AllCalls) that didn't
  // have a callee function automatically get dropped from the node as part of
  // this process.
  ContextNode *UnmatchedCalleesNode = nullptr;
  // Track whether we already assigned original node to a callee.
  bool UsedOrigNode = false;
  assert(NodeToCallingFunc[Node]);
  // Iterate over a copy of Node's callee edges, since we may need to remove
  // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
  // makes it less error-prone.
  auto CalleeEdges = Node->CalleeEdges;
  for (auto &Edge : CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;

    // Will be updated below to point to whatever (caller) node this callee edge
    // should be moved to.
    ContextNode *CallerNodeToUse = nullptr;

    // Handle the case where there were no matching calls first. Move this
    // callee edge to the UnmatchedCalleesNode, creating it if needed.
    if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
      if (!UnmatchedCalleesNode)
        UnmatchedCalleesNode =
            createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
      CallerNodeToUse = UnmatchedCalleesNode;
    } else {
      // Look up the information recorded for this callee node, and use the
      // recorded caller node (creating it if needed).
      auto *Info = CalleeNodeToCallInfo[Edge->Callee];
      if (!Info->Node) {
        // If we haven't assigned any callees to the original node use it.
        if (!UsedOrigNode) {
          Info->Node = Node;
          // Clear the set of matching calls which will be updated below.
          Node->MatchingCalls.clear();
          UsedOrigNode = true;
        } else
          Info->Node =
              createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
        assert(!Info->Calls.empty());
        // The first call becomes the primary call for this caller node, and the
        // rest go in the matching calls list.
        Info->Node->setCall(Info->Calls.front());
        llvm::append_range(Info->Node->MatchingCalls,
                           llvm::drop_begin(Info->Calls));
        // Save the primary call to node correspondence so that we can update
        // the NonAllocationCallToContextNodeMap, which is being iterated in the
        // caller of this function.
        NewCallToNode.push_back({Info->Node->Call, Info->Node});
      }
      CallerNodeToUse = Info->Node;
    }

    // Don't need to move edge if we are using the original node.
    if (CallerNodeToUse == Node)
      continue;

    moveCalleeEdgeToNewCaller(Edge, NewCaller: CallerNodeToUse);
  }
  // Now that we are done moving edges, clean up any caller edges that ended
  // up with no type or context ids. During moveCalleeEdgeToNewCaller all
  // caller edges from Node are replicated onto the new callers, and it
  // simplifies the handling to leave them until we have moved all
  // edges/context ids.
  for (auto &I : CalleeNodeToCallInfo)
    removeNoneTypeCallerEdges(Node: I.second->Node);
  if (UnmatchedCalleesNode)
    removeNoneTypeCallerEdges(Node: UnmatchedCalleesNode);
  removeNoneTypeCallerEdges(Node);

  return true;
}
2865
// Translate a stack id-or-index from a callsite context to an actual stack
// id. In the Module (IR) case the metadata stores the id directly.
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Module (IR) case this is already the Id.
  return IdOrIndex;
}
2870
// Translate a stack id-or-index from a callsite context to an actual stack
// id. In the Index (ThinLTO) case summaries store indices into the index's
// stack id list, so a lookup is required.
uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Index case this is an index into the stack id list in the summary
  // index, convert it to an Id.
  return Index.getStackIdAtIndex(Index: IdOrIndex);
}
2876
// Returns true if the profiled callee on edge *EI matches the actual callee
// of Call in the IR/summary, either directly or via a unique chain of tail
// calls. In the tail call case, new context nodes are synthesized for the
// intermediate calls (memoized in TailCallToContextNodeMap), spliced in
// between the edge's caller and callee, and the original edge is removed.
// On success EI is left positioned so the caller's loop increment advances
// correctly past the processed edge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, Func: ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain))
    return false;

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty())
    return true;

  // Connect Caller to Callee with an edge carrying the original Edge's
  // context ids and alloc types, reusing an existing edge if one is present.
  // Takes care to keep the iterator EI valid and pointing at Edge.
  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert_range(Edge->ContextIds);
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller, insert
      // the new edge before the current iterator position, and then increment
      // back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: NewCall);
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

#ifndef NDEBUG
  // Save this because Edge's fields get cleared below when removed.
  auto *Caller = Edge->Caller;
#endif

  // Remove old edge
  removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);

  // To simplify the increment of EI in the caller, subtract one from EI.
  // In the final AddEdge call we would have either added a new callee edge,
  // to Edge->Caller, or found an existing one. Either way we are guaranteed
  // that there is at least one callee edge.
  assert(!Caller->CalleeEdges.empty());
  --EI;

  return true;
}
2964
// Performs a depth-limited recursive search from CurCallee looking for tail
// calls that (transitively) reach ProfiledCallee. Returns true only if
// exactly one such chain exists, appending its callsites to FoundCalleeChain.
// When more than one chain is found, sets FoundMultipleCalleeChains and
// returns false, since cloning along an ambiguous chain could be incorrect.
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
    FoundCalleeChain.push_back(x: {Callsite, F});
  };

  // CurCallee may be a function or an alias to one; resolve it to the
  // underlying function whose body we will scan for tail calls.
  auto *CalleeFunc = dyn_cast<Function>(Val: CurCallee);
  if (!CalleeFunc) {
    auto *Alias = dyn_cast<GlobalAlias>(Val: CurCallee);
    assert(Alias);
    CalleeFunc = dyn_cast<Function>(Val: Alias->getAliasee());
    assert(CalleeFunc);
  }

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &BB : *CalleeFunc) {
    for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(Val: &I);
      if (!CB || !CB->isTailCall())
        continue;
      auto *CalledValue = CB->getCalledOperand();
      auto *CalledFunction = CB->getCalledFunction();
      if (CalledValue && !CalledFunction) {
        CalledValue = CalledValue->stripPointerCasts();
        // Stripping pointer casts can reveal a called function.
        CalledFunction = dyn_cast<Function>(Val: CalledValue);
      }
      // Check if this is an alias to a function. If so, get the
      // called aliasee for the checks below.
      if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
        assert(!CalledFunction &&
               "Expected null called function in callsite for alias");
        CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
      }
      // Ignore indirect tail calls whose target we cannot identify.
      if (!CalledFunction)
        continue;
      if (CalledFunction == ProfiledCallee) {
        // Direct tail call to the profiled callee. Fail if we already found
        // another chain to it from this function.
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        // Update the statistics tracking how often and at what depth the
        // profiled callee was found through tail calls.
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CalledFunction, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
3042
3043const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
3044 auto *CB = dyn_cast<CallBase>(Val: Call);
3045 if (!CB->getCalledOperand() || CB->isIndirectCall())
3046 return nullptr;
3047 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3048 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
3049 if (Alias)
3050 return dyn_cast<Function>(Val: Alias->getAliasee());
3051 return dyn_cast<Function>(Val: CalleeVal);
3052}
3053
3054bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3055 Instruction *Call, const Function *Func, const Function *CallerFunc,
3056 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3057 auto *CB = dyn_cast<CallBase>(Val: Call);
3058 if (!CB->getCalledOperand() || CB->isIndirectCall())
3059 return false;
3060 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3061 auto *CalleeFunc = dyn_cast<Function>(Val: CalleeVal);
3062 if (CalleeFunc == Func)
3063 return true;
3064 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
3065 if (Alias && Alias->getAliasee() == Func)
3066 return true;
3067
3068 // Recursively search for the profiled callee through tail calls starting with
3069 // the actual Callee. The discovered tail call chain is saved in
3070 // FoundCalleeChain, and we will fixup the graph to include these callsites
3071 // after returning.
3072 // FIXME: We will currently redo the same recursive walk if we find the same
3073 // mismatched callee from another callsite. We can improve this with more
3074 // bookkeeping of the created chain of new nodes for each mismatch.
3075 unsigned Depth = 1;
3076 bool FoundMultipleCalleeChains = false;
3077 if (!findProfiledCalleeThroughTailCalls(ProfiledCallee: Func, CurCallee: CalleeVal, Depth,
3078 FoundCalleeChain,
3079 FoundMultipleCalleeChains)) {
3080 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3081 << Func->getName() << " from " << CallerFunc->getName()
3082 << " that actually called " << CalleeVal->getName()
3083 << (FoundMultipleCalleeChains
3084 ? " (found multiple possible chains)"
3085 : "")
3086 << "\n");
3087 if (FoundMultipleCalleeChains)
3088 FoundProfiledCalleeNonUniquelyCount++;
3089 return false;
3090 }
3091
3092 return true;
3093}
3094
3095bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3096 Instruction *Call2) {
3097 auto *CB1 = cast<CallBase>(Val: Call1);
3098 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3099 return false;
3100 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3101 auto *CalleeFunc1 = dyn_cast<Function>(Val: CalleeVal1);
3102 auto *CB2 = cast<CallBase>(Val: Call2);
3103 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3104 return false;
3105 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3106 auto *CalleeFunc2 = dyn_cast<Function>(Val: CalleeVal2);
3107 return CalleeFunc1 == CalleeFunc2;
3108}
3109
// Summary-index analogue of the Module version above: performs a
// depth-limited recursive search through the call edges recorded in the
// summaries of CurCallee, looking for tail calls that (transitively) reach
// ProfiledCallee. Returns true only if exactly one such chain exists,
// appending synthesized CallsiteInfos to FoundCalleeChain; otherwise sets
// FoundMultipleCalleeChains and returns false.
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
    // Make a CallsiteInfo for each discovered callee, if one hasn't already
    // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(x: FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(x: Callee))
      // StackIds is empty (we don't have debug info available in the index for
      // these callsites)
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(args&: Callee, args: SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back(x: {NewCallsiteInfo, FS});
  };

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &S : CurCallee.getSummaryList()) {
    // Skip non-prevailing copies of externally visible symbols; their call
    // edges do not reflect the linked program.
    if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
        !isPrevailing(CurCallee.getGUID(), S.get()))
      continue;
    auto *FS = dyn_cast<FunctionSummary>(Val: S->getBaseObject());
    if (!FS)
      continue;
    // If this summary is an alias, record the aliasee's ValueInfo for the
    // FSToVIMap update below.
    auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(Val: S.get());
    if (AS)
      FSVI = AS->getAliaseeVI();
    for (auto &CallEdge : FS->calls()) {
      // Only tail call edges can hide the profiled caller/callee relation.
      if (!CallEdge.second.hasTailCall())
        continue;
      if (CallEdge.first == ProfiledCallee) {
        // Direct tail call to the profiled callee. Fail if we already found
        // another chain to it from this function.
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        // Update the statistics tracking how often and at what depth the
        // profiled callee was found through tail calls.
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CallEdge.first, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
3188
3189const FunctionSummary *
3190IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3191 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
3192 if (Callee.getSummaryList().empty())
3193 return nullptr;
3194 return dyn_cast<FunctionSummary>(Val: Callee.getSummaryList()[0]->getBaseObject());
3195}
3196
// Returns true if the summary callee of Call matches the profiled callee
// Func: either the callee's ValueInfo is Func's, the callee is an alias
// whose aliasee is Func's, or a unique chain of tail calls from the callee
// reaches Func (recorded in FoundCalleeChain). CallerFunc is used only for
// debug output.
bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
  // If there is no summary list then this is a call to an externally defined
  // symbol.
  AliasSummary *Alias =
      Callee.getSummaryList().empty()
          ? nullptr
          : dyn_cast<AliasSummary>(Val: Callee.getSummaryList()[0].get());
  // Func must have been mapped to its ValueInfo when its node was created.
  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];
  if (Callee == FuncVI ||
      // If callee is an alias, check the aliasee, since only function
      // summary base objects will contain the stack node summaries and thus
      // get a context node.
      (Alias && Alias->getAliaseeVI() == FuncVI))
    return true;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(
          ProfiledCallee: FuncVI, CurCallee: Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                      << " from " << FSToVIMap[CallerFunc]
                      << " that actually called " << Callee
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}
3242
3243bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3244 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call1)->Callee;
3245 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call2)->Callee;
3246 return Callee1 == Callee2;
3247}
3248
3249template <typename DerivedCCG, typename FuncTy, typename CallTy>
3250void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3251 const {
3252 print(OS&: dbgs());
3253 dbgs() << "\n";
3254}
3255
// Prints a human-readable description of this node: its address, call (and
// any additional matching calls), node id, alloc types, sorted context ids,
// callee/caller edges, and clone relationships.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
    raw_ostream &OS) const {
  OS << "Node " << this << "\n";
  OS << "\t";
  printCall(OS);
  if (Recursive)
    OS << " (recursive)";
  OS << "\n";
  if (!MatchingCalls.empty()) {
    OS << "\tMatchingCalls:\n";
    for (auto &MatchingCall : MatchingCalls) {
      OS << "\t";
      MatchingCall.print(OS);
      OS << "\n";
    }
  }
  OS << "\tNodeId: " << NodeId << "\n";
  OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
  OS << "\tContextIds:";
  // Make a copy of the computed context ids that we can sort for stability.
  auto ContextIds = getContextIds();
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(first: SortedIds.begin(), last: SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
  OS << "\n";
  OS << "\tCalleeEdges:\n";
  for (auto &Edge : CalleeEdges)
    OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
       << ")\n";
  OS << "\tCallerEdges:\n";
  for (auto &Edge : CallerEdges)
    OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
       << ")\n";
  // A node either has clones (it is an original) or a CloneOf link (it is a
  // clone itself); print whichever applies.
  if (!Clones.empty()) {
    OS << "\tClones: ";
    ListSeparator LS;
    for (auto *C : Clones)
      OS << LS << C << " NodeId: " << C->NodeId;
    OS << "\n";
  } else if (CloneOf) {
    OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
  }
}
3301
3302template <typename DerivedCCG, typename FuncTy, typename CallTy>
3303void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3304 const {
3305 print(OS&: dbgs());
3306 dbgs() << "\n";
3307}
3308
// Prints a one-line description of this edge: endpoint nodes, backedge
// marker, alloc types, and sorted context ids.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
    raw_ostream &OS) const {
  OS << "Edge from Callee " << Callee << " to Caller: " << Caller
     << (IsBackedge ? " (BE)" : "")
     << " AllocTypes: " << getAllocTypeString(AllocTypes);
  OS << " ContextIds:";
  // Sort a copy of the context ids for stable output.
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(first: SortedIds.begin(), last: SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
}
3321
3322template <typename DerivedCCG, typename FuncTy, typename CallTy>
3323void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3324 print(OS&: dbgs());
3325}
3326
3327template <typename DerivedCCG, typename FuncTy, typename CallTy>
3328void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3329 raw_ostream &OS) const {
3330 OS << "Callsite Context Graph:\n";
3331 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3332 for (const auto Node : nodes<GraphType>(this)) {
3333 if (Node->isRemoved())
3334 continue;
3335 Node->print(OS);
3336 OS << "\n";
3337 }
3338}
3339
// For each allocation node, prints (and optionally emits as an optimization
// remark via EmitRemark) one "MemProf hinting" message per context id that
// has recorded size information, describing the context's total size, its
// profiled allocation type, and the node's alloc type after cloning.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
    raw_ostream &OS,
    function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) const {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  for (const auto Node : nodes<GraphType>(this)) {
    // Only live allocation nodes carry size information.
    if (Node->isRemoved())
      continue;
    if (!Node->IsAllocation)
      continue;
    DenseSet<uint32_t> ContextIds = Node->getContextIds();
    auto AllocTypeFromCall = getAllocationCallType(Call: Node->Call);
    // Sort a copy of the context ids for stable output.
    std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
    std::sort(first: SortedIds.begin(), last: SortedIds.end());
    for (auto Id : SortedIds) {
      auto TypeI = ContextIdToAllocationType.find(Val: Id);
      assert(TypeI != ContextIdToAllocationType.end());
      auto CSI = ContextIdToContextSizeInfos.find(Val: Id);
      if (CSI != ContextIdToContextSizeInfos.end()) {
        for (auto &Info : CSI->second) {
          std::string Msg =
              "MemProf hinting: " + getAllocTypeString(AllocTypes: (uint8_t)TypeI->second) +
              " full allocation context " + std::to_string(val: Info.FullStackId) +
              " with total size " + std::to_string(val: Info.TotalSize) + " is " +
              getAllocTypeString(Node->AllocTypes) + " after cloning";
          // Note when the attribute applied to the call differs from the
          // node's alloc type (e.g. overridden by cold byte percent).
          if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
            Msg += " marked " + getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall) +
                   " due to cold byte percent";
          // Print the internal context id to aid debugging and visualization.
          Msg += " (context id " + std::to_string(val: Id) + ")";
          OS << Msg << "\n";
          if (EmitRemark)
            EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
        }
      }
    }
  }
}
3378
3379template <typename DerivedCCG, typename FuncTy, typename CallTy>
3380void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3381 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3382 for (const auto Node : nodes<GraphType>(this)) {
3383 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3384 for (auto &Edge : Node->CallerEdges)
3385 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3386 }
3387}
3388
// GraphTraits specialization that lets generic LLVM graph utilities (node
// iteration, WriteGraph/DOT export) operate on a CallsiteContextGraph.
// Nodes are enumerated from the graph's NodeOwner list, and a node's
// children are the callee endpoints of its callee edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;

  // Unwrap the owning unique_ptr to the raw node pointer used as NodeRef.
  using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
  static NodeRef getNode(const NodePtrTy &P) { return P.get(); }

  using nodes_iterator =
      mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
                      decltype(&getNode)>;

  static nodes_iterator nodes_begin(GraphType G) {
    return nodes_iterator(G->NodeOwner.begin(), &getNode);
  }

  static nodes_iterator nodes_end(GraphType G) {
    return nodes_iterator(G->NodeOwner.end(), &getNode);
  }

  // The entry node is simply the first owned node.
  static NodeRef getEntryNode(GraphType G) {
    return G->NodeOwner.begin()->get();
  }

  // Map a callee edge to the callee node it points at.
  using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
  static const ContextNode<DerivedCCG, FuncTy, CallTy> *
  GetCallee(const EdgePtrTy &P) {
    return P->Callee;
  }

  using ChildIteratorType =
      mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
                          DerivedCCG, FuncTy, CallTy>>>::const_iterator,
                      decltype(&GetCallee)>;

  static ChildIteratorType child_begin(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
  }

  static ChildIteratorType child_end(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
  }
};
3432
// DOTGraphTraits specialization controlling how a CallsiteContextGraph is
// rendered by WriteGraph/exportToDot: node labels and colors, edge styling,
// node hiding for reduced-scope exports, and optional highlighting of a
// specific context or allocation's contexts (driven by the AllocIdForDot,
// ContextIdForDot and DotGraphScope command line options).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
    : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
    // If the user requested the full graph to be exported, but provided an
    // allocation id, or if the user gave a context id and requested more than
    // just a specific context to be exported, note that highlighting is
    // enabled.
    DoHighlight =
        (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
        (ContextIdForDot.getNumOccurrences() &&
         DotGraphScope != DotScope::Context);
  }

  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using GTraits = GraphTraits<GraphType>;
  using NodeRef = typename GTraits::NodeRef;
  using ChildIteratorType = typename GTraits::ChildIteratorType;

  // Label: original stack/alloc id and node id, followed by the label of the
  // node's call (and any matching calls), or a null-call description.
  static std::string getNodeLabel(NodeRef Node, GraphType G) {
    std::string LabelString =
        (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
         Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
            .str();
    LabelString += "\n";
    if (Node->hasCall()) {
      auto Func = G->NodeToCallingFunc.find(Node);
      assert(Func != G->NodeToCallingFunc.end());
      LabelString +=
          G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
      for (auto &MatchingCall : Node->MatchingCalls) {
        LabelString += "\n";
        LabelString += G->getLabel(Func->second, MatchingCall.call(),
                                   MatchingCall.cloneNo());
      }
    } else {
      LabelString += "null call";
      if (Node->Recursive)
        LabelString += " (recursive)";
      else
        LabelString += " (external)";
    }
    return LabelString;
  }

  static std::string getNodeAttributes(NodeRef Node, GraphType G) {
    auto ContextIds = Node->getContextIds();
    // If highlighting enabled, see if this node contains any of the context ids
    // of interest. If so, it will use a different color and a larger fontsize
    // (which makes the node larger as well).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
    }
    std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
                                   getContextIds(ContextIds) + "\"")
                                      .str();
    // Default fontsize is 14
    if (Highlight)
      AttributeString += ",fontsize=\"30\"";
    AttributeString +=
        (Twine(",fillcolor=\"") + getColor(AllocTypes: Node->AllocTypes, Highlight) + "\"")
            .str();
    // Clones are distinguished by a blue, bold, dashed border.
    if (Node->CloneOf) {
      AttributeString += ",color=\"blue\"";
      AttributeString += ",style=\"filled,bold,dashed\"";
    } else
      AttributeString += ",style=\"filled\"";
    return AttributeString;
  }

  static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
                                       GraphType G) {
    auto &Edge = *(ChildIter.getCurrent());
    // If highlighting enabled, see if this edge contains any of the context ids
    // of interest. If so, it will use a different color and a heavier arrow
    // size and weight (the larger weight makes the highlighted path
    // straighter).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = Edge->ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
    }
    auto Color = getColor(AllocTypes: Edge->AllocTypes, Highlight);
    std::string AttributeString =
        (Twine("tooltip=\"") + getContextIds(ContextIds: Edge->ContextIds) + "\"" +
         // fillcolor is the arrow head and color is the line
         Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
         "\"")
            .str();
    // Backedges (direct recursion) are drawn dotted.
    if (Edge->IsBackedge)
      AttributeString += ",style=\"dotted\"";
    // Default penwidth and weight are both 1.
    if (Highlight)
      AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
    return AttributeString;
  }

  // Since the NodeOwners list includes nodes that are no longer connected to
  // the graph, skip them here.
  static bool isNodeHidden(NodeRef Node, GraphType G) {
    if (Node->isRemoved())
      return true;
    // If a scope smaller than the full graph was requested, see if this node
    // contains any of the context ids of interest.
    if (DotGraphScope == DotScope::Alloc)
      return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
    if (DotGraphScope == DotScope::Context)
      return !Node->getContextIds().contains(ContextIdForDot);
    return false;
  }

private:
  // Formats a context id set for tooltips; large sets are summarized by
  // count rather than listed.
  static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
    std::string IdString = "ContextIds:";
    if (ContextIds.size() < 100) {
      std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
      std::sort(first: SortedIds.begin(), last: SortedIds.end());
      for (auto Id : SortedIds)
        IdString += (" " + Twine(Id)).str();
    } else {
      IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
    }
    return IdString;
  }

  static std::string getColor(uint8_t AllocTypes, bool Highlight) {
    // If DoHighlight is not enabled, we want to use the highlight colors for
    // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
    // both compatible with the color scheme before highlighting was supported,
    // and for the NotCold+Cold color the non-highlight color is a bit more
    // readable.
    if (AllocTypes == (uint8_t)AllocationType::NotCold)
      // Color "brown1" actually looks like a lighter red.
      return !DoHighlight || Highlight ? "brown1" : "lightpink";
    if (AllocTypes == (uint8_t)AllocationType::Cold)
      return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
    if (AllocTypes ==
        ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
      return Highlight ? "magenta" : "mediumorchid1";
    return "gray";
  }

  // Builds a DOT-safe unique node identifier from the node's address.
  static std::string getNodeId(NodeRef Node) {
    std::stringstream SStream;
    SStream << std::hex << "N0x" << (unsigned long long)Node;
    std::string Result = SStream.str();
    return Result;
  }

  // True if we should highlight a specific context or allocation's contexts in
  // the emitted graph.
  static bool DoHighlight;
};
3596
// Out-of-line definition of the static DoHighlight flag declared in the
// DOTGraphTraits specialization; it is (re)computed in that struct's
// constructor on each DOT export.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool DOTGraphTraits<
    const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
    false;
3601
3602template <typename DerivedCCG, typename FuncTy, typename CallTy>
3603void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3604 std::string Label) const {
3605 WriteGraph(this, "", false, Label,
3606 DotFilePathPrefix + "ccg." + Label + ".dot");
3607}
3608
3609template <typename DerivedCCG, typename FuncTy, typename CallTy>
3610typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3611CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3612 const std::shared_ptr<ContextEdge> &Edge,
3613 DenseSet<uint32_t> ContextIdsToMove) {
3614 ContextNode *Node = Edge->Callee;
3615 assert(NodeToCallingFunc.count(Node));
3616 ContextNode *Clone =
3617 createNewNode(IsAllocation: Node->IsAllocation, F: NodeToCallingFunc[Node], C: Node->Call);
3618 Node->addClone(Clone);
3619 Clone->MatchingCalls = Node->MatchingCalls;
3620 moveEdgeToExistingCalleeClone(Edge, NewCallee: Clone, /*NewClone=*/true,
3621 ContextIdsToMove);
3622 return Clone;
3623}
3624
3625template <typename DerivedCCG, typename FuncTy, typename CallTy>
3626void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3627 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3628 ContextNode *NewCallee, bool NewClone,
3629 DenseSet<uint32_t> ContextIdsToMove) {
3630 // NewCallee and Edge's current callee must be clones of the same original
3631 // node (Edge's current callee may be the original node too).
3632 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3633
3634 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3635
3636 ContextNode *OldCallee = Edge->Callee;
3637
3638 // We might already have an edge to the new callee from earlier cloning for a
3639 // different allocation. If one exists we will reuse it.
3640 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3641
3642 // Callers will pass an empty ContextIdsToMove set when they want to move the
3643 // edge. Copy in Edge's ids for simplicity.
3644 if (ContextIdsToMove.empty())
3645 ContextIdsToMove = Edge->getContextIds();
3646
3647 // If we are moving all of Edge's ids, then just move the whole Edge.
3648 // Otherwise only move the specified subset, to a new edge if needed.
3649 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3650 // First, update the alloc types on New Callee from Edge.
3651 // Do this before we potentially clear Edge's fields below!
3652 NewCallee->AllocTypes |= Edge->AllocTypes;
3653 // Moving the whole Edge.
3654 if (ExistingEdgeToNewCallee) {
3655 // Since we already have an edge to NewCallee, simply move the ids
3656 // onto it, and remove the existing Edge.
3657 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3658 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3659 assert(Edge->ContextIds == ContextIdsToMove);
3660 removeEdgeFromGraph(Edge: Edge.get());
3661 } else {
3662 // Otherwise just reconnect Edge to NewCallee.
3663 Edge->Callee = NewCallee;
3664 NewCallee->CallerEdges.push_back(Edge);
3665 // Remove it from callee where it was previously connected.
3666 OldCallee->eraseCallerEdge(Edge.get());
3667 // Don't need to update Edge's context ids since we are simply
3668 // reconnecting it.
3669 }
3670 } else {
3671 // Only moving a subset of Edge's ids.
3672 // Compute the alloc type of the subset of ids being moved.
3673 auto CallerEdgeAllocType = computeAllocType(ContextIds&: ContextIdsToMove);
3674 if (ExistingEdgeToNewCallee) {
3675 // Since we already have an edge to NewCallee, simply move the ids
3676 // onto it.
3677 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3678 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3679 } else {
3680 // Otherwise, create a new edge to NewCallee for the ids being moved.
3681 auto NewEdge = std::make_shared<ContextEdge>(
3682 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3683 Edge->Caller->CalleeEdges.push_back(NewEdge);
3684 NewCallee->CallerEdges.push_back(NewEdge);
3685 }
3686 // In either case, need to update the alloc types on NewCallee, and remove
3687 // those ids and update the alloc type on the original Edge.
3688 NewCallee->AllocTypes |= CallerEdgeAllocType;
3689 set_subtract(Edge->ContextIds, ContextIdsToMove);
3690 Edge->AllocTypes = computeAllocType(ContextIds&: Edge->ContextIds);
3691 }
3692 // Now walk the old callee node's callee edges and move Edge's context ids
3693 // over to the corresponding edge into the clone (which is created here if
3694 // this is a newly created clone).
3695 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3696 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3697 // If this is a direct recursion edge, use NewCallee (the clone) as the
3698 // callee as well, so that any edge updated/created here is also direct
3699 // recursive.
3700 if (CalleeToUse == OldCallee) {
3701 // If this is a recursive edge, see if we already moved a recursive edge
3702 // (which would have to have been this one) - if we were only moving a
3703 // subset of context ids it would still be on OldCallee.
3704 if (EdgeIsRecursive) {
3705 assert(OldCalleeEdge == Edge);
3706 continue;
3707 }
3708 CalleeToUse = NewCallee;
3709 }
3710 // The context ids moving to the new callee are the subset of this edge's
3711 // context ids and the context ids on the caller edge being moved.
3712 DenseSet<uint32_t> EdgeContextIdsToMove =
3713 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3714 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3715 OldCalleeEdge->AllocTypes =
3716 computeAllocType(ContextIds&: OldCalleeEdge->getContextIds());
3717 if (!NewClone) {
3718 // Update context ids / alloc type on corresponding edge to NewCallee.
3719 // There is a chance this may not exist if we are reusing an existing
3720 // clone, specifically during function assignment, where we would have
3721 // removed none type edges after creating the clone. If we can't find
3722 // a corresponding edge there, fall through to the cloning below.
3723 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3724 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3725 NewCalleeEdge->AllocTypes |= computeAllocType(ContextIds&: EdgeContextIdsToMove);
3726 continue;
3727 }
3728 }
3729 auto NewEdge = std::make_shared<ContextEdge>(
3730 CalleeToUse, NewCallee, computeAllocType(ContextIds&: EdgeContextIdsToMove),
3731 EdgeContextIdsToMove);
3732 NewCallee->CalleeEdges.push_back(NewEdge);
3733 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3734 }
3735 // Recompute the node alloc type now that its callee edges have been
3736 // updated (since we will compute from those edges).
3737 OldCallee->AllocTypes = OldCallee->computeAllocType();
3738 // OldCallee alloc type should be None iff its context id set is now empty.
3739 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3740 OldCallee->emptyContextIds());
3741 if (VerifyCCG) {
3742 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3743 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3744 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3745 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3746 /*CheckEdges=*/false);
3747 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3748 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3749 /*CheckEdges=*/false);
3750 }
3751}
3752
3753template <typename DerivedCCG, typename FuncTy, typename CallTy>
3754void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3755 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3756 ContextNode *NewCaller) {
3757 auto *OldCallee = Edge->Callee;
3758 auto *NewCallee = OldCallee;
3759 // If this edge was direct recursive, make any new/updated edge also direct
3760 // recursive to NewCaller.
3761 bool Recursive = Edge->Caller == Edge->Callee;
3762 if (Recursive)
3763 NewCallee = NewCaller;
3764
3765 ContextNode *OldCaller = Edge->Caller;
3766 OldCaller->eraseCalleeEdge(Edge.get());
3767
3768 // We might already have an edge to the new caller. If one exists we will
3769 // reuse it.
3770 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3771
3772 if (ExistingEdgeToNewCaller) {
3773 // Since we already have an edge to NewCaller, simply move the ids
3774 // onto it, and remove the existing Edge.
3775 ExistingEdgeToNewCaller->getContextIds().insert_range(
3776 Edge->getContextIds());
3777 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3778 Edge->ContextIds.clear();
3779 Edge->AllocTypes = (uint8_t)AllocationType::None;
3780 OldCallee->eraseCallerEdge(Edge.get());
3781 } else {
3782 // Otherwise just reconnect Edge to NewCaller.
3783 Edge->Caller = NewCaller;
3784 NewCaller->CalleeEdges.push_back(Edge);
3785 if (Recursive) {
3786 assert(NewCallee == NewCaller);
3787 // In the case of (direct) recursive edges, we update the callee as well
3788 // so that it becomes recursive on the new caller.
3789 Edge->Callee = NewCallee;
3790 NewCallee->CallerEdges.push_back(Edge);
3791 OldCallee->eraseCallerEdge(Edge.get());
3792 }
3793 // Don't need to update Edge's context ids since we are simply
3794 // reconnecting it.
3795 }
3796 // In either case, need to update the alloc types on New Caller.
3797 NewCaller->AllocTypes |= Edge->AllocTypes;
3798
3799 // Now walk the old caller node's caller edges and move Edge's context ids
3800 // over to the corresponding edge into the node (which is created here if
3801 // this is a newly created node). We can tell whether this is a newly created
3802 // node by seeing if it has any caller edges yet.
3803#ifndef NDEBUG
3804 bool IsNewNode = NewCaller->CallerEdges.empty();
3805#endif
3806 // If we just moved a direct recursive edge, presumably its context ids should
3807 // also flow out of OldCaller via some other non-recursive callee edge. We
3808 // don't want to remove the recursive context ids from other caller edges yet,
3809 // otherwise the context ids get into an inconsistent state on OldCaller.
3810 // We will update these context ids on the non-recursive caller edge when and
3811 // if they are updated on the non-recursive callee.
3812 if (!Recursive) {
3813 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3814 auto OldCallerCaller = OldCallerEdge->Caller;
3815 // The context ids moving to the new caller are the subset of this edge's
3816 // context ids and the context ids on the callee edge being moved.
3817 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3818 OldCallerEdge->getContextIds(), Edge->getContextIds());
3819 if (OldCaller == OldCallerCaller) {
3820 OldCallerCaller = NewCaller;
3821 // Don't actually move this one. The caller will move it directly via a
3822 // call to this function with this as the Edge if it is appropriate to
3823 // move to a diff node that has a matching callee (itself).
3824 continue;
3825 }
3826 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3827 OldCallerEdge->AllocTypes =
3828 computeAllocType(ContextIds&: OldCallerEdge->getContextIds());
3829 // In this function we expect that any pre-existing node already has edges
3830 // from the same callers as the old node. That should be true in the
3831 // current use case, where we will remove None-type edges after copying
3832 // over all caller edges from the callee.
3833 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3834 // Since we would have skipped caller edges when moving a direct recursive
3835 // edge, this may not hold true when recursive handling enabled.
3836 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3837 if (ExistingCallerEdge) {
3838 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3839 ExistingCallerEdge->AllocTypes |=
3840 computeAllocType(ContextIds&: EdgeContextIdsToMove);
3841 continue;
3842 }
3843 auto NewEdge = std::make_shared<ContextEdge>(
3844 NewCaller, OldCallerCaller, computeAllocType(ContextIds&: EdgeContextIdsToMove),
3845 EdgeContextIdsToMove);
3846 NewCaller->CallerEdges.push_back(NewEdge);
3847 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3848 }
3849 }
3850 // Recompute the node alloc type now that its caller edges have been
3851 // updated (since we will compute from those edges).
3852 OldCaller->AllocTypes = OldCaller->computeAllocType();
3853 // OldCaller alloc type should be None iff its context id set is now empty.
3854 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3855 OldCaller->emptyContextIds());
3856 if (VerifyCCG) {
3857 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3858 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3859 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3860 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3861 /*CheckEdges=*/false);
3862 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3863 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3864 /*CheckEdges=*/false);
3865 }
3866}
3867
3868template <typename DerivedCCG, typename FuncTy, typename CallTy>
3869void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3870 recursivelyRemoveNoneTypeCalleeEdges(
3871 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3872 auto Inserted = Visited.insert(Node);
3873 if (!Inserted.second)
3874 return;
3875
3876 removeNoneTypeCalleeEdges(Node);
3877
3878 for (auto *Clone : Node->Clones)
3879 recursivelyRemoveNoneTypeCalleeEdges(Node: Clone, Visited);
3880
3881 // The recursive call may remove some of this Node's caller edges.
3882 // Iterate over a copy and skip any that were removed.
3883 auto CallerEdges = Node->CallerEdges;
3884 for (auto &Edge : CallerEdges) {
3885 // Skip any that have been removed by an earlier recursive call.
3886 if (Edge->isRemoved()) {
3887 assert(!is_contained(Node->CallerEdges, Edge));
3888 continue;
3889 }
3890 recursivelyRemoveNoneTypeCalleeEdges(Node: Edge->Caller, Visited);
3891 }
3892}
3893
3894// This is the standard DFS based backedge discovery algorithm.
3895template <typename DerivedCCG, typename FuncTy, typename CallTy>
3896void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3897 // If we are cloning recursive contexts, find and mark backedges from all root
3898 // callers, using the typical DFS based backedge analysis.
3899 if (!CloneRecursiveContexts)
3900 return;
3901 DenseSet<const ContextNode *> Visited;
3902 DenseSet<const ContextNode *> CurrentStack;
3903 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3904 auto *Node = Entry.second;
3905 if (Node->isRemoved())
3906 continue;
3907 // It is a root if it doesn't have callers.
3908 if (!Node->CallerEdges.empty())
3909 continue;
3910 markBackedges(Node, Visited, CurrentStack);
3911 assert(CurrentStack.empty());
3912 }
3913}
3914
3915// Recursive helper for above markBackedges method.
3916template <typename DerivedCCG, typename FuncTy, typename CallTy>
3917void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3918 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3919 DenseSet<const ContextNode *> &CurrentStack) {
3920 auto I = Visited.insert(Node);
3921 // We should only call this for unvisited nodes.
3922 assert(I.second);
3923 (void)I;
3924 for (auto &CalleeEdge : Node->CalleeEdges) {
3925 auto *Callee = CalleeEdge->Callee;
3926 if (Visited.count(Callee)) {
3927 // Since this was already visited we need to check if it is currently on
3928 // the recursive stack in which case it is a backedge.
3929 if (CurrentStack.count(Callee))
3930 CalleeEdge->IsBackedge = true;
3931 continue;
3932 }
3933 CurrentStack.insert(Callee);
3934 markBackedges(Callee, Visited, CurrentStack);
3935 CurrentStack.erase(Callee);
3936 }
3937}
3938
3939template <typename DerivedCCG, typename FuncTy, typename CallTy>
3940void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3941 DenseSet<const ContextNode *> Visited;
3942 for (auto &Entry : AllocationCallToContextNodeMap) {
3943 Visited.clear();
3944 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3945 }
3946 Visited.clear();
3947 for (auto &Entry : AllocationCallToContextNodeMap)
3948 recursivelyRemoveNoneTypeCalleeEdges(Node: Entry.second, Visited);
3949 if (VerifyCCG)
3950 check();
3951}
3952
3953// helper function to check an AllocType is cold or notcold or both.
3954bool checkColdOrNotCold(uint8_t AllocType) {
3955 return (AllocType == (uint8_t)AllocationType::Cold) ||
3956 (AllocType == (uint8_t)AllocationType::NotCold) ||
3957 (AllocType ==
3958 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3959}
3960
3961template <typename DerivedCCG, typename FuncTy, typename CallTy>
3962void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3963 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3964 const DenseSet<uint32_t> &AllocContextIds) {
3965 if (VerifyNodes)
3966 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3967 assert(!Node->CloneOf);
3968
3969 // If Node as a null call, then either it wasn't found in the module (regular
3970 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3971 // cloning (e.g. recursion, calls multiple targets, etc).
3972 // Do this here so that we don't try to recursively clone callers below, which
3973 // isn't useful at least for this node.
3974 if (!Node->hasCall())
3975 return;
3976
3977 // No need to look at any callers if allocation type already unambiguous.
3978 if (hasSingleAllocType(Node->AllocTypes))
3979 return;
3980
3981#ifndef NDEBUG
3982 auto Insert =
3983#endif
3984 Visited.insert(Node);
3985 // We should not have visited this node yet.
3986 assert(Insert.second);
3987 // The recursive call to identifyClones may delete the current edge from the
3988 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3989 // in an iterator and having recursive call erase from it. Other edges may
3990 // also get removed during the recursion, which will have null Callee and
3991 // Caller pointers (and are deleted later), so we skip those below.
3992 {
3993 auto CallerEdges = Node->CallerEdges;
3994 for (auto &Edge : CallerEdges) {
3995 // Skip any that have been removed by an earlier recursive call.
3996 if (Edge->isRemoved()) {
3997 assert(!is_contained(Node->CallerEdges, Edge));
3998 continue;
3999 }
4000 // Defer backedges. See comments further below where these edges are
4001 // handled during the cloning of this Node.
4002 if (Edge->IsBackedge) {
4003 // We should only mark these if cloning recursive contexts, where we
4004 // need to do this deferral.
4005 assert(CloneRecursiveContexts);
4006 continue;
4007 }
4008 // Ignore any caller we previously visited via another edge.
4009 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
4010 identifyClones(Edge->Caller, Visited, AllocContextIds);
4011 }
4012 }
4013 }
4014
4015 // Check if we reached an unambiguous call or have have only a single caller.
4016 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4017 return;
4018
4019 // We need to clone.
4020
4021 // Try to keep the original version as alloc type NotCold. This will make
4022 // cases with indirect calls or any other situation with an unknown call to
4023 // the original function get the default behavior. We do this by sorting the
4024 // CallerEdges of the Node we will clone by alloc type.
4025 //
4026 // Give NotCold edge the lowest sort priority so those edges are at the end of
4027 // the caller edges vector, and stay on the original version (since the below
4028 // code clones greedily until it finds all remaining edges have the same type
4029 // and leaves the remaining ones on the original Node).
4030 //
4031 // We shouldn't actually have any None type edges, so the sorting priority for
4032 // that is arbitrary, and we assert in that case below.
4033 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
4034 /*Cold*/ 1,
4035 /*NotColdCold*/ 2};
4036 llvm::stable_sort(Node->CallerEdges,
4037 [&](const std::shared_ptr<ContextEdge> &A,
4038 const std::shared_ptr<ContextEdge> &B) {
4039 // Nodes with non-empty context ids should be sorted
4040 // before those with empty context ids.
4041 if (A->ContextIds.empty())
4042 // Either B ContextIds are non-empty (in which case we
4043 // should return false because B < A), or B ContextIds
4044 // are empty, in which case they are equal, and we
4045 // should maintain the original relative ordering.
4046 return false;
4047 if (B->ContextIds.empty())
4048 return true;
4049
4050 if (A->AllocTypes == B->AllocTypes)
4051 // Use the first context id for each edge as a
4052 // tie-breaker.
4053 return *A->ContextIds.begin() < *B->ContextIds.begin();
4054 return AllocTypeCloningPriority[A->AllocTypes] <
4055 AllocTypeCloningPriority[B->AllocTypes];
4056 });
4057
4058 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4059
4060 DenseSet<uint32_t> RecursiveContextIds;
4061 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
4062 // If we are allowing recursive callsites, but have also disabled recursive
4063 // contexts, look for context ids that show up in multiple caller edges.
4064 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
4065 DenseSet<uint32_t> AllCallerContextIds;
4066 for (auto &CE : Node->CallerEdges) {
4067 // Resize to the largest set of caller context ids, since we know the
4068 // final set will be at least that large.
4069 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
4070 for (auto Id : CE->getContextIds())
4071 if (!AllCallerContextIds.insert(Id).second)
4072 RecursiveContextIds.insert(Id);
4073 }
4074 }
4075
4076 // Iterate until we find no more opportunities for disambiguating the alloc
4077 // types via cloning. In most cases this loop will terminate once the Node
4078 // has a single allocation type, in which case no more cloning is needed.
4079 // Iterate over a copy of Node's caller edges, since we may need to remove
4080 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4081 // makes it less error-prone.
4082 auto CallerEdges = Node->CallerEdges;
4083 for (auto &CallerEdge : CallerEdges) {
4084 // Skip any that have been removed by an earlier recursive call.
4085 if (CallerEdge->isRemoved()) {
4086 assert(!is_contained(Node->CallerEdges, CallerEdge));
4087 continue;
4088 }
4089 assert(CallerEdge->Callee == Node);
4090
4091 // See if cloning the prior caller edge left this node with a single alloc
4092 // type or a single caller. In that case no more cloning of Node is needed.
4093 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4094 break;
4095
4096 // If the caller was not successfully matched to a call in the IR/summary,
4097 // there is no point in trying to clone for it as we can't update that call.
4098 if (!CallerEdge->Caller->hasCall())
4099 continue;
4100
4101 // Only need to process the ids along this edge pertaining to the given
4102 // allocation.
4103 auto CallerEdgeContextsForAlloc =
4104 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4105 if (!RecursiveContextIds.empty())
4106 CallerEdgeContextsForAlloc =
4107 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4108 if (CallerEdgeContextsForAlloc.empty())
4109 continue;
4110
4111 auto CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
4112
4113 // Compute the node callee edge alloc types corresponding to the context ids
4114 // for this caller edge.
4115 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4116 CalleeEdgeAllocTypesForCallerEdge.reserve(n: Node->CalleeEdges.size());
4117 for (auto &CalleeEdge : Node->CalleeEdges)
4118 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4119 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
4120
4121 // Don't clone if doing so will not disambiguate any alloc types amongst
4122 // caller edges (including the callee edges that would be cloned).
4123 // Otherwise we will simply move all edges to the clone.
4124 //
4125 // First check if by cloning we will disambiguate the caller allocation
4126 // type from node's allocation type. Query allocTypeToUse so that we don't
4127 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4128 // neither of these should be None type.
4129 //
4130 // Then check if by cloning node at least one of the callee edges will be
4131 // disambiguated by splitting out different context ids.
4132 //
4133 // However, always do the cloning if this is a backedge, in which case we
4134 // have not yet cloned along this caller edge.
4135 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4136 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4137 if (!CallerEdge->IsBackedge &&
4138 allocTypeToUse(CallerAllocTypeForAlloc) ==
4139 allocTypeToUse(Node->AllocTypes) &&
4140 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4141 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4142 continue;
4143 }
4144
4145 if (CallerEdge->IsBackedge) {
4146 // We should only mark these if cloning recursive contexts, where we
4147 // need to do this deferral.
4148 assert(CloneRecursiveContexts);
4149 DeferredBackedges++;
4150 }
4151
4152 // If this is a backedge, we now do recursive cloning starting from its
4153 // caller since we may have moved unambiguous caller contexts to a clone
4154 // of this Node in a previous iteration of the current loop, giving more
4155 // opportunity for cloning through the backedge. Because we sorted the
4156 // caller edges earlier so that cold caller edges are first, we would have
4157 // visited and cloned this node for any unamibiguously cold non-recursive
4158 // callers before any ambiguous backedge callers. Note that we don't do this
4159 // if the caller is already cloned or visited during cloning (e.g. via a
4160 // different context path from the allocation).
4161 // TODO: Can we do better in the case where the caller was already visited?
4162 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4163 !Visited.count(CallerEdge->Caller)) {
4164 const auto OrigIdCount = CallerEdge->getContextIds().size();
4165 // Now do the recursive cloning of this backedge's caller, which was
4166 // deferred earlier.
4167 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4168 removeNoneTypeCalleeEdges(Node: CallerEdge->Caller);
4169 // See if the recursive call to identifyClones moved the context ids to a
4170 // new edge from this node to a clone of caller, and switch to looking at
4171 // that new edge so that we clone Node for the new caller clone.
4172 bool UpdatedEdge = false;
4173 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4174 for (auto E : Node->CallerEdges) {
4175 // Only interested in clones of the current edges caller.
4176 if (E->Caller->CloneOf != CallerEdge->Caller)
4177 continue;
4178 // See if this edge contains any of the context ids originally on the
4179 // current caller edge.
4180 auto CallerEdgeContextsForAllocNew =
4181 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4182 if (CallerEdgeContextsForAllocNew.empty())
4183 continue;
4184 // Make sure we don't pick a previously existing caller edge of this
4185 // Node, which would be processed on a different iteration of the
4186 // outer loop over the saved CallerEdges.
4187 if (llvm::is_contained(CallerEdges, E))
4188 continue;
4189 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4190 // are updated further below for all cases where we just invoked
4191 // identifyClones recursively.
4192 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4193 CallerEdge = E;
4194 UpdatedEdge = true;
4195 break;
4196 }
4197 }
4198 // If cloning removed this edge (and we didn't update it to a new edge
4199 // above), we're done with this edge. It's possible we moved all of the
4200 // context ids to an existing clone, in which case there's no need to do
4201 // further processing for them.
4202 if (CallerEdge->isRemoved())
4203 continue;
4204
4205 // Now we need to update the information used for the cloning decisions
4206 // further below, as we may have modified edges and their context ids.
4207
4208 // Note if we changed the CallerEdge above we would have already updated
4209 // the context ids.
4210 if (!UpdatedEdge) {
4211 CallerEdgeContextsForAlloc = set_intersection(
4212 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4213 if (CallerEdgeContextsForAlloc.empty())
4214 continue;
4215 }
4216 // Update the other information that depends on the edges and on the now
4217 // updated CallerEdgeContextsForAlloc.
4218 CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
4219 CalleeEdgeAllocTypesForCallerEdge.clear();
4220 for (auto &CalleeEdge : Node->CalleeEdges) {
4221 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4222 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
4223 }
4224 }
4225
4226 // First see if we can use an existing clone. Check each clone and its
4227 // callee edges for matching alloc types.
4228 ContextNode *Clone = nullptr;
4229 for (auto *CurClone : Node->Clones) {
4230 if (allocTypeToUse(CurClone->AllocTypes) !=
4231 allocTypeToUse(CallerAllocTypeForAlloc))
4232 continue;
4233
4234 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4235 hasSingleAllocType(CallerAllocTypeForAlloc);
4236 // The above check should mean that if both have single alloc types that
4237 // they should be equal.
4238 assert(!BothSingleAlloc ||
4239 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4240
4241 // If either both have a single alloc type (which are the same), or if the
4242 // clone's callee edges have the same alloc types as those for the current
4243 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4244 // then we can reuse this clone.
4245 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4246 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4247 Clone = CurClone;
4248 break;
4249 }
4250 }
4251
4252 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4253 if (Clone)
4254 moveEdgeToExistingCalleeClone(Edge: CallerEdge, NewCallee: Clone, /*NewClone=*/false,
4255 ContextIdsToMove: CallerEdgeContextsForAlloc);
4256 else
4257 Clone = moveEdgeToNewCalleeClone(Edge: CallerEdge, ContextIdsToMove: CallerEdgeContextsForAlloc);
4258
4259 // Sanity check that no alloc types on clone or its edges are None.
4260 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4261 }
4262
4263 // We should still have some context ids on the original Node.
4264 assert(!Node->emptyContextIds());
4265
4266 // Sanity check that no alloc types on node or edges are None.
4267 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4268
4269 if (VerifyNodes)
4270 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4271}
4272
4273void ModuleCallsiteContextGraph::updateAllocationCall(
4274 CallInfo &Call, AllocationType AllocType) {
4275 std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocType);
4276 removeAnyExistingAmbiguousAttribute(CB: cast<CallBase>(Val: Call.call()));
4277 auto A = llvm::Attribute::get(Context&: Call.call()->getFunction()->getContext(),
4278 Kind: "memprof", Val: AllocTypeString);
4279 cast<CallBase>(Val: Call.call())->addFnAttr(Attr: A);
4280 OREGetter(Call.call()->getFunction())
4281 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4282 << ore::NV("AllocationCall", Call.call()) << " in clone "
4283 << ore::NV("Caller", Call.call()->getFunction())
4284 << " marked with memprof allocation attribute "
4285 << ore::NV("Attribute", AllocTypeString));
4286}
4287
4288void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4289 AllocationType AllocType) {
4290 auto *AI = cast<AllocInfo *>(Val: Call.call());
4291 assert(AI);
4292 assert(AI->Versions.size() > Call.cloneNo());
4293 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4294}
4295
4296AllocationType
4297ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4298 const auto *CB = cast<CallBase>(Val: Call.call());
4299 if (!CB->getAttributes().hasFnAttr(Kind: "memprof"))
4300 return AllocationType::None;
4301 return CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
4302 ? AllocationType::Cold
4303 : AllocationType::NotCold;
4304}
4305
4306AllocationType
4307IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4308 const auto *AI = cast<AllocInfo *>(Val: Call.call());
4309 assert(AI->Versions.size() > Call.cloneNo());
4310 return (AllocationType)AI->Versions[Call.cloneNo()];
4311}
4312
4313void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4314 FuncInfo CalleeFunc) {
4315 auto *CurF = getCalleeFunc(Call: CallerCall.call());
4316 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4317 if (isMemProfClone(F: *CurF)) {
4318 // If we already assigned this callsite to call a specific non-default
4319 // clone (i.e. not the original function which is clone 0), ensure that we
4320 // aren't trying to now update it to call a different clone, which is
4321 // indicative of a bug in the graph or function assignment.
4322 auto CurCalleeCloneNo = getMemProfCloneNum(F: *CurF);
4323 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4324 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4325 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4326 << "\n");
4327 MismatchedCloneAssignments++;
4328 }
4329 }
4330 if (NewCalleeCloneNo > 0)
4331 cast<CallBase>(Val: CallerCall.call())->setCalledFunction(CalleeFunc.func());
4332 OREGetter(CallerCall.call()->getFunction())
4333 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4334 << ore::NV("Call", CallerCall.call()) << " in clone "
4335 << ore::NV("Caller", CallerCall.call()->getFunction())
4336 << " assigned to call function clone "
4337 << ore::NV("Callee", CalleeFunc.func()));
4338}
4339
4340void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4341 FuncInfo CalleeFunc) {
4342 auto *CI = cast<CallsiteInfo *>(Val: CallerCall.call());
4343 assert(CI &&
4344 "Caller cannot be an allocation which should not have profiled calls");
4345 assert(CI->Clones.size() > CallerCall.cloneNo());
4346 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4347 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4348 // If we already assigned this callsite to call a specific non-default
4349 // clone (i.e. not the original function which is clone 0), ensure that we
4350 // aren't trying to now update it to call a different clone, which is
4351 // indicative of a bug in the graph or function assignment.
4352 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4353 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4354 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4355 << "\n");
4356 MismatchedCloneAssignments++;
4357 }
4358 CurCalleeCloneNo = NewCalleeCloneNo;
4359}
4360
4361// Update the debug information attached to NewFunc to use the clone Name. Note
4362// this needs to be done for both any existing DISubprogram for the definition,
4363// as well as any separate declaration DISubprogram.
4364static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4365 assert(Name == NewFunc->getName());
4366 auto *SP = NewFunc->getSubprogram();
4367 if (!SP)
4368 return;
4369 auto *MDName = MDString::get(Context&: NewFunc->getParent()->getContext(), Str: Name);
4370 SP->replaceLinkageName(LN: MDName);
4371 DISubprogram *Decl = SP->getDeclaration();
4372 if (!Decl)
4373 return;
4374 TempDISubprogram NewDecl = Decl->clone();
4375 NewDecl->replaceLinkageName(LN: MDName);
4376 SP->replaceDeclaration(Decl: MDNode::replaceWithUniqued(N: std::move(NewDecl)));
4377}
4378
4379CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4380 Instruction *>::FuncInfo
4381ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4382 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4383 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4384 // Use existing LLVM facilities for cloning and obtaining Call in clone
4385 ValueToValueMapTy VMap;
4386 auto *NewFunc = CloneFunction(F: Func.func(), VMap);
4387 std::string Name = getMemProfFuncName(Base: Func.func()->getName(), CloneNo);
4388 assert(!Func.func()->getParent()->getFunction(Name));
4389 NewFunc->setName(Name);
4390 updateSubprogramLinkageName(NewFunc, Name);
4391 for (auto &Inst : CallsWithMetadataInFunc) {
4392 // This map always has the initial version in it.
4393 assert(Inst.cloneNo() == 0);
4394 CallMap[Inst] = {cast<Instruction>(Val&: VMap[Inst.call()]), CloneNo};
4395 }
4396 OREGetter(Func.func())
4397 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4398 << "created clone " << ore::NV("NewFunction", NewFunc));
4399 return {NewFunc, CloneNo};
4400}
4401
4402CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4403 IndexCall>::FuncInfo
4404IndexCallsiteContextGraph::cloneFunctionForCallsite(
4405 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4406 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4407 // Check how many clones we have of Call (and therefore function).
4408 // The next clone number is the current size of versions array.
4409 // Confirm this matches the CloneNo provided by the caller, which is based on
4410 // the number of function clones we have.
4411 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4412 ? cast<AllocInfo *>(Call.call())->Versions.size()
4413 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4414 // Walk all the instructions in this function. Create a new version for
4415 // each (by adding an entry to the Versions/Clones summary array), and copy
4416 // over the version being called for the function clone being cloned here.
4417 // Additionally, add an entry to the CallMap for the new function clone,
4418 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4419 // to the new call clone.
4420 for (auto &Inst : CallsWithMetadataInFunc) {
4421 // This map always has the initial version in it.
4422 assert(Inst.cloneNo() == 0);
4423 if (auto *AI = dyn_cast<AllocInfo *>(Val: Inst.call())) {
4424 assert(AI->Versions.size() == CloneNo);
4425 // We assign the allocation type later (in updateAllocationCall), just add
4426 // an entry for it here.
4427 AI->Versions.push_back(Elt: 0);
4428 } else {
4429 auto *CI = cast<CallsiteInfo *>(Val: Inst.call());
4430 assert(CI && CI->Clones.size() == CloneNo);
4431 // We assign the clone number later (in updateCall), just add an entry for
4432 // it here.
4433 CI->Clones.push_back(Elt: 0);
4434 }
4435 CallMap[Inst] = {Inst.call(), CloneNo};
4436 }
4437 return {Func.func(), CloneNo};
4438}
4439
4440// We perform cloning for each allocation node separately. However, this
4441// sometimes results in a situation where the same node calls multiple
4442// clones of the same callee, created for different allocations. This
4443// causes issues when assigning functions to these clones, as each node can
4444// in reality only call a single callee clone.
4445//
4446// To address this, before assigning functions, merge callee clone nodes as
4447// needed using a post order traversal from the allocations. We attempt to
4448// use existing clones as the merge node when legal, and to share them
4449// among callers with the same properties (callers calling the same set of
4450// callee clone nodes for the same allocations).
4451//
4452// Without this fix, in some cases incorrect function assignment will lead
4453// to calling the wrong allocation clone.
4454template <typename DerivedCCG, typename FuncTy, typename CallTy>
4455void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4456 if (!MergeClones)
4457 return;
4458
4459 // Generate a map from context id to the associated allocation node for use
4460 // when merging clones.
4461 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4462 for (auto &Entry : AllocationCallToContextNodeMap) {
4463 auto *Node = Entry.second;
4464 for (auto Id : Node->getContextIds())
4465 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4466 for (auto *Clone : Node->Clones) {
4467 for (auto Id : Clone->getContextIds())
4468 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4469 }
4470 }
4471
4472 // Post order traversal starting from allocations to ensure each callsite
4473 // calls a single clone of its callee. Callee nodes that are clones of each
4474 // other are merged (via new merge nodes if needed) to achieve this.
4475 DenseSet<const ContextNode *> Visited;
4476 for (auto &Entry : AllocationCallToContextNodeMap) {
4477 auto *Node = Entry.second;
4478
4479 mergeClones(Node, Visited, ContextIdToAllocationNode);
4480
4481 // Make a copy so the recursive post order traversal that may create new
4482 // clones doesn't mess up iteration. Note that the recursive traversal
4483 // itself does not call mergeClones on any of these nodes, which are all
4484 // (clones of) allocations.
4485 auto Clones = Node->Clones;
4486 for (auto *Clone : Clones)
4487 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4488 }
4489
4490 if (DumpCCG) {
4491 dbgs() << "CCG after merging:\n";
4492 dbgs() << *this;
4493 }
4494 if (ExportToDot)
4495 exportToDot(Label: "aftermerge");
4496
4497 if (VerifyCCG) {
4498 check();
4499 }
4500}
4501
// Recursive helper for above mergeClones method. Performs a post order
// traversal from Node toward its callers, then merges Node's callee clones
// once all (reachable) callers have been processed.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
    DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
  // Each node is processed at most once across the whole traversal.
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;

  // Iteratively perform merging on this node to handle new caller nodes created
  // during the recursive traversal. We could do something more elegant such as
  // maintain a worklist, but this is a simple approach that doesn't cause a
  // measurable compile time effect, as most nodes don't have many caller
  // edges to check.
  bool FoundUnvisited = true;
  unsigned Iters = 0;
  while (FoundUnvisited) {
    Iters++;
    FoundUnvisited = false;
    // Make a copy since the recursive call may move a caller edge to a new
    // callee, messing up the iterator.
    auto CallerEdges = Node->CallerEdges;
    for (auto CallerEdge : CallerEdges) {
      // Skip any caller edge moved onto a different callee during recursion.
      if (CallerEdge->Callee != Node)
        continue;
      // If we found an unvisited caller, note that we should check the caller
      // edges again as mergeClones may add or change caller nodes.
      // DoMergeIteration presumably is an option flag gating the re-iteration
      // behavior -- TODO(review): confirm against its declaration.
      if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
        FoundUnvisited = true;
      mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
    }
  }

  // Statistics tracking how many re-iterations the loop above required.
  TotalMergeInvokes++;
  TotalMergeIters += Iters;
  if (Iters > MaxMergeIters)
    MaxMergeIters = Iters;

  // Merge for this node after we handle its callers.
  mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
}
4544
4545template <typename DerivedCCG, typename FuncTy, typename CallTy>
4546void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4547 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4548 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4549 // Ignore Node if we moved all of its contexts to clones.
4550 if (Node->emptyContextIds())
4551 return;
4552
4553 // First identify groups of clones among Node's callee edges, by building
4554 // a map from each callee base node to the associated callee edges from Node.
4555 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4556 OrigNodeToCloneEdges;
4557 for (const auto &E : Node->CalleeEdges) {
4558 auto *Callee = E->Callee;
4559 if (!Callee->CloneOf && Callee->Clones.empty())
4560 continue;
4561 ContextNode *Base = Callee->getOrigNode();
4562 OrigNodeToCloneEdges[Base].push_back(E);
4563 }
4564
4565 // Helper for callee edge sorting below. Return true if A's callee has fewer
4566 // caller edges than B, or if A is a clone and B is not, or if A's first
4567 // context id is smaller than B's.
4568 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4569 const std::shared_ptr<ContextEdge> &B) {
4570 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4571 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4572 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4573 return true;
4574 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4575 return false;
4576 // Use the first context id for each edge as a
4577 // tie-breaker.
4578 return *A->ContextIds.begin() < *B->ContextIds.begin();
4579 };
4580
4581 // Process each set of callee clones called by Node, performing the needed
4582 // merging.
4583 for (auto Entry : OrigNodeToCloneEdges) {
4584 // CalleeEdges is the set of edges from Node reaching callees that are
4585 // mutual clones of each other.
4586 auto &CalleeEdges = Entry.second;
4587 auto NumCalleeClones = CalleeEdges.size();
4588 // A single edge means there is no merging needed.
4589 if (NumCalleeClones == 1)
4590 continue;
4591 // Sort the CalleeEdges calling this group of clones in ascending order of
4592 // their caller edge counts, putting the original non-clone node first in
4593 // cases of a tie. This simplifies finding an existing node to use as the
4594 // merge node.
4595 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4596
4597 /// Find other callers of the given set of callee edges that can
4598 /// share the same callee merge node. See the comments at this method
4599 /// definition for details.
4600 DenseSet<ContextNode *> OtherCallersToShareMerge;
4601 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4602 OtherCallersToShareMerge);
4603
4604 // Now do the actual merging. Identify existing or create a new MergeNode
4605 // during the first iteration. Move each callee over, along with edges from
4606 // other callers we've determined above can share the same merge node.
4607 ContextNode *MergeNode = nullptr;
4608 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4609 for (auto CalleeEdge : CalleeEdges) {
4610 auto *OrigCallee = CalleeEdge->Callee;
4611 // If we don't have a MergeNode yet (only happens on the first iteration,
4612 // as a new one will be created when we go to move the first callee edge
4613 // over as needed), see if we can use this callee.
4614 if (!MergeNode) {
4615 // If there are no other callers, simply use this callee.
4616 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4617 MergeNode = OrigCallee;
4618 NonNewMergedNodes++;
4619 continue;
4620 }
4621 // Otherwise, if we have identified other caller nodes that can share
4622 // the merge node with Node, see if all of OrigCallee's callers are
4623 // going to share the same merge node. In that case we can use callee
4624 // (since all of its callers would move to the new merge node).
4625 if (!OtherCallersToShareMerge.empty()) {
4626 bool MoveAllCallerEdges = true;
4627 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4628 if (CalleeCallerE == CalleeEdge)
4629 continue;
4630 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4631 MoveAllCallerEdges = false;
4632 break;
4633 }
4634 }
4635 // If we are going to move all callers over, we can use this callee as
4636 // the MergeNode.
4637 if (MoveAllCallerEdges) {
4638 MergeNode = OrigCallee;
4639 NonNewMergedNodes++;
4640 continue;
4641 }
4642 }
4643 }
4644 // Move this callee edge, creating a new merge node if necessary.
4645 if (MergeNode) {
4646 assert(MergeNode != OrigCallee);
4647 moveEdgeToExistingCalleeClone(Edge: CalleeEdge, NewCallee: MergeNode,
4648 /*NewClone*/ false);
4649 } else {
4650 MergeNode = moveEdgeToNewCalleeClone(Edge: CalleeEdge);
4651 NewMergedNodes++;
4652 }
4653 // Now move all identified edges from other callers over to the merge node
4654 // as well.
4655 if (!OtherCallersToShareMerge.empty()) {
4656 // Make and iterate over a copy of OrigCallee's caller edges because
4657 // some of these will be moved off of the OrigCallee and that would mess
4658 // up the iteration from OrigCallee.
4659 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4660 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4661 if (CalleeCallerE == CalleeEdge)
4662 continue;
4663 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4664 continue;
4665 CallerToMoveCount[CalleeCallerE->Caller]++;
4666 moveEdgeToExistingCalleeClone(Edge: CalleeCallerE, NewCallee: MergeNode,
4667 /*NewClone*/ false);
4668 }
4669 }
4670 removeNoneTypeCalleeEdges(Node: OrigCallee);
4671 removeNoneTypeCalleeEdges(Node: MergeNode);
4672 }
4673 }
4674}
4675
4676// Look for other nodes that have edges to the same set of callee
4677// clones as the current Node. Those can share the eventual merge node
4678// (reducing cloning and binary size overhead) iff:
4679// - they have edges to the same set of callee clones
4680// - each callee edge reaches a subset of the same allocations as Node's
4681// corresponding edge to the same callee clone.
4682// The second requirement is to ensure that we don't undo any of the
4683// necessary cloning to distinguish contexts with different allocation
4684// behavior.
4685// FIXME: This is somewhat conservative, as we really just need to ensure
4686// that they don't reach the same allocations as contexts on edges from Node
4687// going to any of the *other* callee clones being merged. However, that
4688// requires more tracking and checking to get right.
4689template <typename DerivedCCG, typename FuncTy, typename CallTy>
4690void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4691 findOtherCallersToShareMerge(
4692 ContextNode *Node,
4693 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4694 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4695 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4696 auto NumCalleeClones = CalleeEdges.size();
4697 // This map counts how many edges to the same callee clone exist for other
4698 // caller nodes of each callee clone.
4699 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4700 // Counts the number of other caller nodes that have edges to all callee
4701 // clones that don't violate the allocation context checking.
4702 unsigned PossibleOtherCallerNodes = 0;
4703
4704 // We only need to look at other Caller nodes if the first callee edge has
4705 // multiple callers (recall they are sorted in ascending order above).
4706 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4707 return;
4708
4709 // For each callee edge:
4710 // - Collect the count of other caller nodes calling the same callees.
4711 // - Collect the alloc nodes reached by contexts on each callee edge.
4712 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4713 for (auto CalleeEdge : CalleeEdges) {
4714 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4715 // For each other caller of the same callee, increment the count of
4716 // edges reaching the same callee clone.
4717 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4718 if (CalleeCallerEdges->Caller == Node) {
4719 assert(CalleeCallerEdges == CalleeEdge);
4720 continue;
4721 }
4722 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4723 // If this caller edge now reaches all of the same callee clones,
4724 // increment the count of candidate other caller nodes.
4725 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4726 NumCalleeClones)
4727 PossibleOtherCallerNodes++;
4728 }
4729 // Collect the alloc nodes reached by contexts on each callee edge, for
4730 // later analysis.
4731 for (auto Id : CalleeEdge->getContextIds()) {
4732 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4733 if (!Alloc) {
4734 // FIXME: unclear why this happens occasionally, presumably
4735 // imperfect graph updates possibly with recursion.
4736 MissingAllocForContextId++;
4737 continue;
4738 }
4739 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4740 }
4741 }
4742
4743 // Now walk the callee edges again, and make sure that for each candidate
4744 // caller node all of its edges to the callees reach the same allocs (or
4745 // a subset) as those along the corresponding callee edge from Node.
4746 for (auto CalleeEdge : CalleeEdges) {
4747 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4748 // Stop if we do not have any (more) candidate other caller nodes.
4749 if (!PossibleOtherCallerNodes)
4750 break;
4751 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4752 // Check each other caller of this callee clone.
4753 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4754 // Not interested in the callee edge from Node itself.
4755 if (CalleeCallerE == CalleeEdge)
4756 continue;
4757 // Skip any callers that didn't have callee edges to all the same
4758 // callee clones.
4759 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4760 NumCalleeClones)
4761 continue;
4762 // Make sure that each context along edge from candidate caller node
4763 // reaches an allocation also reached by this callee edge from Node.
4764 for (auto Id : CalleeCallerE->getContextIds()) {
4765 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4766 if (!Alloc)
4767 continue;
4768 // If not, simply reset the map entry to 0 so caller is ignored, and
4769 // reduce the count of candidate other caller nodes.
4770 if (!CurCalleeAllocNodes.contains(Alloc)) {
4771 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4772 PossibleOtherCallerNodes--;
4773 break;
4774 }
4775 }
4776 }
4777 }
4778
4779 if (!PossibleOtherCallerNodes)
4780 return;
4781
4782 // Build the set of other caller nodes that can use the same callee merge
4783 // node.
4784 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4785 if (Count != NumCalleeClones)
4786 continue;
4787 OtherCallersToShareMerge.insert(OtherCaller);
4788 }
4789}
4790
4791// This method assigns cloned callsites to functions, cloning the functions as
4792// needed. The assignment is greedy and proceeds roughly as follows:
4793//
4794// For each function Func:
4795// For each call with graph Node having clones:
4796// Initialize ClonesWorklist to Node and its clones
4797// Initialize NodeCloneCount to 0
4798// While ClonesWorklist is not empty:
4799// Clone = pop front ClonesWorklist
4800// NodeCloneCount++
4801// If Func has been cloned less than NodeCloneCount times:
4802// If NodeCloneCount is 1:
4803// Assign Clone to original Func
4804// Continue
4805// Create a new function clone
4806// If other callers not assigned to call a function clone yet:
4807// Assign them to call new function clone
4808// Continue
4809// Assign any other caller calling the cloned version to new clone
4810//
4811// For each caller of Clone:
4812// If caller is assigned to call a specific function clone:
4813// If we cannot assign Clone to that function clone:
4814// Create new callsite Clone NewClone
4815// Add NewClone to ClonesWorklist
4816// Continue
4817// Assign Clone to existing caller's called function clone
4818// Else:
4819// If Clone not already assigned to a function clone:
4820// Assign to first function clone without assignment
4821// Assign caller to selected function clone
4822// For each call with graph Node having clones:
4823// If number func clones > number call's callsite Node clones:
4824// Record func CallInfo clones without Node clone in UnassignedCallClones
4825// For callsite Nodes in DFS order from allocations:
4826// If IsAllocation:
4827// Update allocation with alloc type
4828// Else:
// For Call, all MatchingCalls, and associated UnassignedCallClones:
4830// Update call to call recorded callee clone
4831//
4832template <typename DerivedCCG, typename FuncTy, typename CallTy>
4833bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4834 bool Changed = false;
4835
4836 mergeClones();
4837
4838 // Keep track of the assignment of nodes (callsites) to function clones they
4839 // call.
4840 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4841
4842 // Update caller node to call function version CalleeFunc, by recording the
4843 // assignment in CallsiteToCalleeFuncCloneMap.
4844 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4845 const FuncInfo &CalleeFunc) {
4846 assert(Caller->hasCall());
4847 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4848 };
4849
4850 // Information for a single clone of this Func.
4851 struct FuncCloneInfo {
4852 // The function clone.
4853 FuncInfo FuncClone;
4854 // Remappings of each call of interest (from original uncloned call to the
4855 // corresponding cloned call in this function clone).
4856 DenseMap<CallInfo, CallInfo> CallMap;
4857 };
4858
4859 // Map to keep track of information needed to update calls in function clones
4860 // when their corresponding callsite node was not itself cloned for that
4861 // function clone. Because of call context pruning (i.e. we only keep as much
4862 // caller information as needed to distinguish hot vs cold), we may not have
4863 // caller edges coming to each callsite node from all possible function
4864 // callers. A function clone may get created for other callsites in the
4865 // function for which there are caller edges that were not pruned. Any other
4866 // callsites in that function clone, which were not themselved cloned for
4867 // that function clone, should get updated the same way as the corresponding
4868 // callsite in the original function (which may call a clone of its callee).
4869 //
4870 // We build this map after completing function cloning for each function, so
4871 // that we can record the information from its call maps before they are
4872 // destructed. The map will be used as we update calls to update any still
4873 // unassigned call clones. Note that we may create new node clones as we clone
4874 // other functions, so later on we check which node clones were still not
4875 // created. To this end, the inner map is a map from function clone number to
4876 // the list of calls cloned for that function (can be more than one due to the
4877 // Node's MatchingCalls array).
4878 //
4879 // The alternative is creating new callsite clone nodes below as we clone the
4880 // function, but that is tricker to get right and likely more overhead.
4881 //
4882 // Inner map is a std::map so sorted by key (clone number), in order to get
4883 // ordered remarks in the full LTO case.
4884 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4885 UnassignedCallClones;
4886
4887 // Walk all functions for which we saw calls with memprof metadata, and handle
4888 // cloning for each of its calls.
4889 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4890 FuncInfo OrigFunc(Func);
4891 // Map from each clone number of OrigFunc to information about that function
4892 // clone (the function clone FuncInfo and call remappings). The index into
4893 // the vector is the clone number, as function clones are created and
4894 // numbered sequentially.
4895 std::vector<FuncCloneInfo> FuncCloneInfos;
4896 for (auto &Call : CallsWithMetadata) {
4897 ContextNode *Node = getNodeForInst(C: Call);
4898 // Skip call if we do not have a node for it (all uses of its stack ids
4899 // were either on inlined chains or pruned from the MIBs), or if we did
4900 // not create any clones for it.
4901 if (!Node || Node->Clones.empty())
4902 continue;
4903 assert(Node->hasCall() &&
4904 "Not having a call should have prevented cloning");
4905
4906 // Track the assignment of function clones to clones of the current
4907 // callsite Node being handled.
4908 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4909
4910 // Assign callsite version CallsiteClone to function version FuncClone,
4911 // and also assign (possibly cloned) Call to CallsiteClone.
4912 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4913 CallInfo &Call,
4914 ContextNode *CallsiteClone,
4915 bool IsAlloc) {
4916 // Record the clone of callsite node assigned to this function clone.
4917 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4918
4919 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4920 DenseMap<CallInfo, CallInfo> &CallMap =
4921 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4922 CallInfo CallClone(Call);
4923 if (auto It = CallMap.find(Call); It != CallMap.end())
4924 CallClone = It->second;
4925 CallsiteClone->setCall(CallClone);
4926 // Need to do the same for all matching calls.
4927 for (auto &MatchingCall : Node->MatchingCalls) {
4928 CallInfo CallClone(MatchingCall);
4929 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4930 CallClone = It->second;
4931 // Updates the call in the list.
4932 MatchingCall = CallClone;
4933 }
4934 };
4935
4936 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4937 // performs the necessary fixups (removing none type edges, and
4938 // importantly, propagating any function call assignment of the original
4939 // node to the new clone).
4940 auto MoveEdgeToNewCalleeCloneAndSetUp =
4941 [&](const std::shared_ptr<ContextEdge> &Edge) {
4942 ContextNode *OrigCallee = Edge->Callee;
4943 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4944 removeNoneTypeCalleeEdges(Node: NewClone);
4945 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4946 // If the original Callee was already assigned to call a specific
4947 // function version, make sure its new clone is assigned to call
4948 // that same function clone.
4949 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4950 RecordCalleeFuncOfCallsite(
4951 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4952 return NewClone;
4953 };
4954
4955 // Keep track of the clones of callsite Node that need to be assigned to
4956 // function clones. This list may be expanded in the loop body below if we
4957 // find additional cloning is required.
4958 std::deque<ContextNode *> ClonesWorklist;
4959 // Ignore original Node if we moved all of its contexts to clones.
4960 if (!Node->emptyContextIds())
4961 ClonesWorklist.push_back(Node);
4962 llvm::append_range(ClonesWorklist, Node->Clones);
4963
4964 // Now walk through all of the clones of this callsite Node that we need,
4965 // and determine the assignment to a corresponding clone of the current
4966 // function (creating new function clones as needed).
4967 unsigned NodeCloneCount = 0;
4968 while (!ClonesWorklist.empty()) {
4969 ContextNode *Clone = ClonesWorklist.front();
4970 ClonesWorklist.pop_front();
4971 NodeCloneCount++;
4972 if (VerifyNodes)
4973 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4974
4975 // Need to create a new function clone if we have more callsite clones
4976 // than existing function clones, which would have been assigned to an
4977 // earlier clone in the list (we assign callsite clones to function
4978 // clones greedily).
4979 if (FuncCloneInfos.size() < NodeCloneCount) {
4980 // If this is the first callsite copy, assign to original function.
4981 if (NodeCloneCount == 1) {
4982 // Since FuncCloneInfos is empty in this case, no clones have
4983 // been created for this function yet, and no callers should have
4984 // been assigned a function clone for this callee node yet.
4985 assert(llvm::none_of(
4986 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4987 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4988 }));
4989 // Initialize with empty call map, assign Clone to original function
4990 // and its callers, and skip to the next clone.
4991 FuncCloneInfos.push_back(
4992 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4993 AssignCallsiteCloneToFuncClone(
4994 OrigFunc, Call, Clone,
4995 AllocationCallToContextNodeMap.count(Call));
4996 for (auto &CE : Clone->CallerEdges) {
4997 // Ignore any caller that does not have a recorded callsite Call.
4998 if (!CE->Caller->hasCall())
4999 continue;
5000 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
5001 }
5002 continue;
5003 }
5004
5005 // First locate which copy of OrigFunc to clone again. If a caller
5006 // of this callsite clone was already assigned to call a particular
5007 // function clone, we need to redirect all of those callers to the
5008 // new function clone, and update their other callees within this
5009 // function.
5010 FuncInfo PreviousAssignedFuncClone;
5011 auto EI = llvm::find_if(
5012 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5013 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5014 });
5015 bool CallerAssignedToCloneOfFunc = false;
5016 if (EI != Clone->CallerEdges.end()) {
5017 const std::shared_ptr<ContextEdge> &Edge = *EI;
5018 PreviousAssignedFuncClone =
5019 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5020 CallerAssignedToCloneOfFunc = true;
5021 }
5022
5023 // Clone function and save it along with the CallInfo map created
5024 // during cloning in the FuncCloneInfos.
5025 DenseMap<CallInfo, CallInfo> NewCallMap;
5026 unsigned CloneNo = FuncCloneInfos.size();
5027 assert(CloneNo > 0 && "Clone 0 is the original function, which "
5028 "should already exist in the map");
5029 FuncInfo NewFuncClone = cloneFunctionForCallsite(
5030 Func&: OrigFunc, Call, CallMap&: NewCallMap, CallsWithMetadataInFunc&: CallsWithMetadata, CloneNo);
5031 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
5032 FunctionClonesAnalysis++;
5033 Changed = true;
5034
5035 // If no caller callsites were already assigned to a clone of this
5036 // function, we can simply assign this clone to the new func clone
5037 // and update all callers to it, then skip to the next clone.
5038 if (!CallerAssignedToCloneOfFunc) {
5039 AssignCallsiteCloneToFuncClone(
5040 NewFuncClone, Call, Clone,
5041 AllocationCallToContextNodeMap.count(Call));
5042 for (auto &CE : Clone->CallerEdges) {
5043 // Ignore any caller that does not have a recorded callsite Call.
5044 if (!CE->Caller->hasCall())
5045 continue;
5046 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5047 }
5048 continue;
5049 }
5050
5051 // We may need to do additional node cloning in this case.
5052 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5053 // that were previously assigned to call PreviousAssignedFuncClone,
5054 // to record that they now call NewFuncClone.
5055 // The none type edge removal may remove some of this Clone's caller
5056 // edges, if it is reached via another of its caller's callees.
5057 // Iterate over a copy and skip any that were removed.
5058 auto CallerEdges = Clone->CallerEdges;
5059 for (auto CE : CallerEdges) {
5060 // Skip any that have been removed on an earlier iteration.
5061 if (CE->isRemoved()) {
5062 assert(!is_contained(Clone->CallerEdges, CE));
5063 continue;
5064 }
5065 assert(CE);
5066 // Ignore any caller that does not have a recorded callsite Call.
5067 if (!CE->Caller->hasCall())
5068 continue;
5069
5070 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5071 // We subsequently fall through to later handling that
5072 // will perform any additional cloning required for
5073 // callers that were calling other function clones.
5074 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5075 PreviousAssignedFuncClone)
5076 continue;
5077
5078 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5079
5080 // If we are cloning a function that was already assigned to some
5081 // callers, then essentially we are creating new callsite clones
5082 // of the other callsites in that function that are reached by those
5083 // callers. Clone the other callees of the current callsite's caller
5084 // that were already assigned to PreviousAssignedFuncClone
5085 // accordingly. This is important since we subsequently update the
5086 // calls from the nodes in the graph and their assignments to callee
5087 // functions recorded in CallsiteToCalleeFuncCloneMap.
5088 // The none type edge removal may remove some of this caller's
5089 // callee edges, if it is reached via another of its callees.
5090 // Iterate over a copy and skip any that were removed.
5091 auto CalleeEdges = CE->Caller->CalleeEdges;
5092 for (auto CalleeEdge : CalleeEdges) {
5093 // Skip any that have been removed on an earlier iteration when
5094 // cleaning up newly None type callee edges.
5095 if (CalleeEdge->isRemoved()) {
5096 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5097 continue;
5098 }
5099 assert(CalleeEdge);
5100 ContextNode *Callee = CalleeEdge->Callee;
5101 // Skip the current callsite, we are looking for other
5102 // callsites Caller calls, as well as any that does not have a
5103 // recorded callsite Call.
5104 if (Callee == Clone || !Callee->hasCall())
5105 continue;
5106 // Skip direct recursive calls. We don't need/want to clone the
5107 // caller node again, and this loop will not behave as expected if
5108 // we tried.
5109 if (Callee == CalleeEdge->Caller)
5110 continue;
5111 ContextNode *NewClone =
5112 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5113 // Moving the edge may have resulted in some none type
5114 // callee edges on the original Callee.
5115 removeNoneTypeCalleeEdges(Node: Callee);
5116 // Update NewClone with the new Call clone of this callsite's Call
5117 // created for the new function clone created earlier.
5118 // Recall that we have already ensured when building the graph
5119 // that each caller can only call callsites within the same
5120 // function, so we are guaranteed that Callee Call is in the
5121 // current OrigFunc.
5122 // CallMap is set up as indexed by original Call at clone 0.
5123 CallInfo OrigCall(Callee->getOrigNode()->Call);
5124 OrigCall.setCloneNo(0);
5125 DenseMap<CallInfo, CallInfo> &CallMap =
5126 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5127 assert(CallMap.count(OrigCall));
5128 CallInfo NewCall(CallMap[OrigCall]);
5129 assert(NewCall);
5130 NewClone->setCall(NewCall);
5131 // Need to do the same for all matching calls.
5132 for (auto &MatchingCall : NewClone->MatchingCalls) {
5133 CallInfo OrigMatchingCall(MatchingCall);
5134 OrigMatchingCall.setCloneNo(0);
5135 assert(CallMap.count(OrigMatchingCall));
5136 CallInfo NewCall(CallMap[OrigMatchingCall]);
5137 assert(NewCall);
5138 // Updates the call in the list.
5139 MatchingCall = NewCall;
5140 }
5141 }
5142 }
5143 // Fall through to handling below to perform the recording of the
5144 // function for this callsite clone. This enables handling of cases
5145 // where the callers were assigned to different clones of a function.
5146 }
5147
5148 auto FindFirstAvailFuncClone = [&]() {
5149 // Find first function in FuncCloneInfos without an assigned
5150 // clone of this callsite Node. We should always have one
5151 // available at this point due to the earlier cloning when the
5152 // FuncCloneInfos size was smaller than the clone number.
5153 for (auto &CF : FuncCloneInfos) {
5154 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5155 return CF.FuncClone;
5156 }
5157 llvm_unreachable(
5158 "Expected an available func clone for this callsite clone");
5159 };
5160
5161 // See if we can use existing function clone. Walk through
5162 // all caller edges to see if any have already been assigned to
5163 // a clone of this callsite's function. If we can use it, do so. If not,
5164 // because that function clone is already assigned to a different clone
5165 // of this callsite, then we need to clone again.
5166 // Basically, this checking is needed to handle the case where different
5167 // caller functions/callsites may need versions of this function
5168 // containing different mixes of callsite clones across the different
5169 // callsites within the function. If that happens, we need to create
5170 // additional function clones to handle the various combinations.
5171 //
5172 // Keep track of any new clones of this callsite created by the
5173 // following loop, as well as any existing clone that we decided to
5174 // assign this clone to.
5175 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5176 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5177 // Iterate over a copy of Clone's caller edges, since we may need to
5178 // remove edges in the moveEdgeTo* methods, and this simplifies the
5179 // handling and makes it less error-prone.
5180 auto CloneCallerEdges = Clone->CallerEdges;
5181 for (auto &Edge : CloneCallerEdges) {
5182 // Skip removed edges (due to direct recursive edges updated when
5183 // updating callee edges when moving an edge and subsequently
5184 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5185 if (Edge->isRemoved())
5186 continue;
5187 // Ignore any caller that does not have a recorded callsite Call.
5188 if (!Edge->Caller->hasCall())
5189 continue;
5190 // If this caller already assigned to call a version of OrigFunc, need
5191 // to ensure we can assign this callsite clone to that function clone.
5192 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5193 FuncInfo FuncCloneCalledByCaller =
5194 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5195 // First we need to confirm that this function clone is available
5196 // for use by this callsite node clone.
5197 //
5198 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5199 // its callsite clones, one of those callsite clones X could have
5200 // been assigned to the same function clone called by Edge's caller
5201 // - if Edge's caller calls another callsite within Node's original
5202 // function, and that callsite has another caller reaching clone X.
5203 // We need to clone Node again in this case.
5204 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5205 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5206 Clone) ||
5207 // Detect when we have multiple callers of this callsite that
5208 // have already been assigned to specific, and different, clones
5209 // of OrigFunc (due to other unrelated callsites in Func they
5210 // reach via call contexts). Is this Clone of callsite Node
5211 // assigned to a different clone of OrigFunc? If so, clone Node
5212 // again.
5213 (FuncCloneAssignedToCurCallsiteClone &&
5214 FuncCloneAssignedToCurCallsiteClone !=
5215 FuncCloneCalledByCaller)) {
5216 // We need to use a different newly created callsite clone, in
5217 // order to assign it to another new function clone on a
5218 // subsequent iteration over the Clones array (adjusted below).
5219 // Note we specifically do not reset the
5220 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5221 // when this new clone is processed later we know which version of
5222 // the function to copy (so that other callsite clones we have
5223 // assigned to that function clone are properly cloned over). See
5224 // comments in the function cloning handling earlier.
5225
5226 // Check if we already have cloned this callsite again while
5227 // walking through caller edges, for a caller calling the same
5228 // function clone. If so, we can move this edge to that new clone
5229 // rather than creating yet another new clone.
5230 if (FuncCloneToNewCallsiteCloneMap.count(
5231 FuncCloneCalledByCaller)) {
5232 ContextNode *NewClone =
5233 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5234 moveEdgeToExistingCalleeClone(Edge, NewCallee: NewClone);
5235 // Cleanup any none type edges cloned over.
5236 removeNoneTypeCalleeEdges(Node: NewClone);
5237 } else {
5238 // Create a new callsite clone.
5239 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5240 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5241 NewClone;
5242 // Add to list of clones and process later.
5243 ClonesWorklist.push_back(NewClone);
5244 }
5245 // Moving the caller edge may have resulted in some none type
5246 // callee edges.
5247 removeNoneTypeCalleeEdges(Node: Clone);
5248 // We will handle the newly created callsite clone in a subsequent
5249 // iteration over this Node's Clones.
5250 continue;
5251 }
5252
5253 // Otherwise, we can use the function clone already assigned to this
5254 // caller.
5255 if (!FuncCloneAssignedToCurCallsiteClone) {
5256 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5257 // Assign Clone to FuncCloneCalledByCaller
5258 AssignCallsiteCloneToFuncClone(
5259 FuncCloneCalledByCaller, Call, Clone,
5260 AllocationCallToContextNodeMap.count(Call));
5261 } else
5262 // Don't need to do anything - callsite is already calling this
5263 // function clone.
5264 assert(FuncCloneAssignedToCurCallsiteClone ==
5265 FuncCloneCalledByCaller);
5266
5267 } else {
5268 // We have not already assigned this caller to a version of
5269 // OrigFunc. Do the assignment now.
5270
5271 // First check if we have already assigned this callsite clone to a
5272 // clone of OrigFunc for another caller during this iteration over
5273 // its caller edges.
5274 if (!FuncCloneAssignedToCurCallsiteClone) {
5275 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5276 assert(FuncCloneAssignedToCurCallsiteClone);
5277 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5278 AssignCallsiteCloneToFuncClone(
5279 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5280 AllocationCallToContextNodeMap.count(Call));
5281 } else
5282 assert(FuncCloneToCurNodeCloneMap
5283 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5284 // Update callers to record function version called.
5285 RecordCalleeFuncOfCallsite(Edge->Caller,
5286 FuncCloneAssignedToCurCallsiteClone);
5287 }
5288 }
5289 // If we didn't assign a function clone to this callsite clone yet, e.g.
5290 // none of its callers has a non-null call, do the assignment here.
5291 // We want to ensure that every callsite clone is assigned to some
5292 // function clone, so that the call updates below work as expected.
5293 // In particular if this is the original callsite, we want to ensure it
5294 // is assigned to the original function, otherwise the original function
5295 // will appear available for assignment to other callsite clones,
5296 // leading to unintended effects. For one, the unknown and not updated
5297 // callers will call into cloned paths leading to the wrong hints,
5298 // because they still call the original function (clone 0). Also,
5299 // because all callsites start out as being clone 0 by default, we can't
5300 // easily distinguish between callsites explicitly assigned to clone 0
5301 // vs those never assigned, which can lead to multiple updates of the
5302 // calls when invoking updateCall below, with mismatched clone values.
5303 // TODO: Add a flag to the callsite nodes or some other mechanism to
5304 // better distinguish and identify callsite clones that are not getting
5305 // assigned to function clones as expected.
5306 if (!FuncCloneAssignedToCurCallsiteClone) {
5307 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5308 assert(FuncCloneAssignedToCurCallsiteClone &&
5309 "No available func clone for this callsite clone");
5310 AssignCallsiteCloneToFuncClone(
5311 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5312 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5313 }
5314 }
5315 if (VerifyCCG) {
5316 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5317 for (const auto &PE : Node->CalleeEdges)
5318 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5319 for (const auto &CE : Node->CallerEdges)
5320 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5321 for (auto *Clone : Node->Clones) {
5322 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5323 for (const auto &PE : Clone->CalleeEdges)
5324 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5325 for (const auto &CE : Clone->CallerEdges)
5326 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5327 }
5328 }
5329 }
5330
5331 if (FuncCloneInfos.size() < 2)
5332 continue;
5333
5334 // In this case there is more than just the original function copy.
5335 // Record call clones of any callsite nodes in the function that did not
5336 // themselves get cloned for all of the function clones.
5337 for (auto &Call : CallsWithMetadata) {
5338 ContextNode *Node = getNodeForInst(C: Call);
5339 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5340 continue;
5341 // If Node has enough clones already to cover all function clones, we can
5342 // skip it. Need to add one for the original copy.
5343 // Use >= in case there were clones that were skipped due to having empty
5344 // context ids
5345 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5346 continue;
5347 // First collect all function clones we cloned this callsite node for.
5348 // They may not be sequential due to empty clones e.g.
5349 DenseSet<unsigned> NodeCallClones;
5350 for (auto *C : Node->Clones)
5351 NodeCallClones.insert(C->Call.cloneNo());
5352 unsigned I = 0;
5353 // Now check all the function clones.
5354 for (auto &FC : FuncCloneInfos) {
5355 // Function clones should be sequential.
5356 assert(FC.FuncClone.cloneNo() == I);
5357 // Skip the first clone which got the original call.
5358 // Also skip any other clones created for this Node.
5359 if (++I == 1 || NodeCallClones.contains(V: I)) {
5360 continue;
5361 }
5362 // Record the call clones created for this callsite in this function
5363 // clone.
5364 auto &CallVector = UnassignedCallClones[Node][I];
5365 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5366 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5367 CallInfo CallClone = It->second;
5368 CallVector.push_back(CallClone);
5369 } else {
5370 // All but the original clone (skipped earlier) should have an entry
5371 // for all calls.
5372 assert(false && "Expected to find call in CallMap");
5373 }
5374 // Need to do the same for all matching calls.
5375 for (auto &MatchingCall : Node->MatchingCalls) {
5376 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5377 CallInfo CallClone = It->second;
5378 CallVector.push_back(CallClone);
5379 } else {
5380 // All but the original clone (skipped earlier) should have an entry
5381 // for all calls.
5382 assert(false && "Expected to find call in CallMap");
5383 }
5384 }
5385 }
5386 }
5387 }
5388
5389 uint8_t BothTypes =
5390 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5391
5392 auto UpdateCalls = [&](ContextNode *Node,
5393 DenseSet<const ContextNode *> &Visited,
5394 auto &&UpdateCalls) {
5395 auto Inserted = Visited.insert(Node);
5396 if (!Inserted.second)
5397 return;
5398
5399 for (auto *Clone : Node->Clones)
5400 UpdateCalls(Clone, Visited, UpdateCalls);
5401
5402 for (auto &Edge : Node->CallerEdges)
5403 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5404
5405 // Skip if either no call to update, or if we ended up with no context ids
5406 // (we moved all edges onto other clones).
5407 if (!Node->hasCall() || Node->emptyContextIds())
5408 return;
5409
5410 if (Node->IsAllocation) {
5411 auto AT = allocTypeToUse(Node->AllocTypes);
5412 // If the allocation type is ambiguous, and more aggressive hinting
5413 // has been enabled via the MinClonedColdBytePercent flag, see if this
5414 // allocation should be hinted cold anyway because its fraction cold bytes
5415 // allocated is at least the given threshold.
5416 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5417 !ContextIdToContextSizeInfos.empty()) {
5418 uint64_t TotalCold = 0;
5419 uint64_t Total = 0;
5420 for (auto Id : Node->getContextIds()) {
5421 auto TypeI = ContextIdToAllocationType.find(Id);
5422 assert(TypeI != ContextIdToAllocationType.end());
5423 auto CSI = ContextIdToContextSizeInfos.find(Id);
5424 if (CSI != ContextIdToContextSizeInfos.end()) {
5425 for (auto &Info : CSI->second) {
5426 Total += Info.TotalSize;
5427 if (TypeI->second == AllocationType::Cold)
5428 TotalCold += Info.TotalSize;
5429 }
5430 }
5431 }
5432 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5433 AT = AllocationType::Cold;
5434 }
5435 updateAllocationCall(Call&: Node->Call, AllocType: AT);
5436 assert(Node->MatchingCalls.empty());
5437 return;
5438 }
5439
5440 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5441 return;
5442
5443 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5444 updateCall(CallerCall&: Node->Call, CalleeFunc);
5445 // Update all the matching calls as well.
5446 for (auto &Call : Node->MatchingCalls)
5447 updateCall(CallerCall&: Call, CalleeFunc);
5448
5449 // Now update all calls recorded earlier that are still in function clones
5450 // which don't have a clone of this callsite node.
5451 if (!UnassignedCallClones.contains(Node))
5452 return;
5453 DenseSet<unsigned> NodeCallClones;
5454 for (auto *C : Node->Clones)
5455 NodeCallClones.insert(C->Call.cloneNo());
5456 // Note that we already confirmed Node is in this map a few lines above.
5457 auto &ClonedCalls = UnassignedCallClones[Node];
5458 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5459 // Should start at 1 as we never create an entry for original node.
5460 assert(CloneNo > 0);
5461 // If we subsequently created a clone, skip this one.
5462 if (NodeCallClones.contains(V: CloneNo))
5463 continue;
5464 // Use the original Node's CalleeFunc.
5465 for (auto &Call : CallVector)
5466 updateCall(CallerCall&: Call, CalleeFunc);
5467 }
5468 };
5469
5470 // Performs DFS traversal starting from allocation nodes to update calls to
5471 // reflect cloning decisions recorded earlier. For regular LTO this will
5472 // update the actual calls in the IR to call the appropriate function clone
5473 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5474 // are recorded in the summary entries.
5475 DenseSet<const ContextNode *> Visited;
5476 for (auto &Entry : AllocationCallToContextNodeMap)
5477 UpdateCalls(Entry.second, Visited, UpdateCalls);
5478
5479 return Changed;
5480}
5481
// Compute a SHA1 hash of the callsite and alloc version information of clone I
// in the summary, to use in detection of duplicate clones. Two clones that
// call all the same callee versions and carry all the same allocation hints
// hash identically, so one can be emitted as an alias of the other.
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
  SHA1 Hasher;
  // Update hash with any callsites that call non-default (non-zero) callee
  // versions. Default (zero) entries are skipped so they do not perturb the
  // hash.
  for (auto &SN : FS->callsites()) {
    // In theory all callsites and allocs in this function should have the same
    // number of clone entries, but handle any discrepancies gracefully below
    // for NDEBUG builds.
    assert(
        SN.Clones.size() > I &&
        "Callsite summary has fewer entries than other summaries in function");
    if (SN.Clones.size() <= I || !SN.Clones[I])
      continue;
    // Serialize the clone number in a fixed little-endian byte order so the
    // resulting hash is stable across hosts.
    uint8_t Data[sizeof(SN.Clones[I])];
    support::endian::write32le(P: Data, V: SN.Clones[I]);
    Hasher.update(Data);
  }
  // Update hash with any allocs that have non-default (non-None) hints.
  for (auto &AN : FS->allocs()) {
    // In theory all callsites and allocs in this function should have the same
    // number of clone entries, but handle any discrepancies gracefully below
    // for NDEBUG builds.
    assert(AN.Versions.size() > I &&
           "Alloc summary has fewer entries than other summaries in function");
    if (AN.Versions.size() <= I ||
        (AllocationType)AN.Versions[I] == AllocationType::None)
      continue;
    // Each version's allocation type is a single byte; hash it directly.
    Hasher.update(Data: ArrayRef<uint8_t>(&AN.Versions[I], 1));
  }
  // Fold the 20-byte SHA1 digest down to its first 8 bytes to get a compact
  // 64-bit key for the duplicate-detection map.
  return support::endian::read64le(P: Hasher.result().data());
}
5515
// Create the NumClones-1 new clones of F needed by the thin-link cloning
// decisions (clone 0 is the original F itself). Returns one ValueToValueMapTy
// per new clone (used later to remap callsites into the clone bodies). Clones
// whose callsite/alloc version info hashes identically to an already
// instantiated clone are not materialized; instead an alias (or, for
// available_externally functions, a declaration) is created to satisfy
// references. Aliases to F are cloned alongside each function clone.
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
        &FuncToAliasMap,
    FunctionSummary *FS) {
  // Replace an existing declaration with the same target name by the newly
  // created definition/alias: steal its name, redirect uses, and delete it.
  auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
    // We might have created this when adjusting callsite in another
    // function. It should be a declaration.
    assert(DeclGV->isDeclaration());
    NewGV->takeName(V: DeclGV);
    DeclGV->replaceAllUsesWith(V: NewGV);
    DeclGV->eraseFromParent();
  };

  // Handle aliases to this function, and create analogous alias clones to the
  // provided clone of this function.
  auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
    if (!FuncToAliasMap.count(x: &F))
      return;
    for (auto *A : FuncToAliasMap[&F]) {
      // The alias clone gets the same ".memprof.<I>" suffixed name as the
      // function clone it points at.
      std::string AliasName = getMemProfFuncName(Base: A->getName(), CloneNo: I);
      auto *PrevA = M.getNamedAlias(Name: AliasName);
      auto *NewA = GlobalAlias::create(Ty: A->getValueType(),
                                       AddressSpace: A->getType()->getPointerAddressSpace(),
                                       Linkage: A->getLinkage(), Name: AliasName, Aliasee: NewF);
      NewA->copyAttributesFrom(Src: A);
      // If a declaration with this name already existed (created while
      // adjusting a callsite elsewhere), fold it into the new alias.
      if (PrevA)
        TakeDeclNameAndReplace(PrevA, NewA);
    }
  };

  // The first "clone" is the original copy, we should only call this if we
  // needed to create new clones.
  assert(NumClones > 1);
  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
  VMaps.reserve(N: NumClones - 1);
  FunctionsClonedThinBackend++;

  // Map of hash of callsite/alloc versions to the instantiated function clone
  // (possibly the original) implementing those calls. Used to avoid
  // instantiating duplicate function clones.
  // FIXME: Ideally the thin link would not generate such duplicate clones to
  // start with, but right now it happens due to phase ordering in the function
  // assignment and possible new clones that produces. We simply make each
  // duplicate an alias to the matching instantiated clone recorded in the map
  // (except for available_externally which are made declarations as they would
  // be aliases in the prevailing module, and available_externally aliases are
  // not well supported right now).
  DenseMap<uint64_t, Function *> HashToFunc;

  // Save the hash of the original function version.
  HashToFunc[ComputeHash(FS, I: 0)] = &F;

  for (unsigned I = 1; I < NumClones; I++) {
    // A VMap is recorded for every clone number, even deduplicated ones, so
    // that VMaps stays parallel with clone indices (size NumClones - 1).
    VMaps.emplace_back(Args: std::make_unique<ValueToValueMapTy>());
    std::string Name = getMemProfFuncName(Base: F.getName(), CloneNo: I);
    auto Hash = ComputeHash(FS, I);
    // If this clone would duplicate a previously seen clone, don't generate the
    // duplicate clone body, just make an alias to satisfy any (potentially
    // cross-module) references.
    if (HashToFunc.contains(Val: Hash)) {
      FunctionCloneDuplicatesThinBackend++;
      auto *Func = HashToFunc[Hash];
      if (Func->hasAvailableExternallyLinkage()) {
        // Skip these as EliminateAvailableExternallyPass does not handle
        // available_externally aliases correctly and we end up with an
        // available_externally alias to a declaration. Just create a
        // declaration for now as we know we will have a definition in another
        // module.
        auto Decl = M.getOrInsertFunction(Name, T: Func->getFunctionType());
        ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
                 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
        continue;
      }
      auto *PrevF = M.getFunction(Name);
      auto *Alias = GlobalAlias::create(Name, Aliasee: Func);
      if (PrevF)
        TakeDeclNameAndReplace(PrevF, Alias);
      ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
               << "created clone alias " << ore::NV("Alias", Alias));

      // Now handle aliases to this function, and clone those as well.
      CloneFuncAliases(Func, I);
      continue;
    }
    // No duplicate: materialize a real clone body.
    auto *NewF = CloneFunction(F: &F, VMap&: *VMaps.back());
    HashToFunc[Hash] = NewF;
    FunctionClonesThinBackend++;
    // Strip memprof and callsite metadata from clone as they are no longer
    // needed.
    for (auto &BB : *NewF) {
      for (auto &Inst : BB) {
        Inst.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
        Inst.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
      }
    }
    // Give the clone its suffixed name, folding in any pre-existing
    // declaration of that name.
    auto *PrevF = M.getFunction(Name);
    if (PrevF)
      TakeDeclNameAndReplace(PrevF, NewF);
    else
      NewF->setName(Name);
    // Keep debug info consistent with the renamed clone.
    updateSubprogramLinkageName(NewFunc: NewF, Name);
    ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
             << "created clone " << ore::NV("NewFunction", NewF));

    // Now handle aliases to this function, and clone those as well.
    CloneFuncAliases(NewF, I);
  }
  return VMaps;
}
5626
// Locate the summary for F. This is complicated by the fact that it might
// have been internalized or promoted. Tries, in order: the current GUID, the
// GUID assuming external linkage (for internalized symbols), the GUID derived
// from the pre-promotion local name + originating source file, and finally
// that same lookup with any IR-linker numbered suffix stripped. Returns a
// null ValueInfo only for declarations created for imported references.
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
                                      const ModuleSummaryIndex *ImportSummary,
                                      const Function *CallingFunc = nullptr) {
  // FIXME: Ideally we would retain the original GUID in some fashion on the
  // function (e.g. as metadata), but for now do our best to locate the
  // summary without that information.
  ValueInfo TheFnVI = ImportSummary->getValueInfo(GUID: F.getGUID());
  if (!TheFnVI)
    // See if theFn was internalized, by checking index directly with
    // original name (this avoids the name adjustment done by getGUID() for
    // internal symbols).
    TheFnVI = ImportSummary->getValueInfo(
        GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: F.getName()));
  if (TheFnVI)
    return TheFnVI;
  // Now query with the original name before any promotion was performed.
  StringRef OrigName =
      ModuleSummaryIndex::getOriginalNameBeforePromote(Name: F.getName());
  // When this pass is enabled, we always add thinlto_src_file provenance
  // metadata to imported function definitions, which allows us to recreate the
  // original internal symbol's GUID.
  auto SrcFileMD = F.getMetadata(Kind: "thinlto_src_file");
  // If this is a call to an imported/promoted local for which we didn't import
  // the definition, the metadata will not exist on the declaration. However,
  // since we are doing this early, before any inlining in the LTO backend, we
  // can simply look at the metadata on the calling function which must have
  // been from the same module if F was an internal symbol originally.
  if (!SrcFileMD && F.isDeclaration()) {
    // We would only call this for a declaration for a direct callsite, in which
    // case the caller would have provided the calling function pointer.
    assert(CallingFunc);
    SrcFileMD = CallingFunc->getMetadata(Kind: "thinlto_src_file");
    // If this is a promoted local (OrigName != F.getName()), since this is a
    // declaration, it must be imported from a different module and therefore we
    // should always find the metadata on its calling function. Any call to a
    // promoted local that came from this module should still be a definition.
    assert(SrcFileMD || OrigName == F.getName());
  }
  // Default to this module's own source file when no provenance metadata was
  // found (i.e. the symbol originated here).
  StringRef SrcFile = M.getSourceFileName();
  if (SrcFileMD)
    // NOTE(review): the dyn_cast result is dereferenced unconditionally;
    // operand 0 is presumably always an MDString for this metadata kind --
    // confirm, else cast<> would better document that invariant.
    SrcFile = dyn_cast<MDString>(Val: SrcFileMD->getOperand(I: 0))->getString();
  // Reconstruct the GUID a local symbol with this name and file would have
  // been given at summary-index build time.
  std::string OrigId = GlobalValue::getGlobalIdentifier(
      Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
  TheFnVI = ImportSummary->getValueInfo(
      GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
  // Internal func in original module may have gotten a numbered suffix if we
  // imported an external function with the same name. This happens
  // automatically during IR linking for naming conflicts. It would have to
  // still be internal in that case (otherwise it would have been renamed on
  // promotion in which case we wouldn't have a naming conflict).
  if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
      F.getName().contains(C: '.')) {
    OrigName = F.getName().rsplit(Separator: '.').first;
    OrigId = GlobalValue::getGlobalIdentifier(
        Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
    TheFnVI = ImportSummary->getValueInfo(
        GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
  }
  // The only way we may not have a VI is if this is a declaration created for
  // an imported reference. For distributed ThinLTO we may not have a VI for
  // such declarations in the distributed summary.
  assert(TheFnVI || F.isDeclaration());
  return TheFnVI;
}
5693
5694bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5695 Module &M) {
5696 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5697 Symtab = std::make_unique<InstrProfSymtab>();
5698 // Don't add canonical names, to avoid multiple functions to the symtab
5699 // when they both have the same root name with "." suffixes stripped.
5700 // If we pick the wrong one then this could lead to incorrect ICP and calling
5701 // a memprof clone that we don't actually create (resulting in linker unsats).
5702 // What this means is that the GUID of the function (or its PGOFuncName
5703 // metadata) *must* match that in the VP metadata to allow promotion.
5704 // In practice this should not be a limitation, since local functions should
5705 // have PGOFuncName metadata and global function names shouldn't need any
5706 // special handling (they should not get the ".llvm.*" suffix that the
5707 // canonicalization handling is attempting to strip).
5708 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5709 std::string SymtabFailure = toString(E: std::move(E));
5710 M.getContext().emitError(ErrorStr: "Failed to create symtab: " + SymtabFailure);
5711 return false;
5712 }
5713 return true;
5714}
5715
5716#ifndef NDEBUG
// Sanity check that the MIB stack ids match between the summary and
// instruction metadata. Walks each MIB operand of the allocation's memprof
// metadata in parallel with AllocNode.MIBs, asserting that every (deduped)
// stack frame id in the metadata equals the stack id recorded at the
// corresponding summary index. Debug/assert builds only (NDEBUG-guarded).
static void checkAllocContextIds(
    const AllocInfo &AllocNode, const MDNode *MemProfMD,
    const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
    const ModuleSummaryIndex *ImportSummary) {
  // Summary MIB entries are expected to parallel the metadata operands.
  auto MIBIter = AllocNode.MIBs.begin();
  for (auto &MDOp : MemProfMD->operands()) {
    assert(MIBIter != AllocNode.MIBs.end());
    auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
    auto *MIBMD = cast<const MDNode>(MDOp);
    MDNode *StackMDNode = getMIBStackNode(MIBMD);
    assert(StackMDNode);
    CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
    // Start past the frames shared with the callsite's own context, which are
    // not part of the summary's per-MIB stack id indices.
    auto ContextIterBegin =
        StackContext.beginAfterSharedPrefix(CallsiteContext);
    // Skip the checking on the first iteration.
    // (Seed LastStackContextId with a value that cannot equal the first
    // frame's id, so the dedup test below never fires on it.)
    uint64_t LastStackContextId =
        (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
                                                                           : 0;
    for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
         ++ContextIter) {
      // If this is a direct recursion, simply skip the duplicate
      // entries, to be consistent with how the summary ids were
      // generated during ModuleSummaryAnalysis.
      if (LastStackContextId == *ContextIter)
        continue;
      LastStackContextId = *ContextIter;
      assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
      // The summary index must resolve to the same stack id as the metadata.
      assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
             *ContextIter);
      StackIdIndexIter++;
    }
    MIBIter++;
  }
}
5753#endif
5754
// Apply the cloning decisions recorded in the import summary (made during the
// thin link) to the IR in module M: clone functions the requested number of
// times, redirect calls in the clones to the assigned callee clones, attach
// "memprof" allocation-type attributes to allocation calls, and perform any
// indirect call promotion needed so clones can call cloned targets.
// Returns true if the module was changed.
bool MemProfContextDisambiguation::applyImport(Module &M) {
  assert(ImportSummary);
  bool Changed = false;

  // We also need to clone any aliases that reference cloned functions, because
  // the modified callsites may invoke via the alias. Keep track of the aliases
  // for each function.
  std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
      FuncToAliasMap;
  for (auto &A : M.aliases()) {
    auto *Aliasee = A.getAliaseeObject();
    if (auto *F = dyn_cast<Function>(Val: Aliasee))
      FuncToAliasMap[F].insert(Ptr: &A);
  }

  if (!initializeIndirectCallPromotionInfo(M))
    return false;

  for (auto &F : M) {
    if (F.isDeclaration() || isMemProfClone(F))
      continue;

    OptimizationRemarkEmitter ORE(&F);

    // VMaps[J-1] maps original values to their copies in clone J; clone 0 is
    // the original function and has no VMap entry.
    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
    bool ClonesCreated = false;
    unsigned NumClonesCreated = 0;
    // Lazily clones F the first time a callsite/alloc in it requires clones.
    auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
      // We should at least have version 0 which is the original copy.
      assert(NumClones > 0);
      // If only one copy needed use original.
      if (NumClones == 1)
        return;
      // If we already performed cloning of this function, confirm that the
      // requested number of clones matches (the thin link should ensure the
      // number of clones for each constituent callsite is consistent within
      // each function), before returning.
      if (ClonesCreated) {
        assert(NumClonesCreated == NumClones);
        return;
      }
      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
      // The first "clone" is the original copy, which doesn't have a VMap.
      assert(VMaps.size() == NumClones - 1);
      Changed = true;
      ClonesCreated = true;
      NumClonesCreated = NumClones;
    };

    // Updates the call CB (and its copies in each clone of F) to invoke the
    // callee clone assigned by the summary's StackNode record.
    auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
                             Function *CalledFunction, FunctionSummary *FS) {
      // Perform cloning if not yet done.
      CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);

      assert(!isMemProfClone(*CalledFunction));

      // Because we update the cloned calls by calling setCalledOperand (see
      // comment below), out of an abundance of caution make sure the called
      // function was actually the called operand (or its aliasee). We also
      // strip pointer casts when looking for calls (to match behavior during
      // summary generation), however, with opaque pointers in theory this
      // should not be an issue. Note we still clone the current function
      // (containing this call) above, as that could be needed for its callers.
      auto *GA = dyn_cast_or_null<GlobalAlias>(Val: CB->getCalledOperand());
      if (CalledFunction != CB->getCalledOperand() &&
          (!GA || CalledFunction != GA->getAliaseeObject())) {
        SkippedCallsCloning++;
        return;
      }
      // Update the calls per the summary info.
      // Save orig name since it gets updated in the first iteration
      // below.
      auto CalleeOrigName = CalledFunction->getName();
      for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
        // If the VMap is empty, this clone was a duplicate of another and was
        // created as an alias or a declaration.
        if (J > 0 && VMaps[J - 1]->empty())
          continue;
        // Do nothing if this version calls the original version of its
        // callee.
        if (!StackNode.Clones[J])
          continue;
        auto NewF = M.getOrInsertFunction(
            Name: getMemProfFuncName(Base: CalleeOrigName, CloneNo: StackNode.Clones[J]),
            T: CalledFunction->getFunctionType());
        CallBase *CBClone;
        // Copy 0 is the original function.
        if (!J)
          CBClone = CB;
        else
          CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
        // Set the called operand directly instead of calling setCalledFunction,
        // as the latter mutates the function type on the call. In rare cases
        // we may have a slightly different type on a callee function
        // declaration due to it being imported from a different module with
        // incomplete types. We really just want to change the name of the
        // function to the clone, and not make any type changes.
        CBClone->setCalledOperand(NewF.getCallee());
        ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " assigned to call function clone "
                 << ore::NV("Callee", NewF.getCallee()));
      }
    };

    // Locate the summary for F.
    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
    // If not found, this could be an imported local (see comment in
    // findValueInfoForFunc). Skip for now as it will be cloned in its original
    // module (where it would have been promoted to global scope so should
    // satisfy any reference in this module).
    if (!TheFnVI)
      continue;

    auto *GVSummary =
        ImportSummary->findSummaryInModule(VI: TheFnVI, ModuleId: M.getModuleIdentifier());
    if (!GVSummary) {
      // Must have been imported, use the summary which matches the definition.
      // (might be multiple if this was a linkonce_odr).
      auto SrcModuleMD = F.getMetadata(Kind: "thinlto_src_module");
      assert(SrcModuleMD &&
             "enable-import-metadata is needed to emit thinlto_src_module");
      StringRef SrcModule =
          dyn_cast<MDString>(Val: SrcModuleMD->getOperand(I: 0))->getString();
      for (auto &GVS : TheFnVI.getSummaryList()) {
        if (GVS->modulePath() == SrcModule) {
          GVSummary = GVS.get();
          break;
        }
      }
      assert(GVSummary && GVSummary->modulePath() == SrcModule);
    }

    // If this was an imported alias skip it as we won't have the function
    // summary, and it should be cloned in the original module.
    if (isa<AliasSummary>(Val: GVSummary))
      continue;

    auto *FS = cast<FunctionSummary>(Val: GVSummary->getBaseObject());

    if (FS->allocs().empty() && FS->callsites().empty())
      continue;

    // Iterators into the summary's callsite and alloc records, advanced in
    // lockstep with the instruction walk below.
    auto SI = FS->callsites().begin();
    auto AI = FS->allocs().begin();

    // To handle callsite infos synthesized for tail calls which have missing
    // frames in the profiled context, map callee VI to the synthesized callsite
    // info.
    DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
    // Iterate the callsites for this function in reverse, since we place all
    // those synthesized for tail calls at the end.
    for (auto CallsiteIt = FS->callsites().rbegin();
         CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
      auto &Callsite = *CallsiteIt;
      // Stop as soon as we see a non-synthesized callsite info (see comment
      // above loop). All the entries added for discovered tail calls have empty
      // stack ids.
      if (!Callsite.StackIdIndices.empty())
        break;
      MapTailCallCalleeVIToCallsite.insert(KV: {Callsite.Callee, Callsite});
    }

    // Keeps track of needed ICP for the function.
    SmallVector<ICallAnalysisData> ICallAnalysisInfo;

    // Assume for now that the instructions are in the exact same order
    // as when the summary was created, but confirm this is correct by
    // matching the stack ids.
    for (auto &BB : F) {
      for (auto &I : BB) {
        auto *CB = dyn_cast<CallBase>(Val: &I);
        // Same handling as when creating module summary.
        if (!mayHaveMemprofSummary(CB))
          continue;

        auto *CalledValue = CB->getCalledOperand();
        auto *CalledFunction = CB->getCalledFunction();
        if (CalledValue && !CalledFunction) {
          CalledValue = CalledValue->stripPointerCasts();
          // Stripping pointer casts can reveal a called function.
          CalledFunction = dyn_cast<Function>(Val: CalledValue);
        }
        // Check if this is an alias to a function. If so, get the
        // called aliasee for the checks below.
        if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
          assert(!CalledFunction &&
                 "Expected null called function in callsite for alias");
          CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
        }

        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
            I.getMetadata(KindID: LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics. Only do this for those that do not have
        // memprof metadata, since we add an "ambiguous" memprof attribute by
        // default.
        if (CB->getAttributes().hasFnAttr(Kind: "memprof") && !MemProfMD) {
          CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

#ifndef NDEBUG
          checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
                               ImportSummary);
#endif

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();

          // If there is only one version that means we didn't end up
          // considering this function for cloning, and in that case the alloc
          // will still be none type or should have gotten the default NotCold.
          // Skip that after calling clone helper since that does some sanity
          // checks that confirm we haven't decided yet that we need cloning.
          // We might have a single version that is cold due to the
          // MinClonedColdBytePercent heuristic, make sure we don't skip in that
          // case.
          if (AllocNode.Versions.size() == 1 &&
              (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));

          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // If the VMap is empty, this clone was a duplicate of another and
            // was created as an alias or a declaration.
            if (J > 0 && VMaps[J - 1]->empty())
              continue;
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocTy);
            auto A = llvm::Attribute::get(Context&: F.getContext(), Kind: "memprof",
                                          Val: AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a VMaps
              // entry).
              CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
            removeAnyExistingAmbiguousAttribute(CB: CBClone);
            CBClone->addFnAttr(Attr: A);
            ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
        } else if (!CallsiteContext.empty()) {
          if (!CalledFunction) {
#ifndef NDEBUG
            // We should have skipped inline assembly calls.
            auto *CI = dyn_cast<CallInst>(CB);
            assert(!CI || !CI->isInlineAsm());
#endif
            // We should have skipped direct calls via a Constant.
            assert(CalledValue && !isa<Constant>(CalledValue));

            // This is an indirect call, see if we have profile information and
            // whether any clones were recorded for the profiled targets (that
            // we synthesized CallsiteInfo summary records for when building the
            // index).
            auto NumClones =
                recordICPInfo(CB, AllCallsites: FS->callsites(), SI, ICallAnalysisInfo);

            // Perform cloning if not yet done. This is done here in case
            // we don't need to do ICP, but might need to clone this
            // function as it is the target of other cloned calls.
            if (NumClones)
              CloneFuncIfNeeded(NumClones, FS);
          }

          else {
            // Consult the next callsite node.
            assert(SI != FS->callsites().end());
            auto &StackNode = *(SI++);

#ifndef NDEBUG
            // Sanity check that the stack ids match between the summary and
            // instruction metadata.
            auto StackIdIndexIter = StackNode.StackIdIndices.begin();
            for (auto StackId : CallsiteContext) {
              assert(StackIdIndexIter != StackNode.StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     StackId);
              StackIdIndexIter++;
            }
#endif

            CloneCallsite(StackNode, CB, CalledFunction, FS);
          }
        } else if (CB->isTailCall() && CalledFunction) {
          // Locate the synthesized callsite info for the callee VI, if any was
          // created, and use that for cloning.
          ValueInfo CalleeVI =
              findValueInfoForFunc(F: *CalledFunction, M, ImportSummary, CallingFunc: &F);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(Val: CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(Val: CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction, FS);
          }
        }
      }
    }

    // Now do any promotion required for cloning.
    performICP(M, AllCallsites: FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
  }

  // We skip some of the functions and instructions above, so remove all the
  // metadata in a single sweep here.
  for (auto &F : M) {
    // We can skip memprof clones because createFunctionClones already strips
    // the metadata from the newly created clones.
    if (F.isDeclaration() || isMemProfClone(F))
      continue;
    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(Val: I))
          continue;
        I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
        I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
      }
    }
  }

  return Changed;
}
6120
// Examine an indirect call CB to determine whether indirect call promotion
// (ICP) will be needed so that clones can call cloned targets. Consumes the
// CallsiteInfo summary records synthesized for the profiled targets by
// advancing SI, and records the information needed to perform ICP later (in
// performICP) into ICallAnalysisInfo. Returns the number of function clones
// (0 if there was no value profile data for this call).
unsigned MemProfContextDisambiguation::recordICPInfo(
    CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<CallsiteInfo>::iterator &SI,
    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
  // First see if we have profile information for this indirect call.
  uint32_t NumCandidates;
  uint64_t TotalCount;
  auto CandidateProfileData =
      ICallAnalysis->getPromotionCandidatesForInstruction(
          I: CB, TotalCount, NumCandidates, MaxNumValueData: MaxSummaryIndirectEdges);
  if (CandidateProfileData.empty())
    return 0;

  // Iterate through all of the candidate profiled targets along with the
  // CallsiteInfo summary records synthesized for them when building the index,
  // and see if any are cloned and/or refer to clones.
  bool ICPNeeded = false;
  unsigned NumClones = 0;
  // Remember where this call's synthesized records start, so performICP can
  // re-walk them later.
  size_t CallsiteInfoStartIndex = std::distance(first: AllCallsites.begin(), last: SI);
  for (const auto &Candidate : CandidateProfileData) {
// The declaration is only needed for the assert below; the getValueInfo
// call itself is kept unconditionally.
#ifndef NDEBUG
    auto CalleeValueInfo =
#endif
        ImportSummary->getValueInfo(GUID: Candidate.Value);
    // We might not have a ValueInfo if this is a distributed
    // ThinLTO backend and decided not to import that function.
    assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
    assert(SI != AllCallsites.end());
    auto &StackNode = *(SI++);
    // See if any of the clones of the indirect callsite for this
    // profiled target should call a cloned version of the profiled
    // target. We only need to do the ICP here if so.
    ICPNeeded |= llvm::any_of(Range: StackNode.Clones,
                              P: [](unsigned CloneNo) { return CloneNo != 0; });
    // Every callsite in the same function should have been cloned the same
    // number of times.
    assert(!NumClones || NumClones == StackNode.Clones.size());
    NumClones = StackNode.Clones.size();
  }
  if (!ICPNeeded)
    return NumClones;
  // Save information for ICP, which is performed later to avoid messing up the
  // current function traversal.
  ICallAnalysisInfo.push_back(Elt: {.CB: CB, .CandidateProfileData: CandidateProfileData.vec(), .NumCandidates: NumCandidates,
                              .TotalCount: TotalCount, .CallsiteInfoStartIndex: CallsiteInfoStartIndex});
  return NumClones;
}
6168
// Perform the indirect call promotion recorded by recordICPInfo: for each
// saved candidate, promote the indirect call (in the original function and in
// every clone) to a guarded direct call, then redirect the new direct call to
// the assigned callee clone, and finally fix up the value profile metadata.
void MemProfContextDisambiguation::performICP(
    Module &M, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
    ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
    OptimizationRemarkEmitter &ORE) {
  // Now do any promotion required for cloning. Specifically, for each
  // recorded ICP candidate (which was only recorded because one clone of that
  // candidate should call a cloned target), we perform ICP (speculative
  // devirtualization) for each clone of the callsite, and update its callee
  // to the appropriate clone. Note that the ICP compares against the original
  // version of the target, which is what is in the vtable.
  for (auto &Info : ICallAnalysisInfo) {
    auto *CB = Info.CB;
    auto CallsiteIndex = Info.CallsiteInfoStartIndex;
    auto TotalCount = Info.TotalCount;
    unsigned NumPromoted = 0;
    unsigned NumClones = 0;

    for (auto &Candidate : Info.CandidateProfileData) {
      // StackNode is the synthesized summary record for this profiled target.
      auto &StackNode = AllCallsites[CallsiteIndex++];

      // All calls in the same function must have the same number of clones.
      assert(!NumClones || NumClones == StackNode.Clones.size());
      NumClones = StackNode.Clones.size();

      // See if the target is in the module. If it wasn't imported, it is
      // possible that this profile could have been collected on a different
      // target (or version of the code), and we need to be conservative
      // (similar to what is done in the ICP pass).
      Function *TargetFunction = Symtab->getFunction(FuncMD5Hash: Candidate.Value);
      if (TargetFunction == nullptr ||
          // Any ThinLTO global dead symbol removal should have already
          // occurred, so it should be safe to promote when the target is a
          // declaration.
          // TODO: Remove internal option once more fully tested.
          (MemProfRequireDefinitionForPromotion &&
           TargetFunction->isDeclaration())) {
        ORE.emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
                 << "Memprof cannot promote indirect call: target with md5sum "
                 << ore::NV("target md5sum", Candidate.Value) << " not found";
        });
        // FIXME: See if we can use the new declaration importing support to
        // at least get the declarations imported for this case. Hot indirect
        // targets should have been imported normally, however.
        continue;
      }

      // Check if legal to promote
      const char *Reason = nullptr;
      if (!isLegalToPromote(CB: *CB, Callee: TargetFunction, FailureReason: &Reason)) {
        ORE.emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
                 << "Memprof cannot promote indirect call to "
                 << ore::NV("TargetFunction", TargetFunction)
                 << " with count of " << ore::NV("TotalCount", TotalCount)
                 << ": " << Reason;
        });
        continue;
      }

      assert(!isMemProfClone(*TargetFunction));

      // Handle each call clone, applying ICP so that each clone directly
      // calls the specified callee clone, guarded by the appropriate ICP
      // check.
      CallBase *CBClone = CB;
      for (unsigned J = 0; J < NumClones; J++) {
        // If the VMap is empty, this clone was a duplicate of another and was
        // created as an alias or a declaration.
        if (J > 0 && VMaps[J - 1]->empty())
          continue;
        // Copy 0 is the original function.
        if (J > 0)
          CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
        // We do the promotion using the original name, so that the comparison
        // is against the name in the vtable. Then just below, change the new
        // direct call to call the cloned function.
        auto &DirectCall =
            pgo::promoteIndirectCall(CB&: *CBClone, F: TargetFunction, Count: Candidate.Count,
                                     TotalCount, AttachProfToDirectCall: isSamplePGO, ORE: &ORE);
        auto *TargetToUse = TargetFunction;
        // Call original if this version calls the original version of its
        // callee.
        if (StackNode.Clones[J]) {
          TargetToUse =
              cast<Function>(Val: M.getOrInsertFunction(
                                  Name: getMemProfFuncName(Base: TargetFunction->getName(),
                                                     CloneNo: StackNode.Clones[J]),
                                  T: TargetFunction->getFunctionType())
                                 .getCallee());
        }
        DirectCall.setCalledFunction(TargetToUse);
        // During matching we generate synthetic VP metadata for indirect calls
        // not already having any, from the memprof profile's callee GUIDs. If
        // we subsequently promote and inline those callees, we currently lose
        // the ability to generate this synthetic VP metadata. Optionally apply
        // a noinline attribute to promoted direct calls, where the threshold is
        // set to capture synthetic VP metadata targets which get a count of 1.
        if (MemProfICPNoInlineThreshold &&
            Candidate.Count < MemProfICPNoInlineThreshold)
          DirectCall.setIsNoInline();
        ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " promoted and assigned to call function clone "
                 << ore::NV("Callee", TargetToUse));
      }

      // Update TotalCount (all clones should get same count above)
      TotalCount -= Candidate.Count;
      NumPromoted++;
    }
    // Adjust the MD.prof metadata for all clones, now that we have the new
    // TotalCount and the number promoted.
    CallBase *CBClone = CB;
    for (unsigned J = 0; J < NumClones; J++) {
      // If the VMap is empty, this clone was a duplicate of another and was
      // created as an alias or a declaration.
      if (J > 0 && VMaps[J - 1]->empty())
        continue;
      // Copy 0 is the original function.
      if (J > 0)
        CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
      // First delete the old one.
      CBClone->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
      // If all promoted, we don't need the MD.prof metadata.
      // Otherwise we need update with the un-promoted records back.
      if (TotalCount != 0)
        annotateValueSite(
            M, Inst&: *CBClone, VDs: ArrayRef(Info.CandidateProfileData).slice(N: NumPromoted),
            Sum: TotalCount, ValueKind: IPVK_IndirectCallTarget, MaxMDCount: Info.NumCandidates);
    }
  }
}
6304
6305template <typename DerivedCCG, typename FuncTy, typename CallTy>
6306bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process(
6307 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) {
6308 if (DumpCCG) {
6309 dbgs() << "CCG before cloning:\n";
6310 dbgs() << *this;
6311 }
6312 if (ExportToDot)
6313 exportToDot(Label: "postbuild");
6314
6315 if (VerifyCCG) {
6316 check();
6317 }
6318
6319 identifyClones();
6320
6321 if (VerifyCCG) {
6322 check();
6323 }
6324
6325 if (DumpCCG) {
6326 dbgs() << "CCG after cloning:\n";
6327 dbgs() << *this;
6328 }
6329 if (ExportToDot)
6330 exportToDot(Label: "cloned");
6331
6332 bool Changed = assignFunctions();
6333
6334 if (DumpCCG) {
6335 dbgs() << "CCG after assigning function clones:\n";
6336 dbgs() << *this;
6337 }
6338 if (ExportToDot)
6339 exportToDot(Label: "clonefuncassign");
6340
6341 if (MemProfReportHintedSizes)
6342 printTotalSizes(OS&: errs(), EmitRemark);
6343
6344 return Changed;
6345}
6346
6347bool MemProfContextDisambiguation::processModule(
6348 Module &M,
6349 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6350
6351 // If we have an import summary, then the cloning decisions were made during
6352 // the thin link on the index. Apply them and return.
6353 if (ImportSummary)
6354 return applyImport(M);
6355
6356 // TODO: If/when other types of memprof cloning are enabled beyond just for
6357 // hot and cold, we will need to change this to individually control the
6358 // AllocationType passed to addStackNodesForMIB during CCG construction.
6359 // Note that we specifically check this after applying imports above, so that
6360 // the option isn't needed to be passed to distributed ThinLTO backend
6361 // clang processes, which won't necessarily have visibility into the linker
6362 // dependences. Instead the information is communicated from the LTO link to
6363 // the backends via the combined summary index.
6364 if (!SupportsHotColdNew)
6365 return false;
6366
6367 ModuleCallsiteContextGraph CCG(M, OREGetter);
6368 return CCG.process();
6369}
6370
6371MemProfContextDisambiguation::MemProfContextDisambiguation(
6372 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6373 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6374 // Check the dot graph printing options once here, to make sure we have valid
6375 // and expected combinations.
6376 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6377 llvm::report_fatal_error(
6378 reason: "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6379 if (DotGraphScope == DotScope::Context &&
6380 !ContextIdForDot.getNumOccurrences())
6381 llvm::report_fatal_error(
6382 reason: "-memprof-dot-scope=context requires -memprof-dot-context-id");
6383 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6384 ContextIdForDot.getNumOccurrences())
6385 llvm::report_fatal_error(
6386 reason: "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6387 "-memprof-dot-context-id");
6388 if (ImportSummary) {
6389 // The MemProfImportSummary should only be used for testing ThinLTO
6390 // distributed backend handling via opt, in which case we don't have a
6391 // summary from the pass pipeline.
6392 assert(MemProfImportSummary.empty());
6393 return;
6394 }
6395 if (MemProfImportSummary.empty())
6396 return;
6397
6398 auto ReadSummaryFile =
6399 errorOrToExpected(EO: MemoryBuffer::getFile(Filename: MemProfImportSummary));
6400 if (!ReadSummaryFile) {
6401 logAllUnhandledErrors(E: ReadSummaryFile.takeError(), OS&: errs(),
6402 ErrorBanner: "Error loading file '" + MemProfImportSummary +
6403 "': ");
6404 return;
6405 }
6406 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(Buffer: **ReadSummaryFile);
6407 if (!ImportSummaryForTestingOrErr) {
6408 logAllUnhandledErrors(E: ImportSummaryForTestingOrErr.takeError(), OS&: errs(),
6409 ErrorBanner: "Error parsing file '" + MemProfImportSummary +
6410 "': ");
6411 return;
6412 }
6413 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6414 ImportSummary = ImportSummaryForTesting.get();
6415}
6416
6417PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6418 ModuleAnalysisManager &AM) {
6419 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
6420 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6421 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: *F);
6422 };
6423 if (!processModule(M, OREGetter))
6424 return PreservedAnalyses::all();
6425 return PreservedAnalyses::none();
6426}
6427
6428void MemProfContextDisambiguation::run(
6429 ModuleSummaryIndex &Index,
6430 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6431 isPrevailing,
6432 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) {
6433 // TODO: If/when other types of memprof cloning are enabled beyond just for
6434 // hot and cold, we will need to change this to individually control the
6435 // AllocationType passed to addStackNodesForMIB during CCG construction.
6436 // The index was set from the option, so these should be in sync.
6437 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6438 if (!SupportsHotColdNew)
6439 return;
6440
6441 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6442 CCG.process(EmitRemark);
6443}
6444
6445// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6446// when we don't have an index that has recorded that we are linking with
6447// allocation libraries containing the necessary APIs for downstream
6448// transformations.
6449PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) {
6450 // The profile matcher applies hotness attributes directly for allocations,
6451 // and those will cause us to generate calls to the hot/cold interfaces
6452 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6453 // link then assume we don't want these calls (e.g. not linking with
6454 // the appropriate library, or otherwise trying to disable this behavior).
6455 bool Changed = false;
6456 for (auto &F : M) {
6457 for (auto &BB : F) {
6458 for (auto &I : BB) {
6459 auto *CI = dyn_cast<CallBase>(Val: &I);
6460 if (!CI)
6461 continue;
6462 if (CI->hasFnAttr(Kind: "memprof")) {
6463 CI->removeFnAttr(Kind: "memprof");
6464 Changed = true;
6465 }
6466 if (!CI->hasMetadata(KindID: LLVMContext::MD_callsite)) {
6467 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6468 continue;
6469 }
6470 // Strip off all memprof metadata as it is no longer needed.
6471 // Importantly, this avoids the addition of new memprof attributes
6472 // after inlining propagation.
6473 CI->setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
6474 CI->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
6475 Changed = true;
6476 }
6477 }
6478 }
6479 if (!Changed)
6480 return PreservedAnalyses::all();
6481 return PreservedAnalyses::none();
6482}
6483