1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
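//
// For illustration only (the names below are hypothetical and not taken from
// this pass): if the same allocation is reached from both a cold and a
// not-cold calling context,
//
//   char *makeBuffer() { return new char[4096]; }
//   void coldPath() { LongLivedBuf = makeBuffer(); } // profiled cold
//   void hotPath() { ScratchBuf = makeBuffer(); }    // profiled not cold
//
// then cloning makeBuffer (and, if necessary, its callers) gives each calling
// context its own copy of the allocation call, so that each copy can be
// annotated with a single allocation behavior and hinted accordingly.
//
//===----------------------------------------------------------------------===//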
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringExtras.h"
33#include "llvm/Analysis/MemoryProfileInfo.h"
34#include "llvm/Analysis/ModuleSummaryAnalysis.h"
35#include "llvm/Analysis/OptimizationRemarkEmitter.h"
36#include "llvm/Bitcode/BitcodeReader.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/Module.h"
39#include "llvm/IR/ModuleSummaryIndex.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/GraphWriter.h"
43#include "llvm/Support/InterleavedRange.h"
44#include "llvm/Support/SHA1.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Transforms/IPO.h"
47#include "llvm/Transforms/Utils/CallPromotionUtils.h"
48#include "llvm/Transforms/Utils/Cloning.h"
49#include "llvm/Transforms/Utils/Instrumentation.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
STATISTIC(NonNewMergedNodes,
          "Number of existing (non-new) nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
STATISTIC(AliaseesPrevailingInDiffModuleFromAlias,
          "Number of aliasees prevailing in a different module than their "
          "alias");
116
static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

// TODO: Remove this option once the new handling is validated more widely.
static cl::opt<bool> DoMergeIteration(
    "memprof-merge-iteration", cl::init(true), cl::Hidden,
    cl::desc("Iteratively apply merging on a node to catch new callers"));

// How much of the graph to export to dot.
enum DotScope {
  All,     // The full CCG graph.
  Alloc,   // Only contexts for the specified allocation.
  Context, // Only the specified context.
};

static cl::opt<DotScope> DotGraphScope(
    "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
    cl::Hidden, cl::init(DotScope::All),
    cl::values(
        clEnumValN(DotScope::All, "all", "Export full callsite graph"),
        clEnumValN(DotScope::Alloc, "alloc",
                   "Export only nodes with contexts feeding given "
                   "-memprof-dot-alloc-id"),
        clEnumValN(DotScope::Context, "context",
                   "Export only nodes with given -memprof-dot-context-id")));

static cl::opt<unsigned>
    AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
                  cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
                           "or to highlight if -memprof-dot-scope=all"));

static cl::opt<unsigned> ContextIdForDot(
    "memprof-dot-context-id", cl::init(0), cl::Hidden,
    cl::desc("Id of context to export if -memprof-dot-scope=context or to "
             "highlight otherwise"));

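// Illustrative example (flag combination assumed, not prescribed by the code):
// to export just the context subgraph feeding allocation id 3, one could pass
//   -memprof-export-to-dot -memprof-dot-scope=alloc -memprof-dot-alloc-id=3
//   -memprof-dot-file-path-prefix=/tmp/ccg.
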
static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

// Optionally enable cloning of callsites involved in recursive cycles.
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

static cl::opt<bool> CloneRecursiveContexts(
    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));

// Generally this is needed for correct assignment of allocation clones to
// function clones. However, allow it to be disabled for debugging while the
// functionality is new and being tested more widely.
static cl::opt<bool>
    MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
                cl::desc("Merge clones before assigning functions"));

// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling requires a little bit of compile
// time overhead and doesn't affect correctness; it will just inflate the cold
// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts having recursive cycles"));

// Set the minimum absolute count threshold for allowing inlining of indirect
// calls promoted during cloning.
static cl::opt<unsigned> MemProfICPNoInlineThreshold(
    "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
    cl::desc("Minimum absolute count for promoted target to be inlinable"));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

static cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;

cl::opt<unsigned> MemProfTopNImportant(
    "memprof-top-n-important", cl::init(10), cl::Hidden,
    cl::desc("Number of largest cold contexts to consider important"));

cl::opt<bool> MemProfFixupImportant(
    "memprof-fixup-important", cl::init(true), cl::Hidden,
    cl::desc("Enables edge fixup for important contexts"));

extern cl::opt<unsigned> MaxSummaryIndirectEdges;

} // namespace llvm
243
244namespace {
245
246/// CRTP base for graphs built from either IR or ThinLTO summary index.
247///
248/// The graph represents the call contexts in all memprof metadata on allocation
249/// calls, with nodes for the allocations themselves, as well as for the calls
250/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. Calls with callsite metadata are then matched
/// onto these nodes, and the graph is updated to reflect any inlining
/// performed on those calls.
254///
255/// Each MIB (representing an allocation's call context with allocation
256/// behavior) is assigned a unique context id during the graph build. The edges
257/// and nodes in the graph are decorated with the context ids they carry. This
258/// is used to correctly update the graph when cloning is performed so that we
259/// can uniquify the context for a single (possibly cloned) allocation.
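///
/// For illustration only (the stack and context ids below are hypothetical):
/// given two profiled contexts for the same allocation,
///
///   main -> A -> B -> new   (context id 1, cold)
///   main -> C -> B -> new   (context id 2, not cold)
///
/// the graph gets an allocation node for the "new" call plus stack nodes for
/// B, A, C and main. The edge between B and the allocation node carries
/// context ids {1, 2}, the edges between A and B and between main and A carry
/// {1}, and the edges between C and B and between main and C carry {2}.
/// Cloning B (and its callers as needed) separates context id 1 from context
/// id 2 so that the allocation's behavior in each resulting clone is
/// unambiguous.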
260template <typename DerivedCCG, typename FuncTy, typename CallTy>
261class CallsiteContextGraph {
262public:
263 CallsiteContextGraph() = default;
264 CallsiteContextGraph(const CallsiteContextGraph &) = default;
265 CallsiteContextGraph(CallsiteContextGraph &&) = default;
266
267 /// Main entry point to perform analysis and transformations on graph.
268 bool process();
269
270 /// Perform cloning on the graph necessary to uniquely identify the allocation
271 /// behavior of an allocation based on its context.
272 void identifyClones();
273
274 /// Assign callsite clones to functions, cloning functions as needed to
275 /// accommodate the combinations of their callsite clones reached by callers.
276 /// For regular LTO this clones functions and callsites in the IR, but for
277 /// ThinLTO the cloning decisions are noted in the summaries and later applied
278 /// in applyImport.
279 bool assignFunctions();
280
281 void dump() const;
282 void print(raw_ostream &OS) const;
283 void printTotalSizes(raw_ostream &OS) const;
284
285 friend raw_ostream &operator<<(raw_ostream &OS,
286 const CallsiteContextGraph &CCG) {
287 CCG.print(OS);
288 return OS;
289 }
290
291 friend struct GraphTraits<
292 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
293 friend struct DOTGraphTraits<
294 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
295
296 void exportToDot(std::string Label) const;
297
298 /// Represents a function clone via FuncTy pointer and clone number pair.
299 struct FuncInfo final
300 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
301 using Base = std::pair<FuncTy *, unsigned>;
302 FuncInfo(const Base &B) : Base(B) {}
303 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
304 explicit operator bool() const { return this->first != nullptr; }
305 FuncTy *func() const { return this->first; }
306 unsigned cloneNo() const { return this->second; }
307 };
308
309 /// Represents a callsite clone via CallTy and clone number pair.
310 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
311 using Base = std::pair<CallTy, unsigned>;
312 CallInfo(const Base &B) : Base(B) {}
313 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
314 : Base(Call, CloneNo) {}
315 explicit operator bool() const { return (bool)this->first; }
316 CallTy call() const { return this->first; }
317 unsigned cloneNo() const { return this->second; }
318 void setCloneNo(unsigned N) { this->second = N; }
319 void print(raw_ostream &OS) const {
320 if (!operator bool()) {
321 assert(!cloneNo());
322 OS << "null Call";
323 return;
324 }
325 call()->print(OS);
326 OS << "\t(clone " << cloneNo() << ")";
327 }
328 void dump() const {
      print(dbgs());
330 dbgs() << "\n";
331 }
332 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
333 Call.print(OS);
334 return OS;
335 }
336 };
337
338 struct ContextEdge;
339
340 /// Node in the Callsite Context Graph
341 struct ContextNode {
342 // Assigned to nodes as they are created, useful for debugging.
343 unsigned NodeId = 0;
344
    // Keep this for now since, in the IR case where we have an Instruction*,
    // it is not as immediately discoverable. Used for printing richer
    // information when dumping the graph.
348 bool IsAllocation;
349
350 // Keeps track of when the Call was reset to null because there was
351 // recursion.
352 bool Recursive = false;
353
354 // This will be formed by ORing together the AllocationType enum values
355 // for contexts including this node.
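    // For example, assuming the usual AllocationType encoding (NotCold == 1,
    // Cold == 2), a node reached by both cold and not-cold contexts ends up
    // with AllocTypes == 3.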
356 uint8_t AllocTypes = 0;
357
358 // The corresponding allocation or interior call. This is the primary call
359 // for which we have created this node.
360 CallInfo Call;
361
362 // List of other calls that can be treated the same as the primary call
363 // through cloning. I.e. located in the same function and have the same
364 // (possibly pruned) stack ids. They will be updated the same way as the
365 // primary call when assigning to function clones.
366 SmallVector<CallInfo, 0> MatchingCalls;
367
368 // For alloc nodes this is a unique id assigned when constructed, and for
369 // callsite stack nodes it is the original stack id when the node is
370 // constructed from the memprof MIB metadata on the alloc nodes. Note that
371 // this is only used when matching callsite metadata onto the stack nodes
372 // created when processing the allocation memprof MIBs, and for labeling
373 // nodes in the dot graph. Therefore we don't bother to assign a value for
374 // clones.
375 uint64_t OrigStackOrAllocId = 0;
376
377 // Edges to all callees in the profiled call stacks.
378 // TODO: Should this be a map (from Callee node) for more efficient lookup?
379 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
380
381 // Edges to all callers in the profiled call stacks.
382 // TODO: Should this be a map (from Caller node) for more efficient lookup?
383 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
384
    // Returns true if we also need to look at the caller edges (in addition to
    // the callee edges) when determining the node context ids and allocation
    // type.
387 bool useCallerEdgesForContextInfo() const {
388 // Typically if the callee edges are empty either the caller edges are
389 // also empty, or this is an allocation (leaf node). However, if we are
390 // allowing recursive callsites and contexts this will be violated for
391 // incompletely cloned recursive cycles.
392 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
393 (AllowRecursiveCallsites && AllowRecursiveContexts));
394 // When cloning for a recursive context, during cloning we might be in the
395 // midst of cloning for a recurrence and have moved context ids off of a
396 // caller edge onto the clone but not yet off of the incoming caller
397 // (back) edge. If we don't look at those we miss the fact that this node
398 // still has context ids of interest.
399 return IsAllocation || CloneRecursiveContexts;
400 }
401
402 // Compute the context ids for this node from the union of its edge context
403 // ids.
404 DenseSet<uint32_t> getContextIds() const {
405 unsigned Count = 0;
406 // Compute the number of ids for reserve below. In general we only need to
407 // look at one set of edges, typically the callee edges, since other than
408 // allocations and in some cases during recursion cloning, all the context
409 // ids on the callers should also flow out via callee edges.
410 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
411 Count += Edge->getContextIds().size();
412 DenseSet<uint32_t> ContextIds;
      ContextIds.reserve(Count);
414 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
415 CalleeEdges, useCallerEdgesForContextInfo()
416 ? CallerEdges
417 : std::vector<std::shared_ptr<ContextEdge>>());
418 for (const auto &Edge : Edges)
419 ContextIds.insert_range(Edge->getContextIds());
420 return ContextIds;
421 }
422
423 // Compute the allocation type for this node from the OR of its edge
424 // allocation types.
425 uint8_t computeAllocType() const {
426 uint8_t BothTypes =
427 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
428 uint8_t AllocType = (uint8_t)AllocationType::None;
429 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
430 CalleeEdges, useCallerEdgesForContextInfo()
431 ? CallerEdges
432 : std::vector<std::shared_ptr<ContextEdge>>());
433 for (const auto &Edge : Edges) {
434 AllocType |= Edge->AllocTypes;
435 // Bail early if alloc type reached both, no further refinement.
436 if (AllocType == BothTypes)
437 return AllocType;
438 }
439 return AllocType;
440 }
441
442 // The context ids set for this node is empty if its edge context ids are
443 // also all empty.
444 bool emptyContextIds() const {
445 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
446 CalleeEdges, useCallerEdgesForContextInfo()
447 ? CallerEdges
448 : std::vector<std::shared_ptr<ContextEdge>>());
449 for (const auto &Edge : Edges) {
450 if (!Edge->getContextIds().empty())
451 return false;
452 }
453 return true;
454 }
455
456 // List of clones of this ContextNode, initially empty.
457 std::vector<ContextNode *> Clones;
458
459 // If a clone, points to the original uncloned node.
460 ContextNode *CloneOf = nullptr;
461
462 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
463
464 ContextNode(bool IsAllocation, CallInfo C)
465 : IsAllocation(IsAllocation), Call(C) {}
466
467 void addClone(ContextNode *Clone) {
468 if (CloneOf) {
469 CloneOf->Clones.push_back(Clone);
470 Clone->CloneOf = CloneOf;
471 } else {
472 Clones.push_back(Clone);
473 assert(!Clone->CloneOf);
474 Clone->CloneOf = this;
475 }
476 }
477
478 ContextNode *getOrigNode() {
479 if (!CloneOf)
480 return this;
481 return CloneOf;
482 }
483
484 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
485 unsigned int ContextId);
486
487 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
488 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
489 void eraseCalleeEdge(const ContextEdge *Edge);
490 void eraseCallerEdge(const ContextEdge *Edge);
491
492 void setCall(CallInfo C) { Call = std::move(C); }
493
494 bool hasCall() const { return (bool)Call.call(); }
495
496 void printCall(raw_ostream &OS) const { Call.print(OS); }
497
498 // True if this node was effectively removed from the graph, in which case
499 // it should have an allocation type of None and empty context ids.
500 bool isRemoved() const {
501 // Typically if the callee edges are empty either the caller edges are
502 // also empty, or this is an allocation (leaf node). However, if we are
503 // allowing recursive callsites and contexts this will be violated for
504 // incompletely cloned recursive cycles.
505 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
506 (AllocTypes == (uint8_t)AllocationType::None) ==
507 emptyContextIds());
508 return AllocTypes == (uint8_t)AllocationType::None;
509 }
510
511 void dump() const;
512 void print(raw_ostream &OS) const;
513
514 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
515 Node.print(OS);
516 return OS;
517 }
518 };
519
520 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
521 /// callee.
522 struct ContextEdge {
523 ContextNode *Callee;
524 ContextNode *Caller;
525
526 // This will be formed by ORing together the AllocationType enum values
527 // for contexts including this edge.
528 uint8_t AllocTypes = 0;
529
530 // Set just before initiating cloning when cloning of recursive contexts is
531 // enabled. Used to defer cloning of backedges until we have done cloning of
532 // the callee node for non-backedge caller edges. This exposes cloning
533 // opportunities through the backedge of the cycle.
534 // TODO: Note that this is not updated during cloning, and it is unclear
535 // whether that would be needed.
536 bool IsBackedge = false;
537
538 // The set of IDs for contexts including this edge.
539 DenseSet<uint32_t> ContextIds;
540
541 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
542 DenseSet<uint32_t> ContextIds)
543 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
544 ContextIds(std::move(ContextIds)) {}
545
546 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
547
548 // Helper to clear the fields of this edge when we are removing it from the
549 // graph.
550 inline void clear() {
551 ContextIds.clear();
552 AllocTypes = (uint8_t)AllocationType::None;
553 Caller = nullptr;
554 Callee = nullptr;
555 }
556
557 // Check if edge was removed from the graph. This is useful while iterating
558 // over a copy of edge lists when performing operations that mutate the
559 // graph in ways that might remove one of the edges.
560 inline bool isRemoved() const {
561 if (Callee || Caller)
562 return false;
563 // Any edges that have been removed from the graph but are still in a
564 // shared_ptr somewhere should have all fields null'ed out by clear()
565 // above.
566 assert(AllocTypes == (uint8_t)AllocationType::None);
567 assert(ContextIds.empty());
568 return true;
569 }
570
571 void dump() const;
572 void print(raw_ostream &OS) const;
573
574 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
575 Edge.print(OS);
576 return OS;
577 }
578 };
579
580 /// Helpers to remove edges that have allocation type None (due to not
581 /// carrying any context ids) after transformations.
582 void removeNoneTypeCalleeEdges(ContextNode *Node);
583 void removeNoneTypeCallerEdges(ContextNode *Node);
584 void
585 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
586 DenseSet<const ContextNode *> &Visited);
587
588protected:
  /// Get the list of stack ids in the given callsite context that have
  /// corresponding nodes in the graph.
591 template <class NodeT, class IteratorT>
592 std::vector<uint64_t>
593 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
594
595 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
596 /// metadata (or summary).
597 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
598
599 /// Adds nodes for the given MIB stack ids.
600 template <class NodeT, class IteratorT>
601 void addStackNodesForMIB(
602 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
603 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
604 ArrayRef<ContextTotalSize> ContextSizeInfo,
605 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
606
607 /// Matches all callsite metadata (or summary) to the nodes created for
608 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
609 /// inlining performed on those callsite instructions.
610 void updateStackNodes();
611
612 /// Optionally fixup edges for the N largest cold contexts to better enable
613 /// cloning. This is particularly helpful if the context includes recursion
614 /// as well as inlining, resulting in a single stack node for multiple stack
615 /// ids in the context. With recursion it is particularly difficult to get the
616 /// edge updates correct as in the general case we have lost the original
617 /// stack id ordering for the context. Do more expensive fixup for the largest
618 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
619 void fixupImportantContexts();
620
621 /// Update graph to conservatively handle any callsite stack nodes that target
622 /// multiple different callee target functions.
623 void handleCallsitesWithMultipleTargets();
624
625 /// Mark backedges via the standard DFS based backedge algorithm.
626 void markBackedges();
627
628 /// Merge clones generated during cloning for different allocations but that
629 /// are called by the same caller node, to ensure proper function assignment.
630 void mergeClones();
631
632 // Try to partition calls on the given node (already placed into the AllCalls
633 // array) by callee function, creating new copies of Node as needed to hold
634 // calls with different callees, and moving the callee edges appropriately.
635 // Returns true if partitioning was successful.
636 bool partitionCallsByCallee(
637 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
638 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
639
640 /// Save lists of calls with MemProf metadata in each function, for faster
641 /// iteration.
642 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
643
644 /// Map from callsite node to the enclosing caller function.
645 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
646
647 // When exporting to dot, and an allocation id is specified, contains the
648 // context ids on that allocation.
649 DenseSet<uint32_t> DotAllocContextIds;
650
651private:
652 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
653
654 // Structure to keep track of information for each call as we are matching
655 // non-allocation callsites onto context nodes created from the allocation
656 // call metadata / summary contexts.
657 struct CallContextInfo {
658 // The callsite we're trying to match.
659 CallTy Call;
660 // The callsites stack ids that have a context node in the graph.
661 std::vector<uint64_t> StackIds;
662 // The function containing this callsite.
663 const FuncTy *Func;
664 // Initially empty, if needed this will be updated to contain the context
665 // ids for use in a new context node created for this callsite.
666 DenseSet<uint32_t> ContextIds;
667 };
668
669 /// Helper to remove edge from graph, updating edge iterator if it is provided
670 /// (in which case CalleeIter indicates which edge list is being iterated).
671 /// This will also perform the necessary clearing of the ContextEdge members
672 /// to enable later checking if the edge has been removed (since we may have
673 /// other copies of the shared_ptr in existence, and in fact rely on this to
674 /// enable removal while iterating over a copy of a node's edge list).
675 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
676 bool CalleeIter = true);
677
678 /// Assigns the given Node to calls at or inlined into the location with
679 /// the Node's stack id, after post order traversing and processing its
680 /// caller nodes. Uses the call information recorded in the given
681 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
682 /// as needed. Called by updateStackNodes which sets up the given
683 /// StackIdToMatchingCalls map.
684 void assignStackNodesPostOrder(
685 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
686 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
687 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
688 const DenseSet<uint32_t> &ImportantContextIds);
689
690 /// Duplicates the given set of context ids, updating the provided
691 /// map from each original id with the newly generated context ids,
692 /// and returning the new duplicated id set.
693 DenseSet<uint32_t> duplicateContextIds(
694 const DenseSet<uint32_t> &StackSequenceContextIds,
695 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
696
697 /// Propagates all duplicated context ids across the graph.
698 void propagateDuplicateContextIds(
699 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
700
701 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
702 /// else to its callers. Also updates OrigNode's edges to remove any context
703 /// ids moved to the newly created edge.
704 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
705 bool TowardsCallee,
706 DenseSet<uint32_t> RemainingContextIds);
707
708 /// Get the stack id corresponding to the given Id or Index (for IR this will
709 /// return itself, for a summary index this will return the id recorded in the
710 /// index for that stack id index value).
711 uint64_t getStackId(uint64_t IdOrIndex) const {
712 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
713 }
714
715 /// Returns true if the given call targets the callee of the given edge, or if
716 /// we were able to identify the call chain through intermediate tail calls.
717 /// In the latter case new context nodes are added to the graph for the
718 /// identified tail calls, and their synthesized nodes are added to
719 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
720 /// the updated edges and to prepare it for an increment in the caller.
721 bool
722 calleesMatch(CallTy Call, EdgeIter &EI,
723 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
724
725 // Return the callee function of the given call, or nullptr if it can't be
726 // determined
727 const FuncTy *getCalleeFunc(CallTy Call) {
728 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
729 }
730
731 /// Returns true if the given call targets the given function, or if we were
732 /// able to identify the call chain through intermediate tail calls (in which
733 /// case FoundCalleeChain will be populated).
734 bool calleeMatchesFunc(
735 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
736 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
737 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
738 Call, Func, CallerFunc, FoundCalleeChain);
739 }
740
741 /// Returns true if both call instructions have the same callee.
742 bool sameCallee(CallTy Call1, CallTy Call2) {
743 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
744 }
745
  /// Get the list of stack ids in the given callsite's context that have
  /// corresponding nodes in the graph.
748 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
749 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
750 Call);
751 }
752
753 /// Get the last stack id in the context for callsite.
754 uint64_t getLastStackId(CallTy Call) {
755 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
756 }
757
758 /// Update the allocation call to record type of allocated memory.
759 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
760 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
761 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
762 }
763
764 /// Get the AllocationType assigned to the given allocation instruction clone.
765 AllocationType getAllocationCallType(const CallInfo &Call) const {
766 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
767 }
768
769 /// Update non-allocation call to invoke (possibly cloned) function
770 /// CalleeFunc.
771 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
772 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
773 }
774
  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
777 /// Assigns new clones to clone number CloneNo.
778 FuncInfo cloneFunctionForCallsite(
779 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
780 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
781 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
782 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
783 }
784
785 /// Gets a label to use in the dot graph for the given call clone in the given
786 /// function.
787 std::string getLabel(const FuncTy *Func, const CallTy Call,
788 unsigned CloneNo) const {
789 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
790 }
791
792 // Create and return a new ContextNode.
793 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
794 CallInfo C = CallInfo()) {
795 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
796 auto *NewNode = NodeOwner.back().get();
797 if (F)
798 NodeToCallingFunc[NewNode] = F;
799 NewNode->NodeId = NodeOwner.size();
800 return NewNode;
801 }
802
803 /// Helpers to find the node corresponding to the given call or stackid.
804 ContextNode *getNodeForInst(const CallInfo &C);
805 ContextNode *getNodeForAlloc(const CallInfo &C);
806 ContextNode *getNodeForStackId(uint64_t StackId);
807
808 /// Computes the alloc type corresponding to the given context ids, by
809 /// unioning their recorded alloc types.
810 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
811
812 /// Returns the allocation type of the intersection of the contexts of two
813 /// nodes (based on their provided context id sets), optimized for the case
814 /// when Node1Ids is smaller than Node2Ids.
815 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
816 const DenseSet<uint32_t> &Node2Ids) const;
817
818 /// Returns the allocation type of the intersection of the contexts of two
819 /// nodes (based on their provided context id sets).
820 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
821 const DenseSet<uint32_t> &Node2Ids) const;
822
823 /// Create a clone of Edge's callee and move Edge to that new callee node,
824 /// performing the necessary context id and allocation type updates.
825 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
826 /// moved to an edge to the new callee.
827 ContextNode *
828 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
829 DenseSet<uint32_t> ContextIdsToMove = {});
830
831 /// Change the callee of Edge to existing callee clone NewCallee, performing
832 /// the necessary context id and allocation type updates.
833 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
834 /// moved to an edge to the new callee.
835 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
836 ContextNode *NewCallee,
837 bool NewClone = false,
838 DenseSet<uint32_t> ContextIdsToMove = {});
839
840 /// Change the caller of the edge at the given callee edge iterator to be
841 /// NewCaller, performing the necessary context id and allocation type
842 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
843 /// a simplified version of it as we always move the given edge and all of its
844 /// context ids.
845 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
846 ContextNode *NewCaller);
847
848 /// Recursive helper for marking backedges via DFS.
849 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
850 DenseSet<const ContextNode *> &CurrentStack);
851
852 /// Recursive helper for merging clones.
853 void
854 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
855 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
856 /// Main worker for merging callee clones for a given node.
857 void mergeNodeCalleeClones(
858 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
859 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
860 /// Helper to find other callers of the given set of callee edges that can
861 /// share the same callee merge node.
862 void findOtherCallersToShareMerge(
863 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
864 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
865 DenseSet<ContextNode *> &OtherCallersToShareMerge);
866
867 /// Recursively perform cloning on the graph for the given Node and its
868 /// callers, in order to uniquely identify the allocation behavior of an
869 /// allocation given its context. The context ids of the allocation being
870 /// processed are given in AllocContextIds.
871 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
872 const DenseSet<uint32_t> &AllocContextIds);
873
874 /// Map from each context ID to the AllocationType assigned to that context.
875 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
876
877 /// Map from each contextID to the profiled full contexts and their total
878 /// sizes (there may be more than one due to context trimming),
879 /// optionally populated when requested (via MemProfReportHintedSizes or
880 /// MinClonedColdBytePercent).
881 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
882
883 /// Identifies the context node created for a stack id when adding the MIB
884 /// contexts to the graph. This is used to locate the context nodes when
885 /// trying to assign the corresponding callsites with those stack ids to these
886 /// nodes.
887 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
888
889 /// Saves information for the contexts identified as important (the largest
890 /// cold contexts up to MemProfTopNImportant).
891 struct ImportantContextInfo {
892 // The original list of leaf first stack ids corresponding to this context.
893 std::vector<uint64_t> StackIds;
894 // Max length of stack ids corresponding to a single stack ContextNode for
895 // this context (i.e. the max length of a key in StackIdsToNode below).
896 unsigned MaxLength = 0;
897 // Mapping of slices of the stack ids to the corresponding ContextNode
898 // (there can be multiple stack ids due to inlining). Populated when
899 // updating stack nodes while matching them to the IR or summary.
900 std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
901 };
902
903 // Map of important full context ids to information about each.
904 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
905
906 // For each important context id found in Node (if any), records the list of
907 // stack ids that corresponded to the given callsite Node. There can be more
908 // than one in the case of inlining.
909 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
910 // We pass in the Node's context ids to avoid the
911 // overhead of computing them as the caller already has
912 // them in some cases.
913 const DenseSet<uint32_t> &NodeContextIds,
914 const DenseSet<uint32_t> &ImportantContextIds) {
915 if (!MemProfTopNImportant) {
916 assert(ImportantContextIds.empty());
917 return;
918 }
    DenseSet<uint32_t> Ids =
        set_intersection(NodeContextIds, ImportantContextIds);
921 if (Ids.empty())
922 return;
923 auto Size = StackIds.size();
924 for (auto Id : Ids) {
925 auto &Entry = ImportantContextIdInfo[Id];
926 Entry.StackIdsToNode[StackIds] = Node;
927 // Keep track of the max to simplify later analysis.
928 if (Size > Entry.MaxLength)
929 Entry.MaxLength = Size;
930 }
931 }
932
933 /// Maps to track the calls to their corresponding nodes in the graph.
934 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
935 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
936
937 /// Owner of all ContextNode unique_ptrs.
938 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
939
940 /// Perform sanity checks on graph when requested.
941 void check() const;
942
943 /// Keeps track of the last unique context id assigned.
944 unsigned int LastContextId = 0;
945};
946
947template <typename DerivedCCG, typename FuncTy, typename CallTy>
948using ContextNode =
949 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
950template <typename DerivedCCG, typename FuncTy, typename CallTy>
951using ContextEdge =
952 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
953template <typename DerivedCCG, typename FuncTy, typename CallTy>
954using FuncInfo =
955 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
956template <typename DerivedCCG, typename FuncTy, typename CallTy>
957using CallInfo =
958 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
959
960/// CRTP derived class for graphs built from IR (regular LTO).
961class ModuleCallsiteContextGraph
962 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
963 Instruction *> {
964public:
965 ModuleCallsiteContextGraph(
966 Module &M,
967 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
968
969private:
970 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
971 Instruction *>;
972
973 uint64_t getStackId(uint64_t IdOrIndex) const;
974 const Function *getCalleeFunc(Instruction *Call);
975 bool calleeMatchesFunc(
976 Instruction *Call, const Function *Func, const Function *CallerFunc,
977 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
978 bool sameCallee(Instruction *Call1, Instruction *Call2);
979 bool findProfiledCalleeThroughTailCalls(
980 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
981 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
982 bool &FoundMultipleCalleeChains);
983 uint64_t getLastStackId(Instruction *Call);
984 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
985 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
986 AllocationType getAllocationCallType(const CallInfo &Call) const;
987 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
988 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
989 Instruction *>::FuncInfo
990 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
991 DenseMap<CallInfo, CallInfo> &CallMap,
992 std::vector<CallInfo> &CallsWithMetadataInFunc,
993 unsigned CloneNo);
994 std::string getLabel(const Function *Func, const Instruction *Call,
995 unsigned CloneNo) const;
996
997 const Module &Mod;
998 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
999};
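
// Illustrative driver sketch only (assumed usage, simplified relative to the
// actual pass entry point): build the IR-based graph for a module, then run
// the analysis and transformations on it.
//
//   bool runOnModule(
//       Module &M,
//       llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
//     ModuleCallsiteContextGraph CCG(M, OREGetter);
//     return CCG.process();
//   }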
1000
1001/// Represents a call in the summary index graph, which can either be an
1002/// allocation or an interior callsite node in an allocation's context.
1003/// Holds a pointer to the corresponding data structure in the index.
1004struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1005 IndexCall() : PointerUnion() {}
1006 IndexCall(std::nullptr_t) : IndexCall() {}
1007 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1008 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1009 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1010
1011 IndexCall *operator->() { return this; }
1012
1013 void print(raw_ostream &OS) const {
1014 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
1019 assert(CI);
1020 OS << *CI;
1021 }
1022 }
1023};
1024} // namespace
1025
1026namespace llvm {
1027template <> struct simplify_type<IndexCall> {
1028 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
1029 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
1030};
1031template <> struct simplify_type<const IndexCall> {
1032 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
1033 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
1034};
1035} // namespace llvm
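
// Illustrative only (assumed usage): the simplify_type specializations above
// allow the usual cast machinery to look through an IndexCall wrapper, e.g.:
//
//   void visit(IndexCall &Call) {
//     if (auto *Alloc = llvm::dyn_cast_if_present<AllocInfo *>(Call)) {
//       // ... handle an allocation summary record ...
//     } else if (auto *CS = llvm::dyn_cast_if_present<CallsiteInfo *>(Call)) {
//       // ... handle an interior callsite summary record ...
//     }
//   }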
1036
1037namespace {
1038/// CRTP derived class for graphs built from summary index (ThinLTO).
1039class IndexCallsiteContextGraph
1040 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1041 IndexCall> {
1042public:
1043 IndexCallsiteContextGraph(
1044 ModuleSummaryIndex &Index,
1045 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1046 isPrevailing);
1047
1048 ~IndexCallsiteContextGraph() {
1049 // Now that we are done with the graph it is safe to add the new
1050 // CallsiteInfo structs to the function summary vectors. The graph nodes
1051 // point into locations within these vectors, so we don't want to add them
1052 // any earlier.
1053 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1054 auto *FS = I.first;
1055 for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
1057 }
1058 }
1059
1060private:
1061 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1062 IndexCall>;
1063
1064 uint64_t getStackId(uint64_t IdOrIndex) const;
1065 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1066 bool calleeMatchesFunc(
1067 IndexCall &Call, const FunctionSummary *Func,
1068 const FunctionSummary *CallerFunc,
1069 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1070 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1071 bool findProfiledCalleeThroughTailCalls(
1072 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1073 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1074 bool &FoundMultipleCalleeChains);
1075 uint64_t getLastStackId(IndexCall &Call);
1076 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1077 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1078 AllocationType getAllocationCallType(const CallInfo &Call) const;
1079 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1080 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1081 IndexCall>::FuncInfo
1082 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1083 DenseMap<CallInfo, CallInfo> &CallMap,
1084 std::vector<CallInfo> &CallsWithMetadataInFunc,
1085 unsigned CloneNo);
1086 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1087 unsigned CloneNo) const;
1088 DenseSet<GlobalValue::GUID> findAliaseeGUIDsPrevailingInDifferentModule();
1089
  // Saves the mapping from each function summary containing memprof records
  // back to its VI, for use in checking and debugging.
1092 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1093
1094 const ModuleSummaryIndex &Index;
1095 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1096 isPrevailing;
1097
1098 // Saves/owns the callsite info structures synthesized for missing tail call
1099 // frames that we discover while building the graph.
1100 // It maps from the summary of the function making the tail call, to a map
1101 // of callee ValueInfo to corresponding synthesized callsite info.
1102 std::unordered_map<FunctionSummary *,
1103 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1104 FunctionCalleesToSynthesizedCallsiteInfos;
1105};
1106} // namespace
1107
1108template <>
1109struct llvm::DenseMapInfo<CallsiteContextGraph<
1110 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1111 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1112template <>
1113struct llvm::DenseMapInfo<CallsiteContextGraph<
1114 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1115 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1116template <>
1117struct llvm::DenseMapInfo<IndexCall>
1118 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1119
1120namespace {
1121
1122// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1123// type we should actually use on the corresponding allocation.
1124// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1125// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1126// from NotCold.
1127AllocationType allocTypeToUse(uint8_t AllocTypes) {
1128 assert(AllocTypes != (uint8_t)AllocationType::None);
1129 if (AllocTypes ==
1130 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1131 return AllocationType::NotCold;
1132 else
1133 return (AllocationType)AllocTypes;
1134}
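
// For example, allocTypeToUse((uint8_t)AllocationType::Cold) returns Cold,
// while allocTypeToUse((uint8_t)AllocationType::NotCold |
//                      (uint8_t)AllocationType::Cold) returns NotCold, since
// an unclonable NotCold+Cold allocation falls back to the NotCold hint.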
1135
1136// Helper to check if the alloc types for all edges recorded in the
1137// InAllocTypes vector match the alloc types for all edges in the Edges
1138// vector.
1139template <typename DerivedCCG, typename FuncTy, typename CallTy>
1140bool allocTypesMatch(
1141 const std::vector<uint8_t> &InAllocTypes,
1142 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1143 &Edges) {
1144 // This should be called only when the InAllocTypes vector was computed for
1145 // this set of Edges. Make sure the sizes are the same.
1146 assert(InAllocTypes.size() == Edges.size());
1147 return std::equal(
1148 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1149 [](const uint8_t &l,
1150 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1151 // Can share if one of the edges is None type - don't
1152 // care about the type along that edge as it doesn't
1153 // exist for those context ids.
1154 if (l == (uint8_t)AllocationType::None ||
1155 r->AllocTypes == (uint8_t)AllocationType::None)
1156 return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1158 });
1159}
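
// For example, with InAllocTypes = {NotCold, None} and edges whose alloc types
// are {NotCold | Cold, Cold}, the helper returns true: the first pair agrees
// once allocTypeToUse collapses NotCold | Cold to NotCold, and the None entry
// matches anything. With InAllocTypes = {Cold, None} it would return false on
// the first pair.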
1160
1161// Helper to check if the alloc types for all edges recorded in the
1162// InAllocTypes vector match the alloc types for callee edges in the given
1163// clone. Because the InAllocTypes were computed from the original node's callee
1164// edges, and other cloning could have happened after this clone was created, we
1165// need to find the matching clone callee edge, which may or may not exist.
1166template <typename DerivedCCG, typename FuncTy, typename CallTy>
1167bool allocTypesMatchClone(
1168 const std::vector<uint8_t> &InAllocTypes,
1169 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1170 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1171 assert(Node);
1172 // InAllocTypes should have been computed for the original node's callee
1173 // edges.
1174 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1175 // First create a map of the clone callee edge callees to the edge alloc type.
1176 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1177 EdgeCalleeMap;
1178 for (const auto &E : Clone->CalleeEdges) {
1179 assert(!EdgeCalleeMap.contains(E->Callee));
1180 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1181 }
1182 // Next, walk the original node's callees, and look for the corresponding
1183 // clone edge to that callee.
1184 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1185 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1186 // Not found is ok, we will simply add an edge if we use this clone.
1187 if (Iter == EdgeCalleeMap.end())
1188 continue;
1189 // Can share if one of the edges is None type - don't
1190 // care about the type along that edge as it doesn't
1191 // exist for those context ids.
1192 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1193 Iter->second == (uint8_t)AllocationType::None)
1194 continue;
    if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1196 return false;
1197 }
1198 return true;
1199}
1200
1201} // end anonymous namespace
1202
1203template <typename DerivedCCG, typename FuncTy, typename CallTy>
1204typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1205CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1206 const CallInfo &C) {
1207 ContextNode *Node = getNodeForAlloc(C);
1208 if (Node)
1209 return Node;
1210
1211 return NonAllocationCallToContextNodeMap.lookup(C);
1212}
1213
1214template <typename DerivedCCG, typename FuncTy, typename CallTy>
1215typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1216CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1217 const CallInfo &C) {
1218 return AllocationCallToContextNodeMap.lookup(C);
1219}
1220
1221template <typename DerivedCCG, typename FuncTy, typename CallTy>
1222typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1223CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1224 uint64_t StackId) {
1225 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1226 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1227 return StackEntryNode->second;
1228 return nullptr;
1229}
1230
1231template <typename DerivedCCG, typename FuncTy, typename CallTy>
1232void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1233 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1234 unsigned int ContextId) {
1235 for (auto &Edge : CallerEdges) {
1236 if (Edge->Caller == Caller) {
1237 Edge->AllocTypes |= (uint8_t)AllocType;
1238 Edge->getContextIds().insert(ContextId);
1239 return;
1240 }
1241 }
1242 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1243 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1244 CallerEdges.push_back(Edge);
1245 Caller->CalleeEdges.push_back(Edge);
1246}
1247
1248template <typename DerivedCCG, typename FuncTy, typename CallTy>
1249void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1250 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1251 assert(!EI || (*EI)->get() == Edge);
1252 assert(!Edge->isRemoved());
1253 // Save the Caller and Callee pointers so we can erase Edge from their edge
1254 // lists after clearing Edge below. We do the clearing first in case it is
1255 // destructed after removing from the edge lists (if those were the last
1256 // shared_ptr references to Edge).
1257 auto *Callee = Edge->Callee;
1258 auto *Caller = Edge->Caller;
1259
1260 // Make sure the edge fields are cleared out so we can properly detect
1261 // removed edges if Edge is not destructed because there is still a shared_ptr
1262 // reference.
1263 Edge->clear();
1264
1265#ifndef NDEBUG
1266 auto CalleeCallerCount = Callee->CallerEdges.size();
1267 auto CallerCalleeCount = Caller->CalleeEdges.size();
1268#endif
1269 if (!EI) {
1270 Callee->eraseCallerEdge(Edge);
1271 Caller->eraseCalleeEdge(Edge);
1272 } else if (CalleeIter) {
1273 Callee->eraseCallerEdge(Edge);
1274 *EI = Caller->CalleeEdges.erase(*EI);
1275 } else {
1276 Caller->eraseCalleeEdge(Edge);
1277 *EI = Callee->CallerEdges.erase(*EI);
1278 }
1279 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1280 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1281}
1282
1283template <typename DerivedCCG, typename FuncTy, typename CallTy>
1284void CallsiteContextGraph<
1285 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1286 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1287 auto Edge = *EI;
1288 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1289 assert(Edge->ContextIds.empty());
1290 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1291 } else
1292 ++EI;
1293 }
1294}
1295
1296template <typename DerivedCCG, typename FuncTy, typename CallTy>
1297void CallsiteContextGraph<
1298 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1299 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1300 auto Edge = *EI;
1301 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1302 assert(Edge->ContextIds.empty());
1303 Edge->Caller->eraseCalleeEdge(Edge.get());
1304 EI = Node->CallerEdges.erase(EI);
1305 } else
1306 ++EI;
1307 }
1308}
1309
1310template <typename DerivedCCG, typename FuncTy, typename CallTy>
1311typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1312CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1313 findEdgeFromCallee(const ContextNode *Callee) {
1314 for (const auto &Edge : CalleeEdges)
1315 if (Edge->Callee == Callee)
1316 return Edge.get();
1317 return nullptr;
1318}
1319
1320template <typename DerivedCCG, typename FuncTy, typename CallTy>
1321typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1322CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1323 findEdgeFromCaller(const ContextNode *Caller) {
1324 for (const auto &Edge : CallerEdges)
1325 if (Edge->Caller == Caller)
1326 return Edge.get();
1327 return nullptr;
1328}
1329
1330template <typename DerivedCCG, typename FuncTy, typename CallTy>
1331void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1332 eraseCalleeEdge(const ContextEdge *Edge) {
1333 auto EI = llvm::find_if(
1334 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1335 return CalleeEdge.get() == Edge;
1336 });
1337 assert(EI != CalleeEdges.end());
1338 CalleeEdges.erase(EI);
1339}
1340
1341template <typename DerivedCCG, typename FuncTy, typename CallTy>
1342void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1343 eraseCallerEdge(const ContextEdge *Edge) {
1344 auto EI = llvm::find_if(
1345 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1346 return CallerEdge.get() == Edge;
1347 });
1348 assert(EI != CallerEdges.end());
1349 CallerEdges.erase(EI);
1350}
1351
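// Computes the aggregate allocation type (bitwise OR of the per-context types)
// for the given context ids, bailing out early once both Cold and NotCold have
// been seen. For illustration, two hypothetical ids mapping to NotCold and
// Cold respectively would yield the combined NotCold|Cold mask.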
1352template <typename DerivedCCG, typename FuncTy, typename CallTy>
1353uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1354 DenseSet<uint32_t> &ContextIds) const {
1355 uint8_t BothTypes =
1356 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1357 uint8_t AllocType = (uint8_t)AllocationType::None;
1358 for (auto Id : ContextIds) {
1359 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1360 // Bail early if alloc type reached both, no further refinement.
1361 if (AllocType == BothTypes)
1362 return AllocType;
1363 }
1364 return AllocType;
1365}
1366
1367template <typename DerivedCCG, typename FuncTy, typename CallTy>
1368uint8_t
1369CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1370 const DenseSet<uint32_t> &Node1Ids,
1371 const DenseSet<uint32_t> &Node2Ids) const {
1372 uint8_t BothTypes =
1373 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1374 uint8_t AllocType = (uint8_t)AllocationType::None;
1375 for (auto Id : Node1Ids) {
1376 if (!Node2Ids.count(Id))
1377 continue;
1378 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1379 // Bail early if alloc type reached both, no further refinement.
1380 if (AllocType == BothTypes)
1381 return AllocType;
1382 }
1383 return AllocType;
1384}
1385
1386template <typename DerivedCCG, typename FuncTy, typename CallTy>
1387uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1388 const DenseSet<uint32_t> &Node1Ids,
1389 const DenseSet<uint32_t> &Node2Ids) const {
1390 if (Node1Ids.size() < Node2Ids.size())
1391 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1392 else
1393 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1394}
1395
1396template <typename DerivedCCG, typename FuncTy, typename CallTy>
1397typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1398CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1399 CallInfo Call, const FuncTy *F) {
1400 assert(!getNodeForAlloc(Call));
1401 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1402 AllocationCallToContextNodeMap[Call] = AllocNode;
1403 // Use LastContextId as a unique id for MIB allocation nodes.
1404 AllocNode->OrigStackOrAllocId = LastContextId;
1405 // Alloc type should be updated as we add in the MIBs. We should assert
1406 // afterwards that it is not still None.
1407 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1408
1409 return AllocNode;
1410}
1411
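// Returns a printable name for an alloc type mask, used in debug dumps and dot
// output: "None", "NotCold", "Cold", or "NotColdCold" when both bits are set.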
1412static std::string getAllocTypeString(uint8_t AllocTypes) {
1413 if (!AllocTypes)
1414 return "None";
1415 std::string Str;
1416 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1417 Str += "NotCold";
1418 if (AllocTypes & (uint8_t)AllocationType::Cold)
1419 Str += "Cold";
1420 return Str;
1421}
1422
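// Records one MIB of an allocation in the graph: assigns it a new context id,
// then walks its (pruned) stack context from the allocation outwards, creating
// or reusing a ContextNode per stack id and adding or updating caller edges
// tagged with the new id and alloc type. E.g. for a hypothetical context with
// stack ids {S1, S2, S3} this produces the caller chain
// AllocNode <- node(S1) <- node(S2) <- node(S3).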
1423template <typename DerivedCCG, typename FuncTy, typename CallTy>
1424template <class NodeT, class IteratorT>
1425void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1426 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1427 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1428 ArrayRef<ContextTotalSize> ContextSizeInfo,
1429 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1430 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1431 // is done.
1432 if (AllocType == AllocationType::Hot)
1433 AllocType = AllocationType::NotCold;
1434
1435 ContextIdToAllocationType[++LastContextId] = AllocType;
1436
1437 bool IsImportant = false;
1438 if (!ContextSizeInfo.empty()) {
1439 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1440 // If this is a cold allocation, and we are collecting non-zero largest
1441 // contexts, see if this is a candidate.
1442 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1443 uint64_t TotalCold = 0;
1444 for (auto &CSI : ContextSizeInfo)
1445 TotalCold += CSI.TotalSize;
1446 // Record this context if we haven't yet found the top-n largest
1447 // contexts, or if it is larger than the smallest already recorded.
1448 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1449 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1450 // sorted in ascending size of its key which is the size.
1451 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1452 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1453 // Remove old one and its associated entries.
1454 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1455 TotalSizeToContextIdTopNCold.erase(
1456 TotalSizeToContextIdTopNCold.begin());
1457 assert(ImportantContextIdInfo.count(IdToRemove));
1458 ImportantContextIdInfo.erase(IdToRemove);
1459 }
1460 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1461 IsImportant = true;
1462 }
1463 }
1464 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1465 }
1466
1467 // Update alloc type and context ids for this MIB.
1468 AllocNode->AllocTypes |= (uint8_t)AllocType;
1469
1470 // Now add or update nodes for each stack id in alloc's context.
1471 // Later when processing the stack ids on non-alloc callsites we will adjust
1472 // for any inlining in the context.
1473 ContextNode *PrevNode = AllocNode;
1474 // Look for recursion (direct recursion should have been collapsed by
1475 // module summary analysis, here we should just be detecting mutual
1476 // recursion). Mark these nodes so we don't try to clone.
1477 SmallSet<uint64_t, 8> StackIdSet;
1478 // Skip any on the allocation call (inlining).
1479 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1480 ContextIter != StackContext.end(); ++ContextIter) {
1481 auto StackId = getStackId(*ContextIter);
1482 if (IsImportant)
1483 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1484 ContextNode *StackNode = getNodeForStackId(StackId);
1485 if (!StackNode) {
1486 StackNode = createNewNode(/*IsAllocation=*/false);
1487 StackEntryIdToContextNodeMap[StackId] = StackNode;
1488 StackNode->OrigStackOrAllocId = StackId;
1489 }
1490 // Marking a node recursive will prevent its cloning completely, even for
1491 // non-recursive contexts flowing through it.
1492 if (!AllowRecursiveCallsites) {
1493 auto Ins = StackIdSet.insert(StackId);
1494 if (!Ins.second)
1495 StackNode->Recursive = true;
1496 }
1497 StackNode->AllocTypes |= (uint8_t)AllocType;
1498 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1499 PrevNode = StackNode;
1500 }
1501}
1502
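// Creates a fresh duplicate context id for every id in StackSequenceContextIds,
// copying the allocation type and any context size info, and records the
// old-to-new mapping so the duplicates can later be propagated up the graph by
// propagateDuplicateContextIds.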
1503template <typename DerivedCCG, typename FuncTy, typename CallTy>
1504DenseSet<uint32_t>
1505CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1506 const DenseSet<uint32_t> &StackSequenceContextIds,
1507 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1508 DenseSet<uint32_t> NewContextIds;
1509 for (auto OldId : StackSequenceContextIds) {
1510 NewContextIds.insert(++LastContextId);
1511 OldToNewContextIds[OldId].insert(LastContextId);
1512 assert(ContextIdToAllocationType.count(OldId));
1513 // The new context has the same allocation type and size info as original.
1514 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1515 auto CSI = ContextIdToContextSizeInfos.find(OldId);
1516 if (CSI != ContextIdToContextSizeInfos.end())
1517 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1518 if (DotAllocContextIds.contains(OldId))
1519 DotAllocContextIds.insert(LastContextId);
1520 }
1521 return NewContextIds;
1522}
1523
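// Propagates the duplicated context ids produced by duplicateContextIds: walks
// caller edges upwards from each allocation node and, wherever an edge carries
// an id that was duplicated, adds the corresponding new ids to that edge and
// recurses into its caller.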
1524template <typename DerivedCCG, typename FuncTy, typename CallTy>
1525void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1526 propagateDuplicateContextIds(
1527 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1528 // Build a set of duplicated context ids corresponding to the input id set.
1529 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1530 DenseSet<uint32_t> NewIds;
1531 for (auto Id : ContextIds)
1532 if (auto NewId = OldToNewContextIds.find(Id);
1533 NewId != OldToNewContextIds.end())
1534 NewIds.insert_range(NewId->second);
1535 return NewIds;
1536 };
1537
1538 // Recursively update context ids sets along caller edges.
1539 auto UpdateCallers = [&](ContextNode *Node,
1540 DenseSet<const ContextEdge *> &Visited,
1541 auto &&UpdateCallers) -> void {
1542 for (const auto &Edge : Node->CallerEdges) {
1543 auto Inserted = Visited.insert(Edge.get());
1544 if (!Inserted.second)
1545 continue;
1546 ContextNode *NextNode = Edge->Caller;
1547 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1548 // Only need to recursively iterate to NextNode via this caller edge if
1549 // it resulted in any added ids to NextNode.
1550 if (!NewIdsToAdd.empty()) {
1551 Edge->getContextIds().insert_range(NewIdsToAdd);
1552 UpdateCallers(NextNode, Visited, UpdateCallers);
1553 }
1554 }
1555 };
1556
1557 DenseSet<const ContextEdge *> Visited;
1558 for (auto &Entry : AllocationCallToContextNodeMap) {
1559 auto *Node = Entry.second;
1560 UpdateCallers(Node, Visited, UpdateCallers);
1561 }
1562}
1563
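// Moves the context ids in RemainingContextIds from OrigNode's callee or caller
// edges (per TowardsCallee) onto new edges attached to NewNode, removing any
// original edge that is left with an empty id set.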
1564template <typename DerivedCCG, typename FuncTy, typename CallTy>
1565void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1566 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1567 // This must be passed by value to make a copy since it will be adjusted
1568 // as ids are moved.
1569 DenseSet<uint32_t> RemainingContextIds) {
1570 auto &OrigEdges =
1571 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1572 DenseSet<uint32_t> RecursiveContextIds;
1573 DenseSet<uint32_t> AllCallerContextIds;
1574 if (AllowRecursiveCallsites) {
1575 // Identify which context ids are recursive which is needed to properly
1576 // update the RemainingContextIds set. The relevant recursive context ids
1577 // are those that are in multiple edges.
1578 for (auto &CE : OrigEdges) {
1579 AllCallerContextIds.reserve(CE->getContextIds().size());
1580 for (auto Id : CE->getContextIds())
1581 if (!AllCallerContextIds.insert(Id).second)
1582 RecursiveContextIds.insert(Id);
1583 }
1584 }
1585 // Increment iterator in loop so that we can remove edges as needed.
1586 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1587 auto Edge = *EI;
1588 DenseSet<uint32_t> NewEdgeContextIds;
1589 DenseSet<uint32_t> NotFoundContextIds;
1590 // Remove any matching context ids from Edge, return set that were found and
1591 // removed, these are the new edge's context ids. Also update the remaining
1592 // (not found ids).
1593 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1594 NotFoundContextIds);
1595 // Update the remaining context ids set for the later edges. This is a
1596 // compile time optimization.
1597 if (RecursiveContextIds.empty()) {
1598 // No recursive ids, so all of the previously remaining context ids that
1599 // were not seen on this edge are the new remaining set.
1600 RemainingContextIds.swap(NotFoundContextIds);
1601 } else {
1602 // Keep the recursive ids in the remaining set as we expect to see those
1603 // on another edge. We can remove the non-recursive remaining ids that
1604 // were seen on this edge, however. We already have the set of remaining
1605 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1606 // non-recursive and only remove those. Note that despite the higher
1607 // overhead of updating the remaining context ids set when recursion
1608 // handling is enabled, it was found to be at worst performance neutral
1609 // and in one case a clear win.
1610 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1611 set_difference(NewEdgeContextIds, RecursiveContextIds);
1612 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1613 }
1614 // If no matching context ids for this edge, skip it.
1615 if (NewEdgeContextIds.empty()) {
1616 ++EI;
1617 continue;
1618 }
1619 if (TowardsCallee) {
1620 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1621 auto NewEdge = std::make_shared<ContextEdge>(
1622 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1623 NewNode->CalleeEdges.push_back(NewEdge);
1624 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1625 } else {
1626 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1627 auto NewEdge = std::make_shared<ContextEdge>(
1628 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1629 NewNode->CallerEdges.push_back(NewEdge);
1630 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1631 }
1632 // Remove old edge if context ids empty.
1633 if (Edge->getContextIds().empty()) {
1634 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1635 continue;
1636 }
1637 ++EI;
1638 }
1639}
1640
1641template <typename DerivedCCG, typename FuncTy, typename CallTy>
1642static void checkEdge(
1643 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1644 // Confirm that alloc type is not None and that we have at least one context
1645 // id.
1646 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1647 assert(!Edge->ContextIds.empty());
1648}
1649
1650template <typename DerivedCCG, typename FuncTy, typename CallTy>
1651static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1652 bool CheckEdges = true) {
1653 if (Node->isRemoved())
1654 return;
1655#ifndef NDEBUG
1656 // Compute node's context ids once for use in asserts.
1657 auto NodeContextIds = Node->getContextIds();
1658#endif
1659 // Node's context ids should be the union of both its callee and caller edge
1660 // context ids.
1661 if (Node->CallerEdges.size()) {
1662 DenseSet<uint32_t> CallerEdgeContextIds(
1663 Node->CallerEdges.front()->ContextIds);
1664 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1665 if (CheckEdges)
1666 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1667 set_union(CallerEdgeContextIds, Edge->ContextIds);
1668 }
1669 // Node can have more context ids than callers if some contexts terminate at
1670 // node and some are longer. If we are allowing recursive callsites and
1671 // contexts this will be violated for incompletely cloned recursive cycles,
1672 // so skip the checking in that case.
1673 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1674 NodeContextIds == CallerEdgeContextIds ||
1675 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1676 }
1677 if (Node->CalleeEdges.size()) {
1678 DenseSet<uint32_t> CalleeEdgeContextIds(
1679 Node->CalleeEdges.front()->ContextIds);
1680 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1681 if (CheckEdges)
1682 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1683 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1684 }
1685 // If we are allowing recursive callsites and contexts this will be violated
1686 // for incompletely cloned recursive cycles, so skip the checking in that
1687 // case.
1688 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1689 NodeContextIds == CalleeEdgeContextIds);
1690 }
1691 // FIXME: Since this checking is only invoked under an option, we should
1692 // change the error checking from using assert to something that will trigger
1693 // an error on a release build.
1694#ifndef NDEBUG
1695 // Make sure we don't end up with duplicate edges between the same caller and
1696 // callee.
1697 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1698 for (const auto &E : Node->CalleeEdges)
1699 NodeSet.insert(E->Callee);
1700 assert(NodeSet.size() == Node->CalleeEdges.size());
1701#endif
1702}
1703
1704template <typename DerivedCCG, typename FuncTy, typename CallTy>
1705void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1706 assignStackNodesPostOrder(ContextNode *Node,
1707 DenseSet<const ContextNode *> &Visited,
1708 DenseMap<uint64_t, std::vector<CallContextInfo>>
1709 &StackIdToMatchingCalls,
1710 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1711 const DenseSet<uint32_t> &ImportantContextIds) {
1712 auto Inserted = Visited.insert(Node);
1713 if (!Inserted.second)
1714 return;
1715 // Post order traversal. Iterate over a copy since we may add nodes and
1716 // therefore new callers during the recursive call, invalidating any
1717 // iterator over the original edge vector. We don't need to process these
1718 // new nodes as they were already processed on creation.
1719 auto CallerEdges = Node->CallerEdges;
1720 for (auto &Edge : CallerEdges) {
1721 // Skip any that have been removed during the recursion.
1722 if (Edge->isRemoved()) {
1723 assert(!is_contained(Node->CallerEdges, Edge));
1724 continue;
1725 }
1726 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1727 CallToMatchingCall, ImportantContextIds);
1728 }
1729
1730 // If this node's stack id is in the map, update the graph to contain new
1731 // nodes representing any inlining at interior callsites. Note we move the
1732 // associated context ids over to the new nodes.
1733
1734 // Ignore this node if it is for an allocation or we didn't record any
1735 // stack id lists ending at it.
1736 if (Node->IsAllocation ||
1737 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1738 return;
1739
1740 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1741 // Handle the simple case first. A single call with a single stack id.
1742 // In this case there is no need to create any new context nodes, simply
1743 // assign the context node for stack id to this Call.
1744 if (Calls.size() == 1) {
1745 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1746 if (Ids.size() == 1) {
1747 assert(SavedContextIds.empty());
1748 // It should be this Node
1749 assert(Node == getNodeForStackId(Ids[0]));
1750 if (Node->Recursive)
1751 return;
1752 Node->setCall(Call);
1753 NonAllocationCallToContextNodeMap[Call] = Node;
1754 NodeToCallingFunc[Node] = Func;
1755 recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
1756 return;
1757 }
1758 }
1759
1760#ifndef NDEBUG
1761 // Find the node for the last stack id, which should be the same
1762 // across all calls recorded for this id, and is this node's id.
1763 uint64_t LastId = Node->OrigStackOrAllocId;
1764 ContextNode *LastNode = getNodeForStackId(LastId);
1765 // We should only have kept stack ids that had nodes.
1766 assert(LastNode);
1767 assert(LastNode == Node);
1768#else
1769 ContextNode *LastNode = Node;
1770#endif
1771
1772 // Compute the last node's context ids once, as it is shared by all calls in
1773 // this entry.
1774 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1775
1776 [[maybe_unused]] bool PrevIterCreatedNode = false;
1777 bool CreatedNode = false;
1778 for (unsigned I = 0; I < Calls.size();
1779 I++, PrevIterCreatedNode = CreatedNode) {
1780 CreatedNode = false;
1781 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1782 // Skip any for which we didn't assign any ids, these don't get a node in
1783 // the graph.
1784 if (SavedContextIds.empty()) {
1785 // If this call has a matching call (located in the same function and
1786 // having the same stack ids), simply add it to the context node created
1787 // for its matching call earlier. These can be treated the same through
1788 // cloning and get updated at the same time.
1789 if (!CallToMatchingCall.contains(Call))
1790 continue;
1791 auto MatchingCall = CallToMatchingCall[Call];
1792 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1793 // This should only happen if we had a prior iteration, and it didn't
1794 // create a node because of the below recomputation of context ids
1795 // finding none remaining and continuing early.
1796 assert(I > 0 && !PrevIterCreatedNode);
1797 continue;
1798 }
1799 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1800 Call);
1801 continue;
1802 }
1803
1804 assert(LastId == Ids.back());
1805
1806 // Recompute the context ids for this stack id sequence (the
1807 // intersection of the context ids of the corresponding nodes).
1808 // Start with the ids we saved in the map for this call, which could be
1809 // duplicated context ids. We have to recompute as we might have overlap
1810 // between the saved context ids for different last nodes, and
1811 // removed them already during the post order traversal.
1812 set_intersect(SavedContextIds, LastNodeContextIds);
1813 ContextNode *PrevNode = LastNode;
1814 bool Skip = false;
1815 // Iterate backwards through the stack Ids, starting after the last Id
1816 // in the list, which was handled once outside for all Calls.
1817 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1818 auto Id = *IdIter;
1819 ContextNode *CurNode = getNodeForStackId(Id);
1820 // We should only have kept stack ids that had nodes and weren't
1821 // recursive.
1822 assert(CurNode);
1823 assert(!CurNode->Recursive);
1824
1825 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1826 if (!Edge) {
1827 Skip = true;
1828 break;
1829 }
1830 PrevNode = CurNode;
1831
1832 // Update the context ids, which is the intersection of the ids along
1833 // all edges in the sequence.
1834 set_intersect(SavedContextIds, Edge->getContextIds());
1835
1836 // If we now have no context ids for clone, skip this call.
1837 if (SavedContextIds.empty()) {
1838 Skip = true;
1839 break;
1840 }
1841 }
1842 if (Skip)
1843 continue;
1844
1845 // Create new context node.
1846 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1847 NonAllocationCallToContextNodeMap[Call] = NewNode;
1848 CreatedNode = true;
1849 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1850
1851 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1852 assert(FirstNode);
1853
1854 // Connect to callees of innermost stack frame in inlined call chain.
1855 // This updates context ids for FirstNode's callee's to reflect those
1856 // moved to NewNode.
1857 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1858
1859 // Connect to callers of outermost stack frame in inlined call chain.
1860 // This updates context ids for FirstNode's caller's to reflect those
1861 // moved to NewNode.
1862 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1863
1864 // Now we need to remove context ids from edges/nodes between First and
1865 // Last Node.
1866 PrevNode = nullptr;
1867 for (auto Id : Ids) {
1868 ContextNode *CurNode = getNodeForStackId(Id);
1869 // We should only have kept stack ids that had nodes.
1870 assert(CurNode);
1871
1872 // Remove the context ids moved to NewNode from CurNode, and the
1873 // edge from the prior node.
1874 if (PrevNode) {
1875 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1876 // If the sequence contained recursion, we might have already removed
1877 // some edges during the connectNewNode calls above.
1878 if (!PrevEdge) {
1879 PrevNode = CurNode;
1880 continue;
1881 }
1882 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1883 if (PrevEdge->getContextIds().empty())
1884 removeEdgeFromGraph(PrevEdge);
1885 }
1886 // Since we update the edges from leaf to tail, only look at the callee
1887 // edges. This isn't an alloc node, so if there are no callee edges, the
1888 // alloc type is None.
1889 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1890 ? (uint8_t)AllocationType::None
1891 : CurNode->computeAllocType();
1892 PrevNode = CurNode;
1893 }
1894
1895 recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
1896
1897 if (VerifyNodes) {
1898 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1899 for (auto Id : Ids) {
1900 ContextNode *CurNode = getNodeForStackId(Id);
1901 // We should only have kept stack ids that had nodes.
1902 assert(CurNode);
1903 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1904 }
1905 }
1906 }
1907}
1908
1909template <typename DerivedCCG, typename FuncTy, typename CallTy>
1910void CallsiteContextGraph<DerivedCCG, FuncTy,
1911 CallTy>::fixupImportantContexts() {
1912 if (ImportantContextIdInfo.empty())
1913 return;
1914
1915 // Update statistics as we are done building this map at this point.
1916 NumImportantContextIds = ImportantContextIdInfo.size();
1917
1918 if (!MemProfFixupImportant)
1919 return;
1920
1921 if (ExportToDot)
1922 exportToDot("beforestackfixup");
1923
1924 // For each context we identified as important, walk through the saved context
1925 // stack ids in order from leaf upwards, and make sure all edges are correct.
1926 // These can be difficult to get right when updating the graph while mapping
1927 // nodes onto summary or IR, especially when there is recursion. In
1928 // particular, when we have created new nodes to reflect inlining, it is
1929 // sometimes impossible to know exactly how to update the edges in the face of
1930 // recursion, as we have lost the original ordering of the stack ids in the
1931 // contexts.
1932 // TODO: Consider only doing this if we detect the context has recursive
1933 // cycles.
1934 //
1935 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1936 // and let's say A was inlined into B, C, and D. The original graph will have
1937 // multiple recursive cycles through A. When we match the original context
1938 // nodes onto the IR or summary, we will merge {A B} into one context node,
1939 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1940 // above, we should end up with a non-cyclic set of edges like:
1941 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1942 // original ordering, we won't get the edges correct initially (it's
1943 // impossible without the original ordering). Here we do the fixup (add and
1944 // removing edges where necessary) for this context. In the
1945 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1946 // and map entries for {A B}, {A C}, {A D}, and {E}.
1947 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1948 if (Info.StackIdsToNode.empty())
1949 continue;
1950 bool Changed = false;
1951 ContextNode *PrevNode = nullptr;
1952 ContextNode *CurNode = nullptr;
1953 DenseSet<const ContextEdge *> VisitedEdges;
1954 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1955 // Try to identify what callsite ContextNode maps to which slice of the
1956 // context's ordered stack ids.
1957 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1958 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1959 // see if we recorded a context node for that sequence.
1960 auto Len = Info.MaxLength;
1961 auto LenToEnd = AllStackIds.size() - I;
1962 if (Len > LenToEnd)
1963 Len = LenToEnd;
1964 CurNode = nullptr;
1965 // Try to find a recorded context node starting with the longest length
1966 // recorded, and on down until we check for just a single stack node.
1967 for (; Len > 0; Len--) {
1968 // Get the slice of the original stack id sequence to check.
1969 auto CheckStackIds = AllStackIds.slice(I, Len);
1970 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1971 if (EntryIt == Info.StackIdsToNode.end())
1972 continue;
1973 CurNode = EntryIt->second;
1974 // Skip forward so we don't try to look for the ones we just matched.
1975 // We increment by Len - 1, because the outer for loop will increment I.
1976 I += Len - 1;
1977 break;
1978 }
1979 // Give up if we couldn't find a node. Since we need to clone from the
1980 // leaf allocation upwards, no sense in doing any more fixup further up
1981 // the context if we couldn't match part of the original stack context
1982 // onto a callsite node.
1983 if (!CurNode)
1984 break;
1985 // No edges to fix up until we have a pair of nodes that should be
1986 // adjacent in the graph.
1987 if (!PrevNode)
1988 continue;
1989 // See if we already have a call edge from CurNode to PrevNode.
1990 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1991 if (CurEdge) {
1992 // We already have an edge. Make sure it contains this context id.
1993 if (CurEdge->getContextIds().insert(CurContextId).second) {
1994 NumFixupEdgeIdsInserted++;
1995 Changed = true;
1996 }
1997 } else {
1998 // No edge exists - add one.
1999 NumFixupEdgesAdded++;
2000 DenseSet<uint32_t> ContextIds({CurContextId});
2001 auto AllocType = computeAllocType(ContextIds);
2002 auto NewEdge = std::make_shared<ContextEdge>(
2003 PrevNode, CurNode, AllocType, std::move(ContextIds));
2004 PrevNode->CallerEdges.push_back(NewEdge);
2005 CurNode->CalleeEdges.push_back(NewEdge);
2006 // Save the new edge for the below handling.
2007 CurEdge = NewEdge.get();
2008 Changed = true;
2009 }
2010 VisitedEdges.insert(CurEdge);
2011 // Now remove this context id from any other caller edges calling
2012 // PrevNode.
2013 for (auto &Edge : PrevNode->CallerEdges) {
2014 // Skip the edge updating/created above and edges we have already
2015 // visited (due to recursion).
2016 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2017 Edge->getContextIds().erase(CurContextId);
2018 }
2019 }
2020 if (Changed)
2021 NumFixedContexts++;
2022 }
2023}
2024
2025template <typename DerivedCCG, typename FuncTy, typename CallTy>
2026void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2027 // Map of stack id to all calls with that as the last (outermost caller)
2028 // callsite id that has a context node (some might not due to pruning
2029 // performed during matching of the allocation profile contexts).
2030 // The CallContextInfo contains the Call and a list of its stack ids with
2031 // ContextNodes, the function containing Call, and the set of context ids
2032 // the analysis will eventually identify for use in any new node created
2033 // for that callsite.
2034 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2035 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2036 for (auto &Call : CallsWithMetadata) {
2037 // Ignore allocations, already handled.
2038 if (AllocationCallToContextNodeMap.count(Call))
2039 continue;
2040 auto StackIdsWithContextNodes =
2041 getStackIdsWithContextNodesForCall(Call.call());
2042 // If there were no nodes created for MIBs on allocs (maybe this was in
2043 // the unambiguous part of the MIB stack that was pruned), ignore.
2044 if (StackIdsWithContextNodes.empty())
2045 continue;
2046 // Otherwise, record this Call along with the list of ids for the last
2047 // (outermost caller) stack id with a node.
2048 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2049 {Call.call(), StackIdsWithContextNodes, Func, {}});
2050 }
2051 }
2052
2053 // First make a pass through all stack ids that correspond to a call,
2054 // as identified in the above loop. Compute the context ids corresponding to
2055 // each of these calls when they correspond to multiple stack ids due to
2056 // inlining. Perform any duplication of context ids required when
2057 // there is more than one call with the same stack ids. Their (possibly newly
2058 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2059 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2060 // Save a map from each call to any that are found to match it. I.e. located
2061 // in the same function and have the same (possibly pruned) stack ids. We use
2062 // this to avoid creating extra graph nodes as they can be treated the same.
2063 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2064 for (auto &It : StackIdToMatchingCalls) {
2065 auto &Calls = It.getSecond();
2066 // Skip single calls with a single stack id. These don't need a new node.
2067 if (Calls.size() == 1) {
2068 auto &Ids = Calls[0].StackIds;
2069 if (Ids.size() == 1)
2070 continue;
2071 }
2072 // In order to do the best and maximal matching of inlined calls to context
2073 // node sequences we will sort the vectors of stack ids in descending order
2074 // of length, and within each length, lexicographically by stack id. The
2075 // latter is so that we can specially handle calls that have identical stack
2076 // id sequences (either due to cloning or artificially because of the MIB
2077 // context pruning). Those with the same Ids are then sorted by function to
2078 // facilitate efficiently mapping them to the same context node.
2079 // Because the functions are pointers, to ensure a stable sort first assign
2080 // each function pointer to its first index in the Calls array, and then use
2081 // that to sort by.
2082 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2083 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2084 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2085 llvm::stable_sort(
2086 Calls,
2087 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2088 return A.StackIds.size() > B.StackIds.size() ||
2089 (A.StackIds.size() == B.StackIds.size() &&
2090 (A.StackIds < B.StackIds ||
2091 (A.StackIds == B.StackIds &&
2092 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2093 });
2094
2095 // Find the node for the last stack id, which should be the same
2096 // across all calls recorded for this id, and is the id for this
2097 // entry in the StackIdToMatchingCalls map.
2098 uint64_t LastId = It.getFirst();
2099 ContextNode *LastNode = getNodeForStackId(LastId);
2100 // We should only have kept stack ids that had nodes.
2101 assert(LastNode);
2102
2103 if (LastNode->Recursive)
2104 continue;
2105
2106 // Initialize the context ids with the last node's. We will subsequently
2107 // refine the context ids by computing the intersection along all edges.
2108 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2109 assert(!LastNodeContextIds.empty());
2110
2111#ifndef NDEBUG
2112 // Save the set of functions seen for a particular set of the same stack
2113 // ids. This is used to ensure that they have been correctly sorted to be
2114 // adjacent in the Calls list, since we rely on that to efficiently place
2115 // all such matching calls onto the same context node.
2116 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2117#endif
2118
2119 for (unsigned I = 0; I < Calls.size(); I++) {
2120 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2121 assert(SavedContextIds.empty());
2122 assert(LastId == Ids.back());
2123
2124#ifndef NDEBUG
2125 // If this call has a different set of ids than the last one, clear the
2126 // set used to ensure they are sorted properly.
2127 if (I > 0 && Ids != Calls[I - 1].StackIds)
2128 MatchingIdsFuncSet.clear();
2129#endif
2130
2131 // First compute the context ids for this stack id sequence (the
2132 // intersection of the context ids of the corresponding nodes).
2133 // Start with the remaining saved ids for the last node.
2134 assert(!LastNodeContextIds.empty());
2135 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2136
2137 ContextNode *PrevNode = LastNode;
2138 ContextNode *CurNode = LastNode;
2139 bool Skip = false;
2140
2141 // Iterate backwards through the stack Ids, starting after the last Id
2142 // in the list, which was handled once outside for all Calls.
2143 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2144 auto Id = *IdIter;
2145 CurNode = getNodeForStackId(Id);
2146 // We should only have kept stack ids that had nodes.
2147 assert(CurNode);
2148
2149 if (CurNode->Recursive) {
2150 Skip = true;
2151 break;
2152 }
2153
2154 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2155 // If there is no edge then the nodes belong to different MIB contexts,
2156 // and we should skip this inlined context sequence. For example, this
2157 // particular inlined context may include stack ids A->B, and we may
2158 // indeed have nodes for both A and B, but it is possible that they were
2159 // never profiled in sequence in a single MIB for any allocation (i.e.
2160 // we might have profiled an allocation that involves the callsite A,
2161 // but through a different one of its callee callsites, and we might
2162 // have profiled an allocation that involves callsite B, but reached
2163 // from a different caller callsite).
2164 if (!Edge) {
2165 Skip = true;
2166 break;
2167 }
2168 PrevNode = CurNode;
2169
2170 // Update the context ids, which is the intersection of the ids along
2171 // all edges in the sequence.
2172 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2173
2174 // If we now have no context ids for clone, skip this call.
2175 if (StackSequenceContextIds.empty()) {
2176 Skip = true;
2177 break;
2178 }
2179 }
2180 if (Skip)
2181 continue;
2182
2183 // If some of this call's stack ids did not have corresponding nodes (due
2184 // to pruning), don't include any context ids for contexts that extend
2185 // beyond these nodes. Otherwise we would be matching part of unrelated /
2186 // not fully matching stack contexts. To do this, subtract any context ids
2187 // found in caller nodes of the last node found above.
2188 if (Ids.back() != getLastStackId(Call)) {
2189 for (const auto &PE : LastNode->CallerEdges) {
2190 set_subtract(StackSequenceContextIds, PE->getContextIds());
2191 if (StackSequenceContextIds.empty())
2192 break;
2193 }
2194 // If we now have no context ids for clone, skip this call.
2195 if (StackSequenceContextIds.empty())
2196 continue;
2197 }
2198
2199#ifndef NDEBUG
2200 // If the prior call had the same stack ids this set would not be empty.
2201 // Check if we already have a call that "matches" because it is located
2202 // in the same function. If the Calls list was sorted properly we should
2203 // not encounter this situation as all such entries should be adjacent
2204 // and processed in bulk further below.
2205 assert(!MatchingIdsFuncSet.contains(Func));
2206
2207 MatchingIdsFuncSet.insert(Func);
2208#endif
2209
2210 // Check if the next set of stack ids is the same (since the Calls vector
2211 // of tuples is sorted by the stack ids we can just look at the next one).
2212 // If so, save them in the CallToMatchingCall map so that they get
2213 // assigned to the same context node, and skip them.
2214 bool DuplicateContextIds = false;
2215 for (unsigned J = I + 1; J < Calls.size(); J++) {
2216 auto &CallCtxInfo = Calls[J];
2217 auto &NextIds = CallCtxInfo.StackIds;
2218 if (NextIds != Ids)
2219 break;
2220 auto *NextFunc = CallCtxInfo.Func;
2221 if (NextFunc != Func) {
2222 // We have another Call with the same ids but that cannot share this
2223 // node, must duplicate ids for it.
2224 DuplicateContextIds = true;
2225 break;
2226 }
2227 auto &NextCall = CallCtxInfo.Call;
2228 CallToMatchingCall[NextCall] = Call;
2229 // Update I so that it gets incremented correctly to skip this call.
2230 I = J;
2231 }
2232
2233 // If we don't have duplicate context ids, then we can assign all the
2234 // context ids computed for the original node sequence to this call.
2235 // If there are duplicate calls with the same stack ids then we synthesize
2236 // new context ids that are duplicates of the originals. These are
2237 // assigned to SavedContextIds, which is a reference into the map entry
2238 // for this call, allowing us to access these ids later on.
2239 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2240 StackSequenceContextIds.size());
2241 SavedContextIds =
2242 DuplicateContextIds
2243 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2244 : StackSequenceContextIds;
2245 assert(!SavedContextIds.empty());
2246
2247 if (!DuplicateContextIds) {
2248 // Update saved last node's context ids to remove those that are
2249 // assigned to other calls, so that it is ready for the next call at
2250 // this stack id.
2251 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2252 if (LastNodeContextIds.empty())
2253 break;
2254 }
2255 }
2256 }
2257
2258 // Propagate the duplicate context ids over the graph.
2259 propagateDuplicateContextIds(OldToNewContextIds);
2260
2261 if (VerifyCCG)
2262 check();
2263
2264 // Now perform a post-order traversal over the graph, starting with the
2265 // allocation nodes, essentially processing nodes from callers to callees.
2266 // For any that contains an id in the map, update the graph to contain new
2267 // nodes representing any inlining at interior callsites. Note we move the
2268 // associated context ids over to the new nodes.
2269 DenseSet<const ContextNode *> Visited;
2270 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2271 ImportantContextIdInfo.keys());
2272 for (auto &Entry : AllocationCallToContextNodeMap)
2273 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2274 CallToMatchingCall, ImportantContextIds);
2275
2276 fixupImportantContexts();
2277
2278 if (VerifyCCG)
2279 check();
2280}
2281
2282uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2283 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2284 Call->getMetadata(LLVMContext::MD_callsite));
2285 return CallsiteContext.back();
2286}
2287
2288uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2289 assert(isa<CallsiteInfo *>(Call));
2290 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2291 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2292 // Need to convert index into stack id.
2293 return Index.getStackIdAtIndex(CallsiteContext.back());
2294}
2295
2296static const std::string MemProfCloneSuffix = ".memprof.";
2297
2298static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2299 // We use CloneNo == 0 to refer to the original version, which doesn't get
2300 // renamed with a suffix.
2301 if (!CloneNo)
2302 return Base.str();
2303 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2304}
2305
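// Clones created by this pass are named <original>.memprof.<N> (e.g. a
// hypothetical function "foo" cloned as version 2 becomes "foo.memprof.2").
// The helpers below detect such clones and recover the clone number from the
// suffix.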
2306static bool isMemProfClone(const Function &F) {
2307 return F.getName().contains(MemProfCloneSuffix);
2308}
2309
2310// Return the clone number of the given function by extracting it from the
2311// memprof suffix. Assumes the caller has already confirmed it is a memprof
2312// clone.
2313static unsigned getMemProfCloneNum(const Function &F) {
2314 assert(isMemProfClone(F));
2315 auto Pos = F.getName().find_last_of('.');
2316 assert(Pos > 0);
2317 unsigned CloneNo;
2318 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2319 assert(!Err);
2320 (void)Err;
2321 return CloneNo;
2322}
2323
2324std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2325 const Instruction *Call,
2326 unsigned CloneNo) const {
2327 return (Twine(Call->getFunction()->getName()) + " -> " +
2328 cast<CallBase>(Call)->getCalledFunction()->getName())
2329 .str();
2330}
2331
2332std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2333 const IndexCall &Call,
2334 unsigned CloneNo) const {
2335 auto VI = FSToVIMap.find(Func);
2336 assert(VI != FSToVIMap.end());
2337 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2338 if (isa<AllocInfo *>(Call))
2339 return CallerName + " -> alloc";
2340 else {
2341 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2342 return CallerName + " -> " +
2343 getMemProfFuncName(Callsite->Callee.name(),
2344 Callsite->Clones[CloneNo]);
2345 }
2346}
2347
2348std::vector<uint64_t>
2349ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2350 Instruction *Call) {
2351 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2352 Call->getMetadata(LLVMContext::MD_callsite));
2353 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2354 CallsiteContext);
2355}
2356
2357std::vector<uint64_t>
2358IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2359 assert(isa<CallsiteInfo *>(Call));
2360 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2361 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2362 return getStackIdsWithContextNodes<CallsiteInfo,
2363 SmallVector<unsigned>::const_iterator>(
2364 CallsiteContext);
2365}
2366
2367template <typename DerivedCCG, typename FuncTy, typename CallTy>
2368template <class NodeT, class IteratorT>
2369std::vector<uint64_t>
2370CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2371 CallStack<NodeT, IteratorT> &CallsiteContext) {
2372 std::vector<uint64_t> StackIds;
2373 for (auto IdOrIndex : CallsiteContext) {
2374 auto StackId = getStackId(IdOrIndex);
2375 ContextNode *Node = getNodeForStackId(StackId);
2376 if (!Node)
2377 break;
2378 StackIds.push_back(StackId);
2379 }
2380 return StackIds;
2381}
2382
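// Builds the callsite context graph directly from IR (regular LTO): each call
// with !memprof metadata becomes an allocation node whose MIB stack contexts
// are added via addStackNodesForMIB, calls with only !callsite metadata are
// recorded for the later matching in updateStackNodes, and the metadata is
// stripped once it has been consumed.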
2383ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2384 Module &M,
2385 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2386 : Mod(M), OREGetter(OREGetter) {
2387 // Map for keeping track of the largest cold contexts up to the number given
2388 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2389 // must be sorted.
2390 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2391 for (auto &F : M) {
2392 std::vector<CallInfo> CallsWithMetadata;
2393 for (auto &BB : F) {
2394 for (auto &I : BB) {
2395 if (!isa<CallBase>(I))
2396 continue;
2397 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2398 CallsWithMetadata.push_back(&I);
2399 auto *AllocNode = addAllocNode(&I, &F);
2400 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2401 assert(CallsiteMD);
2402 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2403 // Add all of the MIBs and their stack nodes.
2404 for (auto &MDOp : MemProfMD->operands()) {
2405 auto *MIBMD = cast<const MDNode>(MDOp);
2406 std::vector<ContextTotalSize> ContextSizeInfo;
2407 // Collect the context size information if it exists.
2408 if (MIBMD->getNumOperands() > 2) {
2409 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2410 MDNode *ContextSizePair =
2411 dyn_cast<MDNode>(MIBMD->getOperand(I));
2412 assert(ContextSizePair->getNumOperands() == 2);
2413 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2414 ContextSizePair->getOperand(0))
2415 ->getZExtValue();
2416 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2417 ContextSizePair->getOperand(1))
2418 ->getZExtValue();
2419 ContextSizeInfo.push_back({FullStackId, TotalSize});
2420 }
2421 }
2422 MDNode *StackNode = getMIBStackNode(MIBMD);
2423 assert(StackNode);
2424 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2425 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2426 AllocNode, StackContext, CallsiteContext,
2427 getMIBAllocType(MIBMD), ContextSizeInfo,
2428 TotalSizeToContextIdTopNCold);
2429 }
2430 // If exporting the graph to dot and an allocation id of interest was
2431 // specified, record all the context ids for this allocation node.
2432 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2433 DotAllocContextIds = AllocNode->getContextIds();
2434 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2435 // Memprof and callsite metadata on memory allocations no longer
2436 // needed.
2437 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2438 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2439 }
2440 // For callsite metadata, add to list for this function for later use.
2441 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2442 CallsWithMetadata.push_back(&I);
2443 }
2444 }
2445 }
2446 if (!CallsWithMetadata.empty())
2447 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2448 }
2449
2450 if (DumpCCG) {
2451 dbgs() << "CCG before updating call stack chains:\n";
2452 dbgs() << *this;
2453 }
2454
2455 if (ExportToDot)
2456 exportToDot("prestackupdate");
2457
2458 updateStackNodes();
2459
2460 if (ExportToDot)
2461 exportToDot("poststackupdate");
2462
2463 handleCallsitesWithMultipleTargets();
2464
2465 markBackedges();
2466
2467 // Strip off remaining callsite metadata, no longer needed.
2468 for (auto &FuncEntry : FuncToCallsWithMetadata)
2469 for (auto &Call : FuncEntry.second)
2470 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2471}
2472
2473// Finds the set of GUIDs for weak aliasees that are prevailing in different
2474// modules than any of their aliases. We need to handle these specially.
2475DenseSet<GlobalValue::GUID>
2476IndexCallsiteContextGraph::findAliaseeGUIDsPrevailingInDifferentModule() {
2477 DenseSet<GlobalValue::GUID> AliaseeGUIDs;
2478 for (auto &I : Index) {
2479 auto VI = Index.getValueInfo(I);
2480 for (auto &S : VI.getSummaryList()) {
2481 // We only care about aliases to functions.
2482 auto *AS = dyn_cast<AliasSummary>(S.get());
2483 if (!AS)
2484 continue;
2485 auto *AliaseeSummary = &AS->getAliasee();
2486 auto *AliaseeFS = dyn_cast<FunctionSummary>(AliaseeSummary);
2487 if (!AliaseeFS)
2488 continue;
2489 // Skip this summary if it is not for the prevailing symbol for this GUID.
2490 // The linker doesn't resolve local linkage values so don't check whether
2491 // those are prevailing.
2492 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2493 !isPrevailing(VI.getGUID(), S.get()))
2494 continue;
2495 // Prevailing aliasee could be in a different module only if it is weak.
2496 if (!GlobalValue::isWeakForLinker(AliaseeSummary->linkage()))
2497 continue;
2498 auto AliaseeGUID = AS->getAliaseeGUID();
2499 // If the aliasee copy in this module is not prevailing, record it.
2500 if (!isPrevailing(AliaseeGUID, AliaseeSummary))
2501 AliaseeGUIDs.insert(AliaseeGUID);
2502 }
2503 }
2504 AliaseesPrevailingInDiffModuleFromAlias += AliaseeGUIDs.size();
2505 return AliaseeGUIDs;
2506}
2507
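// Builds the callsite context graph from the ThinLTO combined summary: only
// prevailing function summaries are considered, allocation nodes are created
// from the summary AllocInfo records, and CallsiteInfo records are collected
// for the later matching in updateStackNodes.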
2508IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2509 ModuleSummaryIndex &Index,
2510 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2511 isPrevailing)
2512 : Index(Index), isPrevailing(isPrevailing) {
2513 // Since we use the aliasee summary info to create the necessary clones for
2514 // its aliases, conservatively skip recording the aliasee function's callsites
2515 // in the CCG for any that are prevailing in a different module than one of
2516 // its aliases. We could record the necessary information to do this in the
2517 // summary, but this case should not be common.
2518 DenseSet<GlobalValue::GUID> GUIDsToSkip =
2519 findAliaseeGUIDsPrevailingInDifferentModule();
2520 // Map for keeping track of the largest cold contexts up to the number given
2521 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2522 // must be sorted.
2523 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2524 for (auto &I : Index) {
2525 auto VI = Index.getValueInfo(I);
2526 if (GUIDsToSkip.contains(VI.getGUID()))
2527 continue;
2528 for (auto &S : VI.getSummaryList()) {
2529 // We should only add the prevailing nodes. Otherwise we may try to clone
2530 // in a weak copy that won't be linked (and may be different than the
2531 // prevailing version).
2532 // We only keep the memprof summary on the prevailing copy now when
2533 // building the combined index, as a space optimization, however don't
2534 // rely on this optimization. The linker doesn't resolve local linkage
2535 // values so don't check whether those are prevailing.
2536 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2537 !isPrevailing(VI.getGUID(), S.get()))
2538 continue;
2539 auto *FS = dyn_cast<FunctionSummary>(S.get());
2540 if (!FS)
2541 continue;
2542 std::vector<CallInfo> CallsWithMetadata;
2543 if (!FS->allocs().empty()) {
2544 for (auto &AN : FS->mutableAllocs()) {
2545 // This can happen because of recursion elimination handling that
2546 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2547 // We still added them to the summary because we need to be able to
2548 // correlate properly in applyImport in the backends.
2549 if (AN.MIBs.empty())
2550 continue;
2551 IndexCall AllocCall(&AN);
2552 CallsWithMetadata.push_back(AllocCall);
2553 auto *AllocNode = addAllocNode(AllocCall, FS);
2554 // Pass an empty CallStack to the CallsiteContext (second)
2555 // parameter, since for ThinLTO we already collapsed out the inlined
2556 // stack ids on the allocation call during ModuleSummaryAnalysis.
2557 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2558 EmptyContext;
2559 unsigned I = 0;
2560 assert(!metadataMayIncludeContextSizeInfo() ||
2561 AN.ContextSizeInfos.size() == AN.MIBs.size());
2562 // Now add all of the MIBs and their stack nodes.
2563 for (auto &MIB : AN.MIBs) {
2564 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2565 StackContext(&MIB);
2566 std::vector<ContextTotalSize> ContextSizeInfo;
2567 if (!AN.ContextSizeInfos.empty()) {
2568 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2569 ContextSizeInfo.push_back({FullStackId, TotalSize});
2570 }
2571 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2572 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2573 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2574 I++;
2575 }
2576 // If exporting the graph to dot and an allocation id of interest was
2577 // specified, record all the context ids for this allocation node.
2578 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2579 DotAllocContextIds = AllocNode->getContextIds();
2580 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2581 // Initialize version 0 on the summary alloc node to the current alloc
2582 // type, unless it has both types in which case make it default, so
2583 // that in the case where we aren't able to clone, the original version
2584 // always ends up with the default allocation behavior.
2585 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2586 }
2587 }
2588 // For callsite metadata, add to list for this function for later use.
2589 if (!FS->callsites().empty())
2590 for (auto &SN : FS->mutableCallsites()) {
2591 IndexCall StackNodeCall(&SN);
2592 CallsWithMetadata.push_back(StackNodeCall);
2593 }
2594
2595 if (!CallsWithMetadata.empty())
2596 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2597
2598 if (!FS->allocs().empty() || !FS->callsites().empty())
2599 FSToVIMap[FS] = VI;
2600 }
2601 }
2602
2603 if (DumpCCG) {
2604 dbgs() << "CCG before updating call stack chains:\n";
2605 dbgs() << *this;
2606 }
2607
2608 if (ExportToDot)
2609 exportToDot("prestackupdate");
2610
2611 updateStackNodes();
2612
2613 if (ExportToDot)
2614 exportToDot("poststackupdate");
2615
2616 handleCallsitesWithMultipleTargets();
2617
2618 markBackedges();
2619}
2620
2621template <typename DerivedCCG, typename FuncTy, typename CallTy>
2622void CallsiteContextGraph<DerivedCCG, FuncTy,
2623 CallTy>::handleCallsitesWithMultipleTargets() {
2624 // Look for and workaround callsites that call multiple functions.
2625 // This can happen for indirect calls, which needs better handling, and in
2626 // more rare cases (e.g. macro expansion).
2627 // TODO: To fix this for indirect calls we will want to perform speculative
2628 // devirtualization using either the normal PGO info with ICP, or using the
2629 // information in the profiled MemProf contexts. We can do this prior to
2630 // this transformation for regular LTO, and for ThinLTO we can simulate that
2631 // effect in the summary and perform the actual speculative devirtualization
2632 // while cloning in the ThinLTO backend.
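// Illustrative example (hypothetical targets): if profiling observed an
// indirect callsite reaching both A::f and B::f, the ThinLTO summary contains
// a synthesized CallsiteInfo per target; partitionCallsByCallee below then
// gives each target its own context node so each can be cloned independently.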
2633
2634 // Keep track of the new nodes synthesized for discovered tail calls missing
2635 // from the profiled contexts.
2636 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2637
2638 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2639 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2640 auto *Node = Entry.second;
2641 assert(Node->Clones.empty());
2642 // Check all node callees and see if in the same function.
2643 // We need to check all of the calls recorded in this Node, because in some
2644 // cases we may have had multiple calls with the same debug info calling
2645 // different callees. This can happen, for example, when an object is
2646 // constructed in the parameter list - the destructor call of the object has
2647 // the same debug info (line/col) as the call the object was passed to.
2648 // Here we will prune any that don't match all callee nodes.
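// Illustrative example (hypothetical code): in a call such as foo(Bar()), the
// temporary's constructor, its destructor at the end of the full expression,
// and the call to foo itself can all share the same line/column, so one debug
// location maps to calls with several different callees.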
2649 std::vector<CallInfo> AllCalls;
2650 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2651 AllCalls.push_back(Node->Call);
2652 llvm::append_range(AllCalls, Node->MatchingCalls);
2653
2654 // First see if we can partition the calls by callee function, creating new
2655 // nodes to host each set of calls calling the same callees. This is
2656 // necessary to support indirect calls with ThinLTO, for which we
2657 // synthesized CallsiteInfo records for each target. They will all have the
2658 // same callsite stack ids and would be sharing a context node at this
2659 // point. We need to perform separate cloning for each, which will be
2660 // applied along with speculative devirtualization in the ThinLTO backends
2661 // as needed. Note this does not currently support looking through tail
2662 // calls; it is unclear if we need that for indirect call targets.
2663 // First partition calls by callee func. Map indexed by func, value is
2664 // struct with list of matching calls, assigned node.
2665 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2666 continue;
2667
2668 auto It = AllCalls.begin();
2669 // Iterate through the calls until we find the first that matches.
2670 for (; It != AllCalls.end(); ++It) {
2671 auto ThisCall = *It;
2672 bool Match = true;
2673 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2674 ++EI) {
2675 auto Edge = *EI;
2676 if (!Edge->Callee->hasCall())
2677 continue;
2678 assert(NodeToCallingFunc.count(Edge->Callee));
2679 // Check if the called function matches that of the callee node.
2680 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2681 Match = false;
2682 break;
2683 }
2684 }
2685 // Found a call that matches the callee nodes, we can quit now.
2686 if (Match) {
2687 // If the first match is not the primary call on the Node, update it
2688 // now. We will update the list of matching calls further below.
2689 if (Node->Call != ThisCall) {
2690 Node->setCall(ThisCall);
2691 // We need to update the NonAllocationCallToContextNodeMap, but don't
2692 // want to do this during iteration over that map, so save the calls
2693 // that need updated entries.
2694 NewCallToNode.push_back({ThisCall, Node});
2695 }
2696 break;
2697 }
2698 }
2699 // We will update this list below (or leave it cleared if there was no
2700 // match found above).
2701 Node->MatchingCalls.clear();
2702 // If we hit the end of the AllCalls vector, no call matching the callee
2703 // nodes was found, clear the call information in the node.
2704 if (It == AllCalls.end()) {
2705 RemovedEdgesWithMismatchedCallees++;
2706 // Work around by setting Node to have a null call, so it gets
2707 // skipped during cloning. Otherwise assignFunctions will assert
2708 // because its data structures are not designed to handle this case.
2709 Node->setCall(CallInfo());
2710 continue;
2711 }
2712 // Now add back any matching calls that call the same function as the
2713 // matching primary call on Node.
2714 for (++It; It != AllCalls.end(); ++It) {
2715 auto ThisCall = *It;
2716 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2717 continue;
2718 Node->MatchingCalls.push_back(ThisCall);
2719 }
2720 }
2721
2722 // Remove all mismatched nodes identified in the above loop from the node map
2723 // (checking whether they have a null call which is set above). For a
2724 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2725 // to do the removal via remove_if than by individually erasing entries above.
2726 // Also remove any entries if we updated the node's primary call above.
2727 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2728 return !it.second->hasCall() || it.second->Call != it.first;
2729 });
2730
2731 // Add entries for any new primary calls recorded above.
2732 for (auto &[Call, Node] : NewCallToNode)
2733 NonAllocationCallToContextNodeMap[Call] = Node;
2734
2735 // Add the new nodes after the above loop so that the iteration is not
2736 // invalidated.
2737 for (auto &[Call, Node] : TailCallToContextNodeMap)
2738 NonAllocationCallToContextNodeMap[Call] = Node;
2739}
2740
2741template <typename DerivedCCG, typename FuncTy, typename CallTy>
2742bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2743 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2744 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2745 // Struct to keep track of all the calls having the same callee function,
2746 // and the context node we will eventually assign to this group of calls
2747 // (recorded below once the node is created or reused).
2748 struct CallsWithSameCallee {
2749 std::vector<CallInfo> Calls;
2750 ContextNode *Node = nullptr;
2751 };
2752
2753 // First partition calls by callee function. Build map from each function
2754 // to the list of matching calls.
2755 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2756 for (auto ThisCall : AllCalls) {
2757 auto *F = getCalleeFunc(ThisCall.call());
2758 if (F)
2759 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2760 }
2761
2762 // Next, walk through all callee edges. For each callee node, get its
2763 // containing function and see if it was recorded in the above map (meaning we
2764 // have at least one matching call). Build another map from each callee node
2765 // with a matching call to the structure instance created above containing all
2766 // the calls.
2767 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2768 for (const auto &Edge : Node->CalleeEdges) {
2769 if (!Edge->Callee->hasCall())
2770 continue;
2771 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2772 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2773 CalleeNodeToCallInfo[Edge->Callee] =
2774 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2775 }
2776
2777 // If there are no entries in the second map, then there were no matching
2778 // calls/callees, nothing to do here. Return so we can go to the handling that
2779 // looks through tail calls.
2780 if (CalleeNodeToCallInfo.empty())
2781 return false;
2782
2783 // Walk through all callee edges again. Any and all callee edges that didn't
2784 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2785 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2786 // ignored during cloning. If it is in the map, then we use the node recorded
2787 // in that entry (creating it if needed), and move the callee edge to it.
2788 // The first callee will use the original node instead of creating a new one.
2789 // Note that any of the original calls on this node (in AllCalls) that didn't
2790 // have a callee function automatically get dropped from the node as part of
2791 // this process.
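// Sketch of the partitioning on a hypothetical node: if the node's calls are
// {C1, C2, C3} where C1 and C2 call F1 and C3 calls F2, and there are callee
// edges into nodes in F1 and F2, then the first matching callee edge visited
// reuses this node (say F1, keeping {C1, C2}), a new node is created for C3
// with the F2 edge, and any callee edge matching no call moves to
// UnmatchedCalleesNode.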
2792 ContextNode *UnmatchedCalleesNode = nullptr;
2793 // Track whether we already assigned original node to a callee.
2794 bool UsedOrigNode = false;
2795 assert(NodeToCallingFunc[Node]);
2796 // Iterate over a copy of Node's callee edges, since we may need to remove
2797 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2798 // makes it less error-prone.
2799 auto CalleeEdges = Node->CalleeEdges;
2800 for (auto &Edge : CalleeEdges) {
2801 if (!Edge->Callee->hasCall())
2802 continue;
2803
2804 // Will be updated below to point to whatever (caller) node this callee edge
2805 // should be moved to.
2806 ContextNode *CallerNodeToUse = nullptr;
2807
2808 // Handle the case where there were no matching calls first. Move this
2809 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2810 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2811 if (!UnmatchedCalleesNode)
2812 UnmatchedCalleesNode =
2813 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2814 CallerNodeToUse = UnmatchedCalleesNode;
2815 } else {
2816 // Look up the information recorded for this callee node, and use the
2817 // recorded caller node (creating it if needed).
2818 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2819 if (!Info->Node) {
2820 // If we haven't assigned any callees to the original node use it.
2821 if (!UsedOrigNode) {
2822 Info->Node = Node;
2823 // Clear the set of matching calls which will be updated below.
2824 Node->MatchingCalls.clear();
2825 UsedOrigNode = true;
2826 } else
2827 Info->Node =
2828 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2829 assert(!Info->Calls.empty());
2830 // The first call becomes the primary call for this caller node, and the
2831 // rest go in the matching calls list.
2832 Info->Node->setCall(Info->Calls.front());
2833 llvm::append_range(Info->Node->MatchingCalls,
2834 llvm::drop_begin(Info->Calls));
2835 // Save the primary call to node correspondence so that we can update
2836 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2837 // caller of this function.
2838 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2839 }
2840 CallerNodeToUse = Info->Node;
2841 }
2842
2843 // Don't need to move the edge if we are using the original node.
2844 if (CallerNodeToUse == Node)
2845 continue;
2846
2847 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2848 }
2849 // Now that we are done moving edges, clean up any caller edges that ended
2850 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2851 // caller edges from Node are replicated onto the new callers, and it
2852 // simplifies the handling to leave them until we have moved all
2853 // edges/context ids.
2854 for (auto &I : CalleeNodeToCallInfo)
2855 removeNoneTypeCallerEdges(I.second->Node);
2856 if (UnmatchedCalleesNode)
2857 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2858 removeNoneTypeCallerEdges(Node);
2859
2860 return true;
2861}
2862
2863uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2864 // In the Module (IR) case this is already the Id.
2865 return IdOrIndex;
2866}
2867
2868uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2869 // In the Index case this is an index into the stack id list in the summary
2870 // index, convert it to an Id.
2871 return Index.getStackIdAtIndex(IdOrIndex);
2872}
2873
2874template <typename DerivedCCG, typename FuncTy, typename CallTy>
2875bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2876 CallTy Call, EdgeIter &EI,
2877 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2878 auto Edge = *EI;
2879 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2880 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2881 // Will be populated in order of callee to caller if we find a chain of tail
2882 // calls between the profiled caller and callee.
2883 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2884 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2885 FoundCalleeChain))
2886 return false;
2887
2888 // The usual case where the profiled callee matches that of the IR/summary.
2889 if (FoundCalleeChain.empty())
2890 return true;
2891
2892 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2893 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2894 // If there is already an edge between these nodes, simply update it and
2895 // return.
2896 if (CurEdge) {
2897 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2898 CurEdge->AllocTypes |= Edge->AllocTypes;
2899 return;
2900 }
2901 // Otherwise, create a new edge and insert it into the caller and callee
2902 // lists.
2903 auto NewEdge = std::make_shared<ContextEdge>(
2904 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2905 Callee->CallerEdges.push_back(NewEdge);
2906 if (Caller == Edge->Caller) {
2907 // If we are inserting the new edge into the current edge's caller, insert
2908 // the new edge before the current iterator position, and then increment
2909 // back to the current edge.
2910 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2911 ++EI;
2912 assert(*EI == Edge &&
2913 "Iterator position not restored after insert and increment");
2914 } else
2915 Caller->CalleeEdges.push_back(NewEdge);
2916 };
2917
2918 // Create new nodes for each found callee and connect in between the profiled
2919 // caller and callee.
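// Illustrative example (hypothetical functions): if the profile recorded
// Caller -> ProfiledCallee but the IR shows Caller calling T1, which
// tail-calls T2, which tail-calls ProfiledCallee, then FoundCalleeChain holds
// the discovered callsites in callee-to-caller order and the loop below
// synthesizes nodes so the graph reflects Caller -> T1 -> T2 -> ProfiledCallee.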
2920 auto *CurCalleeNode = Edge->Callee;
2921 for (auto &[NewCall, Func] : FoundCalleeChain) {
2922 ContextNode *NewNode = nullptr;
2923 // First check if we have already synthesized a node for this tail call.
2924 if (TailCallToContextNodeMap.count(NewCall)) {
2925 NewNode = TailCallToContextNodeMap[NewCall];
2926 NewNode->AllocTypes |= Edge->AllocTypes;
2927 } else {
2928 FuncToCallsWithMetadata[Func].push_back({NewCall});
2929 // Create Node and record node info.
2930 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2931 TailCallToContextNodeMap[NewCall] = NewNode;
2932 NewNode->AllocTypes = Edge->AllocTypes;
2933 }
2934
2935 // Hook up node to its callee node
2936 AddEdge(NewNode, CurCalleeNode);
2937
2938 CurCalleeNode = NewNode;
2939 }
2940
2941 // Hook up edge's original caller to new callee node.
2942 AddEdge(Edge->Caller, CurCalleeNode);
2943
2944#ifndef NDEBUG
2945 // Save this because Edge's fields get cleared below when removed.
2946 auto *Caller = Edge->Caller;
2947#endif
2948
2949 // Remove old edge
2950 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2951
2952 // To simplify the increment of EI in the caller, subtract one from EI.
2953 // In the final AddEdge call we would have either added a new callee edge,
2954 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2955 // that there is at least one callee edge.
2956 assert(!Caller->CalleeEdges.empty());
2957 --EI;
2958
2959 return true;
2960}
2961
2962bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2963 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2964 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2965 bool &FoundMultipleCalleeChains) {
2966 // Stop recursive search if we have already explored the maximum specified
2967 // depth.
2968 if (Depth > TailCallSearchDepth)
2969 return false;
2970
2971 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2972 FoundCalleeChain.push_back({Callsite, F});
2973 };
2974
2975 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2976 if (!CalleeFunc) {
2977 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2978 assert(Alias);
2979 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2980 assert(CalleeFunc);
2981 }
2982
2983 // Look for tail calls in this function, and check if they either call the
2984 // profiled callee directly, or indirectly (via a recursive search).
2985 // Only succeed if there is a single unique tail call chain found between the
2986 // profiled caller and callee, otherwise we could perform incorrect cloning.
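// For example (illustrative): if two different tail calls reachable from this
// function both lead to the profiled callee, we cannot tell which chain the
// profiled context actually took, so FoundMultipleCalleeChains is set and the
// search fails rather than risk cloning along the wrong path.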
2987 bool FoundSingleCalleeChain = false;
2988 for (auto &BB : *CalleeFunc) {
2989 for (auto &I : BB) {
2990 auto *CB = dyn_cast<CallBase>(&I);
2991 if (!CB || !CB->isTailCall())
2992 continue;
2993 auto *CalledValue = CB->getCalledOperand();
2994 auto *CalledFunction = CB->getCalledFunction();
2995 if (CalledValue && !CalledFunction) {
2996 CalledValue = CalledValue->stripPointerCasts();
2997 // Stripping pointer casts can reveal a called function.
2998 CalledFunction = dyn_cast<Function>(CalledValue);
2999 }
3000 // Check if this is an alias to a function. If so, get the
3001 // called aliasee for the checks below.
3002 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3003 assert(!CalledFunction &&
3004 "Expected null called function in callsite for alias");
3005 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3006 }
3007 if (!CalledFunction)
3008 continue;
3009 if (CalledFunction == ProfiledCallee) {
3010 if (FoundSingleCalleeChain) {
3011 FoundMultipleCalleeChains = true;
3012 return false;
3013 }
3014 FoundSingleCalleeChain = true;
3015 FoundProfiledCalleeCount++;
3016 FoundProfiledCalleeDepth += Depth;
3017 if (Depth > FoundProfiledCalleeMaxDepth)
3018 FoundProfiledCalleeMaxDepth = Depth;
3019 SaveCallsiteInfo(&I, CalleeFunc);
3020 } else if (findProfiledCalleeThroughTailCalls(
3021 ProfiledCallee, CalledFunction, Depth + 1,
3022 FoundCalleeChain, FoundMultipleCalleeChains)) {
3023 // findProfiledCalleeThroughTailCalls should not have returned
3024 // true if FoundMultipleCalleeChains.
3025 assert(!FoundMultipleCalleeChains);
3026 if (FoundSingleCalleeChain) {
3027 FoundMultipleCalleeChains = true;
3028 return false;
3029 }
3030 FoundSingleCalleeChain = true;
3031 SaveCallsiteInfo(&I, CalleeFunc);
3032 } else if (FoundMultipleCalleeChains)
3033 return false;
3034 }
3035 }
3036
3037 return FoundSingleCalleeChain;
3038}
3039
3040const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
3041 auto *CB = dyn_cast<CallBase>(Call);
3042 if (!CB->getCalledOperand() || CB->isIndirectCall())
3043 return nullptr;
3044 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3045 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3046 if (Alias)
3047 return dyn_cast<Function>(Alias->getAliasee());
3048 return dyn_cast<Function>(CalleeVal);
3049}
3050
3051bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3052 Instruction *Call, const Function *Func, const Function *CallerFunc,
3053 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3054 auto *CB = dyn_cast<CallBase>(Call);
3055 if (!CB->getCalledOperand() || CB->isIndirectCall())
3056 return false;
3057 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3058 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
3059 if (CalleeFunc == Func)
3060 return true;
3061 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3062 if (Alias && Alias->getAliasee() == Func)
3063 return true;
3064
3065 // Recursively search for the profiled callee through tail calls starting with
3066 // the actual Callee. The discovered tail call chain is saved in
3067 // FoundCalleeChain, and we will fixup the graph to include these callsites
3068 // after returning.
3069 // FIXME: We will currently redo the same recursive walk if we find the same
3070 // mismatched callee from another callsite. We can improve this with more
3071 // bookkeeping of the created chain of new nodes for each mismatch.
3072 unsigned Depth = 1;
3073 bool FoundMultipleCalleeChains = false;
3074 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
3075 FoundCalleeChain,
3076 FoundMultipleCalleeChains)) {
3077 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3078 << Func->getName() << " from " << CallerFunc->getName()
3079 << " that actually called " << CalleeVal->getName()
3080 << (FoundMultipleCalleeChains
3081 ? " (found multiple possible chains)"
3082 : "")
3083 << "\n");
3084 if (FoundMultipleCalleeChains)
3085 FoundProfiledCalleeNonUniquelyCount++;
3086 return false;
3087 }
3088
3089 return true;
3090}
3091
3092bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3093 Instruction *Call2) {
3094 auto *CB1 = cast<CallBase>(Call1);
3095 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3096 return false;
3097 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3098 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
3099 auto *CB2 = cast<CallBase>(Call2);
3100 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3101 return false;
3102 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3103 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
3104 return CalleeFunc1 == CalleeFunc2;
3105}
3106
3107bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
3108 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
3109 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
3110 bool &FoundMultipleCalleeChains) {
3111 // Stop recursive search if we have already explored the maximum specified
3112 // depth.
3113 if (Depth > TailCallSearchDepth)
3114 return false;
3115
3116 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
3117 // Make a CallsiteInfo for each discovered callee, if one hasn't already
3118 // been synthesized.
3119 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
3120 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
3121 // StackIds is empty (we don't have debug info available in the index for
3122 // these callsites)
3123 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
3124 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
3125 CallsiteInfo *NewCallsiteInfo =
3126 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
3127 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
3128 };
3129
3130 // Look for tail calls in this function, and check if they either call the
3131 // profiled callee directly, or indirectly (via a recursive search).
3132 // Only succeed if there is a single unique tail call chain found between the
3133 // profiled caller and callee, otherwise we could perform incorrect cloning.
3134 bool FoundSingleCalleeChain = false;
3135 for (auto &S : CurCallee.getSummaryList()) {
3136 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
3137 !isPrevailing(CurCallee.getGUID(), S.get()))
3138 continue;
3139 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
3140 if (!FS)
3141 continue;
3142 auto FSVI = CurCallee;
3143 auto *AS = dyn_cast<AliasSummary>(S.get());
3144 if (AS)
3145 FSVI = AS->getAliaseeVI();
3146 for (auto &CallEdge : FS->calls()) {
3147 if (!CallEdge.second.hasTailCall())
3148 continue;
3149 if (CallEdge.first == ProfiledCallee) {
3150 if (FoundSingleCalleeChain) {
3151 FoundMultipleCalleeChains = true;
3152 return false;
3153 }
3154 FoundSingleCalleeChain = true;
3155 FoundProfiledCalleeCount++;
3156 FoundProfiledCalleeDepth += Depth;
3157 if (Depth > FoundProfiledCalleeMaxDepth)
3158 FoundProfiledCalleeMaxDepth = Depth;
3159 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3160 // Add FS to FSToVIMap in case it isn't already there.
3161 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3162 FSToVIMap[FS] = FSVI;
3163 } else if (findProfiledCalleeThroughTailCalls(
3164 ProfiledCallee, CallEdge.first, Depth + 1,
3165 FoundCalleeChain, FoundMultipleCalleeChains)) {
3166 // findProfiledCalleeThroughTailCalls should not have returned
3167 // true if FoundMultipleCalleeChains.
3168 assert(!FoundMultipleCalleeChains);
3169 if (FoundSingleCalleeChain) {
3170 FoundMultipleCalleeChains = true;
3171 return false;
3172 }
3173 FoundSingleCalleeChain = true;
3174 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3175 // Add FS to FSToVIMap in case it isn't already there.
3176 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3177 FSToVIMap[FS] = FSVI;
3178 } else if (FoundMultipleCalleeChains)
3179 return false;
3180 }
3181 }
3182
3183 return FoundSingleCalleeChain;
3184}
3185
3186const FunctionSummary *
3187IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3188 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3189 if (Callee.getSummaryList().empty())
3190 return nullptr;
3191 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
3192}
3193
3194bool IndexCallsiteContextGraph::calleeMatchesFunc(
3195 IndexCall &Call, const FunctionSummary *Func,
3196 const FunctionSummary *CallerFunc,
3197 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
3198 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3199 // If there is no summary list then this is a call to an externally defined
3200 // symbol.
3201 AliasSummary *Alias =
3202 Callee.getSummaryList().empty()
3203 ? nullptr
3204 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
3205 assert(FSToVIMap.count(Func));
3206 auto FuncVI = FSToVIMap[Func];
3207 if (Callee == FuncVI ||
3208 // If callee is an alias, check the aliasee, since only function
3209 // summary base objects will contain the stack node summaries and thus
3210 // get a context node.
3211 (Alias && Alias->getAliaseeVI() == FuncVI))
3212 return true;
3213
3214 // Recursively search for the profiled callee through tail calls starting with
3215 // the actual Callee. The discovered tail call chain is saved in
3216 // FoundCalleeChain, and we will fixup the graph to include these callsites
3217 // after returning.
3218 // FIXME: We will currently redo the same recursive walk if we find the same
3219 // mismatched callee from another callsite. We can improve this with more
3220 // bookkeeping of the created chain of new nodes for each mismatch.
3221 unsigned Depth = 1;
3222 bool FoundMultipleCalleeChains = false;
3223 if (!findProfiledCalleeThroughTailCalls(
3224 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
3225 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
3226 << " from " << FSToVIMap[CallerFunc]
3227 << " that actually called " << Callee
3228 << (FoundMultipleCalleeChains
3229 ? " (found multiple possible chains)"
3230 : "")
3231 << "\n");
3232 if (FoundMultipleCalleeChains)
3233 FoundProfiledCalleeNonUniquelyCount++;
3234 return false;
3235 }
3236
3237 return true;
3238}
3239
3240bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3241 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
3242 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
3243 return Callee1 == Callee2;
3244}
3245
3246template <typename DerivedCCG, typename FuncTy, typename CallTy>
3247void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3248 const {
3249 print(dbgs());
3250 dbgs() << "\n";
3251}
3252
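// For reference, the output produced by print() below has roughly the
// following shape (pointer values, ids and types are illustrative only):
//   Node 0x...
//        <call, or "null call">
//        NodeId: 4
//        AllocTypes: Cold
//        ContextIds: 1 3
//        CalleeEdges:
//                Edge from Callee 0x... to Caller: 0x... AllocTypes: Cold
//                ContextIds: 1 3 (Callee NodeId: 2)
//        CallerEdges:
//                ...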
3253template <typename DerivedCCG, typename FuncTy, typename CallTy>
3254void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
3255 raw_ostream &OS) const {
3256 OS << "Node " << this << "\n";
3257 OS << "\t";
3258 printCall(OS);
3259 if (Recursive)
3260 OS << " (recursive)";
3261 OS << "\n";
3262 if (!MatchingCalls.empty()) {
3263 OS << "\tMatchingCalls:\n";
3264 for (auto &MatchingCall : MatchingCalls) {
3265 OS << "\t";
3266 MatchingCall.print(OS);
3267 OS << "\n";
3268 }
3269 }
3270 OS << "\tNodeId: " << NodeId << "\n";
3271 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
3272 OS << "\tContextIds:";
3273 // Make a copy of the computed context ids that we can sort for stability.
3274 auto ContextIds = getContextIds();
3275 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3276 std::sort(SortedIds.begin(), SortedIds.end());
3277 for (auto Id : SortedIds)
3278 OS << " " << Id;
3279 OS << "\n";
3280 OS << "\tCalleeEdges:\n";
3281 for (auto &Edge : CalleeEdges)
3282 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3283 << ")\n";
3284 OS << "\tCallerEdges:\n";
3285 for (auto &Edge : CallerEdges)
3286 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3287 << ")\n";
3288 if (!Clones.empty()) {
3289 OS << "\tClones: ";
3290 ListSeparator LS;
3291 for (auto *C : Clones)
3292 OS << LS << C << " NodeId: " << C->NodeId;
3293 OS << "\n";
3294 } else if (CloneOf) {
3295 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3296 }
3297}
3298
3299template <typename DerivedCCG, typename FuncTy, typename CallTy>
3300void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3301 const {
3302 print(dbgs());
3303 dbgs() << "\n";
3304}
3305
3306template <typename DerivedCCG, typename FuncTy, typename CallTy>
3307void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3308 raw_ostream &OS) const {
3309 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3310 << (IsBackedge ? " (BE)" : "")
3311 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3312 OS << " ContextIds:";
3313 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3314 std::sort(SortedIds.begin(), SortedIds.end());
3315 for (auto Id : SortedIds)
3316 OS << " " << Id;
3317}
3318
3319template <typename DerivedCCG, typename FuncTy, typename CallTy>
3320void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3321 print(dbgs());
3322}
3323
3324template <typename DerivedCCG, typename FuncTy, typename CallTy>
3325void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3326 raw_ostream &OS) const {
3327 OS << "Callsite Context Graph:\n";
3328 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3329 for (const auto Node : nodes<GraphType>(this)) {
3330 if (Node->isRemoved())
3331 continue;
3332 Node->print(OS);
3333 OS << "\n";
3334 }
3335}
3336
3337template <typename DerivedCCG, typename FuncTy, typename CallTy>
3338void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3339 raw_ostream &OS) const {
3340 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3341 for (const auto Node : nodes<GraphType>(this)) {
3342 if (Node->isRemoved())
3343 continue;
3344 if (!Node->IsAllocation)
3345 continue;
3346 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3347 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3348 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3349 std::sort(SortedIds.begin(), SortedIds.end());
3350 for (auto Id : SortedIds) {
3351 auto TypeI = ContextIdToAllocationType.find(Id);
3352 assert(TypeI != ContextIdToAllocationType.end());
3353 auto CSI = ContextIdToContextSizeInfos.find(Id);
3354 if (CSI != ContextIdToContextSizeInfos.end()) {
3355 for (auto &Info : CSI->second) {
3356 OS << "MemProf hinting: "
3357 << getAllocTypeString((uint8_t)TypeI->second)
3358 << " full allocation context " << Info.FullStackId
3359 << " with total size " << Info.TotalSize << " is "
3360 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3361 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3362 OS << " marked " << getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall)
3363 << " due to cold byte percent";
3364 // Print the internal context id to aid debugging and visualization.
3365 OS << " (context id " << Id << ")";
3366 OS << "\n";
3367 }
3368 }
3369 }
3370 }
3371}
3372
3373template <typename DerivedCCG, typename FuncTy, typename CallTy>
3374void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3375 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3376 for (const auto Node : nodes<GraphType>(this)) {
3377 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3378 for (auto &Edge : Node->CallerEdges)
3379 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3380 }
3381}
3382
3383template <typename DerivedCCG, typename FuncTy, typename CallTy>
3384struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3385 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3386 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3387
3388 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3389 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3390
3391 using nodes_iterator =
3392 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3393 decltype(&getNode)>;
3394
3395 static nodes_iterator nodes_begin(GraphType G) {
3396 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3397 }
3398
3399 static nodes_iterator nodes_end(GraphType G) {
3400 return nodes_iterator(G->NodeOwner.end(), &getNode);
3401 }
3402
3403 static NodeRef getEntryNode(GraphType G) {
3404 return G->NodeOwner.begin()->get();
3405 }
3406
3407 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3408 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3409 GetCallee(const EdgePtrTy &P) {
3410 return P->Callee;
3411 }
3412
3413 using ChildIteratorType =
3414 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3415 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3416 decltype(&GetCallee)>;
3417
3418 static ChildIteratorType child_begin(NodeRef N) {
3419 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3420 }
3421
3422 static ChildIteratorType child_end(NodeRef N) {
3423 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3424 }
3425};
3426
3427template <typename DerivedCCG, typename FuncTy, typename CallTy>
3428struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3429 : public DefaultDOTGraphTraits {
3430 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3431 // If the user requested the full graph to be exported, but provided an
3432 // allocation id, or if the user gave a context id and requested more than
3433 // just a specific context to be exported, note that highlighting is
3434 // enabled.
3435 DoHighlight =
3436 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3437 (ContextIdForDot.getNumOccurrences() &&
3438 DotGraphScope != DotScope::Context);
3439 }
3440
3441 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3442 using GTraits = GraphTraits<GraphType>;
3443 using NodeRef = typename GTraits::NodeRef;
3444 using ChildIteratorType = typename GTraits::ChildIteratorType;
3445
3446 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3447 std::string LabelString =
3448 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3449 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3450 .str();
3451 LabelString += "\n";
3452 if (Node->hasCall()) {
3453 auto Func = G->NodeToCallingFunc.find(Node);
3454 assert(Func != G->NodeToCallingFunc.end());
3455 LabelString +=
3456 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3457 for (auto &MatchingCall : Node->MatchingCalls) {
3458 LabelString += "\n";
3459 LabelString += G->getLabel(Func->second, MatchingCall.call(),
3460 MatchingCall.cloneNo());
3461 }
3462 } else {
3463 LabelString += "null call";
3464 if (Node->Recursive)
3465 LabelString += " (recursive)";
3466 else
3467 LabelString += " (external)";
3468 }
3469 return LabelString;
3470 }
3471
3472 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3473 auto ContextIds = Node->getContextIds();
3474 // If highlighting enabled, see if this node contains any of the context ids
3475 // of interest. If so, it will use a different color and a larger fontsize
3476 // (which makes the node larger as well).
3477 bool Highlight = false;
3478 if (DoHighlight) {
3479 assert(ContextIdForDot.getNumOccurrences() ||
3480 AllocIdForDot.getNumOccurrences());
3481 if (ContextIdForDot.getNumOccurrences())
3482 Highlight = ContextIds.contains(ContextIdForDot);
3483 else
3484 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3485 }
3486 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3487 getContextIds(ContextIds) + "\"")
3488 .str();
3489 // Default fontsize is 14
3490 if (Highlight)
3491 AttributeString += ",fontsize=\"30\"";
3492 AttributeString +=
3493 (Twine(",fillcolor=\"") + getColor(AllocTypes: Node->AllocTypes, Highlight) + "\"")
3494 .str();
3495 if (Node->CloneOf) {
3496 AttributeString += ",color=\"blue\"";
3497 AttributeString += ",style=\"filled,bold,dashed\"";
3498 } else
3499 AttributeString += ",style=\"filled\"";
3500 return AttributeString;
3501 }
3502
3503 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3504 GraphType G) {
3505 auto &Edge = *(ChildIter.getCurrent());
3506 // If highlighting enabled, see if this edge contains any of the context ids
3507 // of interest. If so, it will use a different color and a heavier arrow
3508 // size and weight (the larger weight makes the highlighted path
3509 // straighter).
3510 bool Highlight = false;
3511 if (DoHighlight) {
3512 assert(ContextIdForDot.getNumOccurrences() ||
3513 AllocIdForDot.getNumOccurrences());
3514 if (ContextIdForDot.getNumOccurrences())
3515 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3516 else
3517 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3518 }
3519 auto Color = getColor(Edge->AllocTypes, Highlight);
3520 std::string AttributeString =
3521 (Twine("tooltip=\"") + getContextIds(ContextIds: Edge->ContextIds) + "\"" +
3522 // fillcolor is the arrow head and color is the line
3523 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3524 "\"")
3525 .str();
3526 if (Edge->IsBackedge)
3527 AttributeString += ",style=\"dotted\"";
3528 // Default penwidth and weight are both 1.
3529 if (Highlight)
3530 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3531 return AttributeString;
3532 }
3533
3534 // Since the NodeOwners list includes nodes that are no longer connected to
3535 // the graph, skip them here.
3536 static bool isNodeHidden(NodeRef Node, GraphType G) {
3537 if (Node->isRemoved())
3538 return true;
3539 // If a scope smaller than the full graph was requested, see if this node
3540 // contains any of the context ids of interest.
3541 if (DotGraphScope == DotScope::Alloc)
3542 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3543 if (DotGraphScope == DotScope::Context)
3544 return !Node->getContextIds().contains(ContextIdForDot);
3545 return false;
3546 }
3547
3548private:
3549 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3550 std::string IdString = "ContextIds:";
3551 if (ContextIds.size() < 100) {
3552 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3553 std::sort(SortedIds.begin(), SortedIds.end());
3554 for (auto Id : SortedIds)
3555 IdString += (" " + Twine(Id)).str();
3556 } else {
3557 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3558 }
3559 return IdString;
3560 }
3561
3562 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3563 // If DoHighlight is not enabled, we want to use the highlight colors for
3564 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3565 // both compatible with the color scheme before highlighting was supported,
3566 // and for the NotCold+Cold color the non-highlight color is a bit more
3567 // readable.
3568 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3569 // Color "brown1" actually looks like a lighter red.
3570 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3571 if (AllocTypes == (uint8_t)AllocationType::Cold)
3572 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3573 if (AllocTypes ==
3574 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3575 return Highlight ? "magenta" : "mediumorchid1";
3576 return "gray";
3577 }
3578
3579 static std::string getNodeId(NodeRef Node) {
3580 std::stringstream SStream;
3581 SStream << std::hex << "N0x" << (unsigned long long)Node;
3582 std::string Result = SStream.str();
3583 return Result;
3584 }
3585
3586 // True if we should highlight a specific context or allocation's contexts in
3587 // the emitted graph.
3588 static bool DoHighlight;
3589};
3590
3591template <typename DerivedCCG, typename FuncTy, typename CallTy>
3592bool DOTGraphTraits<
3593 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3594 false;
3595
3596template <typename DerivedCCG, typename FuncTy, typename CallTy>
3597void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3598 std::string Label) const {
3599 WriteGraph(this, "", false, Label,
3600 DotFilePathPrefix + "ccg." + Label + ".dot");
3601}
3602
3603template <typename DerivedCCG, typename FuncTy, typename CallTy>
3604typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3605CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3606 const std::shared_ptr<ContextEdge> &Edge,
3607 DenseSet<uint32_t> ContextIdsToMove) {
3608 ContextNode *Node = Edge->Callee;
3609 assert(NodeToCallingFunc.count(Node));
3610 ContextNode *Clone =
3611 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3612 Node->addClone(Clone);
3613 Clone->MatchingCalls = Node->MatchingCalls;
3614 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3615 ContextIdsToMove);
3616 return Clone;
3617}
3618
3619template <typename DerivedCCG, typename FuncTy, typename CallTy>
3620void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3621 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3622 ContextNode *NewCallee, bool NewClone,
3623 DenseSet<uint32_t> ContextIdsToMove) {
3624 // NewCallee and Edge's current callee must be clones of the same original
3625 // node (Edge's current callee may be the original node too).
3626 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3627
3628 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3629
3630 ContextNode *OldCallee = Edge->Callee;
3631
3632 // We might already have an edge to the new callee from earlier cloning for a
3633 // different allocation. If one exists we will reuse it.
3634 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3635
3636 // Callers will pass an empty ContextIdsToMove set when they want to move the
3637 // edge. Copy in Edge's ids for simplicity.
3638 if (ContextIdsToMove.empty())
3639 ContextIdsToMove = Edge->getContextIds();
3640
3641 // If we are moving all of Edge's ids, then just move the whole Edge.
3642 // Otherwise only move the specified subset, to a new edge if needed.
3643 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3644 // First, update the alloc types on New Callee from Edge.
3645 // Do this before we potentially clear Edge's fields below!
3646 NewCallee->AllocTypes |= Edge->AllocTypes;
3647 // Moving the whole Edge.
3648 if (ExistingEdgeToNewCallee) {
3649 // Since we already have an edge to NewCallee, simply move the ids
3650 // onto it, and remove the existing Edge.
3651 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3652 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3653 assert(Edge->ContextIds == ContextIdsToMove);
3654 removeEdgeFromGraph(Edge.get());
3655 } else {
3656 // Otherwise just reconnect Edge to NewCallee.
3657 Edge->Callee = NewCallee;
3658 NewCallee->CallerEdges.push_back(Edge);
3659 // Remove it from callee where it was previously connected.
3660 OldCallee->eraseCallerEdge(Edge.get());
3661 // Don't need to update Edge's context ids since we are simply
3662 // reconnecting it.
3663 }
3664 } else {
3665 // Only moving a subset of Edge's ids.
3666 // Compute the alloc type of the subset of ids being moved.
3667 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3668 if (ExistingEdgeToNewCallee) {
3669 // Since we already have an edge to NewCallee, simply move the ids
3670 // onto it.
3671 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3672 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3673 } else {
3674 // Otherwise, create a new edge to NewCallee for the ids being moved.
3675 auto NewEdge = std::make_shared<ContextEdge>(
3676 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3677 Edge->Caller->CalleeEdges.push_back(NewEdge);
3678 NewCallee->CallerEdges.push_back(NewEdge);
3679 }
3680 // In either case, need to update the alloc types on NewCallee, and remove
3681 // those ids and update the alloc type on the original Edge.
3682 NewCallee->AllocTypes |= CallerEdgeAllocType;
3683 set_subtract(Edge->ContextIds, ContextIdsToMove);
3684 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3685 }
3686 // Now walk the old callee node's callee edges and move Edge's context ids
3687 // over to the corresponding edge into the clone (which is created here if
3688 // this is a newly created clone).
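// Illustrative example: suppose the caller edge being moved carried ids {1, 2}
// to clone B' of callee B, and B has callee edges B->C with ids {1, 2, 3} and
// B->D with ids {2, 4}. Then {1, 2} move onto a B'->C edge and {2} onto a
// B'->D edge (created here for a new clone), leaving {3} on B->C and {4} on
// B->D.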
3689 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3690 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3691 // If this is a direct recursion edge, use NewCallee (the clone) as the
3692 // callee as well, so that any edge updated/created here is also direct
3693 // recursive.
3694 if (CalleeToUse == OldCallee) {
3695 // If this is a recursive edge, see if we already moved a recursive edge
3696 // (which would have to have been this one) - if we were only moving a
3697 // subset of context ids it would still be on OldCallee.
3698 if (EdgeIsRecursive) {
3699 assert(OldCalleeEdge == Edge);
3700 continue;
3701 }
3702 CalleeToUse = NewCallee;
3703 }
3704 // The context ids moving to the new callee are the subset of this edge's
3705 // context ids and the context ids on the caller edge being moved.
3706 DenseSet<uint32_t> EdgeContextIdsToMove =
3707 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3708 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3709 OldCalleeEdge->AllocTypes =
3710 computeAllocType(OldCalleeEdge->getContextIds());
3711 if (!NewClone) {
3712 // Update context ids / alloc type on corresponding edge to NewCallee.
3713 // There is a chance this may not exist if we are reusing an existing
3714 // clone, specifically during function assignment, where we would have
3715 // removed none type edges after creating the clone. If we can't find
3716 // a corresponding edge there, fall through to the cloning below.
3717 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3718 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3719 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3720 continue;
3721 }
3722 }
3723 auto NewEdge = std::make_shared<ContextEdge>(
3724 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3725 EdgeContextIdsToMove);
3726 NewCallee->CalleeEdges.push_back(NewEdge);
3727 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3728 }
3729 // Recompute the node alloc type now that its callee edges have been
3730 // updated (since we will compute from those edges).
3731 OldCallee->AllocTypes = OldCallee->computeAllocType();
3732 // OldCallee alloc type should be None iff its context id set is now empty.
3733 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3734 OldCallee->emptyContextIds());
3735 if (VerifyCCG) {
3736 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3737 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3738 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3739 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3740 /*CheckEdges=*/false);
3741 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3742 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3743 /*CheckEdges=*/false);
3744 }
3745}
3746
3747template <typename DerivedCCG, typename FuncTy, typename CallTy>
3748void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3749 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3750 ContextNode *NewCaller) {
3751 auto *OldCallee = Edge->Callee;
3752 auto *NewCallee = OldCallee;
3753 // If this edge was direct recursive, make any new/updated edge also direct
3754 // recursive to NewCaller.
3755 bool Recursive = Edge->Caller == Edge->Callee;
3756 if (Recursive)
3757 NewCallee = NewCaller;
3758
3759 ContextNode *OldCaller = Edge->Caller;
3760 OldCaller->eraseCalleeEdge(Edge.get());
3761
3762 // We might already have an edge to the new caller. If one exists we will
3763 // reuse it.
3764 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3765
3766 if (ExistingEdgeToNewCaller) {
3767 // Since we already have an edge to NewCaller, simply move the ids
3768 // onto it, and remove the existing Edge.
3769 ExistingEdgeToNewCaller->getContextIds().insert_range(
3770 Edge->getContextIds());
3771 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3772 Edge->ContextIds.clear();
3773 Edge->AllocTypes = (uint8_t)AllocationType::None;
3774 OldCallee->eraseCallerEdge(Edge.get());
3775 } else {
3776 // Otherwise just reconnect Edge to NewCaller.
3777 Edge->Caller = NewCaller;
3778 NewCaller->CalleeEdges.push_back(Edge);
3779 if (Recursive) {
3780 assert(NewCallee == NewCaller);
3781 // In the case of (direct) recursive edges, we update the callee as well
3782 // so that it becomes recursive on the new caller.
3783 Edge->Callee = NewCallee;
3784 NewCallee->CallerEdges.push_back(Edge);
3785 OldCallee->eraseCallerEdge(Edge.get());
3786 }
3787 // Don't need to update Edge's context ids since we are simply
3788 // reconnecting it.
3789 }
3790 // In either case, need to update the alloc types on New Caller.
3791 NewCaller->AllocTypes |= Edge->AllocTypes;
3792
3793 // Now walk the old caller node's caller edges and move Edge's context ids
3794 // over to the corresponding edge into the node (which is created here if
3795 // this is a newly created node). We can tell whether this is a newly created
3796 // node by seeing if it has any caller edges yet.
3797#ifndef NDEBUG
3798 bool IsNewNode = NewCaller->CallerEdges.empty();
3799#endif
3800 // If we just moved a direct recursive edge, presumably its context ids should
3801 // also flow out of OldCaller via some other non-recursive callee edge. We
3802 // don't want to remove the recursive context ids from other caller edges yet,
3803 // otherwise the context ids get into an inconsistent state on OldCaller.
3804 // We will update these context ids on the non-recursive caller edge when and
3805 // if they are updated on the non-recursive callee.
3806 if (!Recursive) {
3807 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3808 auto OldCallerCaller = OldCallerEdge->Caller;
3809 // The context ids moving to the new caller are the subset of this edge's
3810 // context ids and the context ids on the callee edge being moved.
3811 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3812 OldCallerEdge->getContextIds(), Edge->getContextIds());
3813 if (OldCaller == OldCallerCaller) {
3814 OldCallerCaller = NewCaller;
3815 // Don't actually move this one. The caller will move it directly via a
3816 // call to this function with this as the Edge if it is appropriate to
3817 // move to a diff node that has a matching callee (itself).
3818 continue;
3819 }
3820 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3821 OldCallerEdge->AllocTypes =
3822 computeAllocType(OldCallerEdge->getContextIds());
3823 // In this function we expect that any pre-existing node already has edges
3824 // from the same callers as the old node. That should be true in the
3825 // current use case, where we will remove None-type edges after copying
3826 // over all caller edges from the callee.
3827 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3828 // Since we would have skipped caller edges when moving a direct recursive
3829 // edge, this may not hold true when recursive handling enabled.
3830 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3831 if (ExistingCallerEdge) {
3832 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3833 ExistingCallerEdge->AllocTypes |=
3834 computeAllocType(EdgeContextIdsToMove);
3835 continue;
3836 }
3837 auto NewEdge = std::make_shared<ContextEdge>(
3838 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3839 EdgeContextIdsToMove);
3840 NewCaller->CallerEdges.push_back(NewEdge);
3841 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3842 }
3843 }
3844 // Recompute the node alloc type now that its caller edges have been
3845 // updated (since we will compute from those edges).
3846 OldCaller->AllocTypes = OldCaller->computeAllocType();
3847 // OldCaller alloc type should be None iff its context id set is now empty.
3848 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3849 OldCaller->emptyContextIds());
3850 if (VerifyCCG) {
3851 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3852 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3853 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3854 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3855 /*CheckEdges=*/false);
3856 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3857 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3858 /*CheckEdges=*/false);
3859 }
3860}
3861
3862template <typename DerivedCCG, typename FuncTy, typename CallTy>
3863void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3864 recursivelyRemoveNoneTypeCalleeEdges(
3865 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3866 auto Inserted = Visited.insert(Node);
3867 if (!Inserted.second)
3868 return;
3869
3870 removeNoneTypeCalleeEdges(Node);
3871
3872 for (auto *Clone : Node->Clones)
3873 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3874
3875 // The recursive call may remove some of this Node's caller edges.
3876 // Iterate over a copy and skip any that were removed.
3877 auto CallerEdges = Node->CallerEdges;
3878 for (auto &Edge : CallerEdges) {
3879 // Skip any that have been removed by an earlier recursive call.
3880 if (Edge->isRemoved()) {
3881 assert(!is_contained(Node->CallerEdges, Edge));
3882 continue;
3883 }
3884 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3885 }
3886}
3887
3888// This is the standard DFS based backedge discovery algorithm.
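// For example (illustrative): with caller/callee chains main -> foo -> bar ->
// foo, the DFS from the root reaches foo a second time while it is still on
// the current stack, so the bar -> foo callee edge is marked IsBackedge; the
// recursive-context cloning support consults this flag later.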
3889template <typename DerivedCCG, typename FuncTy, typename CallTy>
3890void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3891 // If we are cloning recursive contexts, find and mark backedges from all root
3892 // callers, using the typical DFS based backedge analysis.
3893 if (!CloneRecursiveContexts)
3894 return;
3895 DenseSet<const ContextNode *> Visited;
3896 DenseSet<const ContextNode *> CurrentStack;
3897 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3898 auto *Node = Entry.second;
3899 if (Node->isRemoved())
3900 continue;
3901 // It is a root if it doesn't have callers.
3902 if (!Node->CallerEdges.empty())
3903 continue;
3904 markBackedges(Node, Visited, CurrentStack);
3905 assert(CurrentStack.empty());
3906 }
3907}
3908
3909// Recursive helper for above markBackedges method.
3910template <typename DerivedCCG, typename FuncTy, typename CallTy>
3911void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3912 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3913 DenseSet<const ContextNode *> &CurrentStack) {
3914 auto I = Visited.insert(Node);
3915 // We should only call this for unvisited nodes.
3916 assert(I.second);
3917 (void)I;
3918 for (auto &CalleeEdge : Node->CalleeEdges) {
3919 auto *Callee = CalleeEdge->Callee;
3920 if (Visited.count(Callee)) {
3921 // Since this was already visited we need to check if it is currently on
3922 // the recursive stack in which case it is a backedge.
3923 if (CurrentStack.count(Callee))
3924 CalleeEdge->IsBackedge = true;
3925 continue;
3926 }
3927 CurrentStack.insert(Callee);
3928 markBackedges(Callee, Visited, CurrentStack);
3929 CurrentStack.erase(Callee);
3930 }
3931}
3932
3933template <typename DerivedCCG, typename FuncTy, typename CallTy>
3934void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3935 DenseSet<const ContextNode *> Visited;
3936 for (auto &Entry : AllocationCallToContextNodeMap) {
3937 Visited.clear();
3938 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3939 }
3940 Visited.clear();
3941 for (auto &Entry : AllocationCallToContextNodeMap)
3942 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3943 if (VerifyCCG)
3944 check();
3945}
3946
3947 // Helper function to check whether an AllocType is cold, notcold, or both.
3948bool checkColdOrNotCold(uint8_t AllocType) {
3949 return (AllocType == (uint8_t)AllocationType::Cold) ||
3950 (AllocType == (uint8_t)AllocationType::NotCold) ||
3951 (AllocType ==
3952 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3953}
3954
3955template <typename DerivedCCG, typename FuncTy, typename CallTy>
3956void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3957 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3958 const DenseSet<uint32_t> &AllocContextIds) {
3959 if (VerifyNodes)
3960 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3961 assert(!Node->CloneOf);
3962
3963 // If Node has a null call, then either it wasn't found in the module (regular
3964 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3965 // cloning (e.g. recursion, calls multiple targets, etc).
3966 // Do this here so that we don't try to recursively clone callers below, which
3967 // isn't useful at least for this node.
3968 if (!Node->hasCall())
3969 return;
3970
3971 // No need to look at any callers if allocation type already unambiguous.
3972 if (hasSingleAllocType(Node->AllocTypes))
3973 return;
3974
3975#ifndef NDEBUG
3976 auto Insert =
3977#endif
3978 Visited.insert(Node);
3979 // We should not have visited this node yet.
3980 assert(Insert.second);
3981 // The recursive call to identifyClones may delete the current edge from the
3982 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3983 // in an iterator and having recursive call erase from it. Other edges may
3984 // also get removed during the recursion, which will have null Callee and
3985 // Caller pointers (and are deleted later), so we skip those below.
3986 {
3987 auto CallerEdges = Node->CallerEdges;
3988 for (auto &Edge : CallerEdges) {
3989 // Skip any that have been removed by an earlier recursive call.
3990 if (Edge->isRemoved()) {
3991 assert(!is_contained(Node->CallerEdges, Edge));
3992 continue;
3993 }
3994 // Defer backedges. See comments further below where these edges are
3995 // handled during the cloning of this Node.
3996 if (Edge->IsBackedge) {
3997 // We should only mark these if cloning recursive contexts, where we
3998 // need to do this deferral.
3999 assert(CloneRecursiveContexts);
4000 continue;
4001 }
4002 // Ignore any caller we previously visited via another edge.
4003 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
4004 identifyClones(Edge->Caller, Visited, AllocContextIds);
4005 }
4006 }
4007 }
4008
4009 // Check if we reached an unambiguous call or have only a single caller.
4010 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4011 return;
4012
4013 // We need to clone.
4014
4015 // Try to keep the original version as alloc type NotCold. This will make
4016 // cases with indirect calls or any other situation with an unknown call to
4017 // the original function get the default behavior. We do this by sorting the
4018 // CallerEdges of the Node we will clone by alloc type.
4019 //
4020 // Give NotCold edge the lowest sort priority so those edges are at the end of
4021 // the caller edges vector, and stay on the original version (since the below
4022 // code clones greedily until it finds all remaining edges have the same type
4023 // and leaves the remaining ones on the original Node).
4024 //
4025 // We shouldn't actually have any None type edges, so the sorting priority for
4026 // that is arbitrary, and we assert in that case below.
4027 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
4028 /*Cold*/ 1,
4029 /*NotColdCold*/ 2};
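// For example, if Node's caller edges currently have alloc types NotCold,
// Cold and NotColdCold, the sort below orders them Cold, NotColdCold,
// NotCold, so the greedy cloning loop peels off the cold and mixed callers
// first and the NotCold caller remains on the original Node.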
4030 llvm::stable_sort(Node->CallerEdges,
4031 [&](const std::shared_ptr<ContextEdge> &A,
4032 const std::shared_ptr<ContextEdge> &B) {
4033 // Nodes with non-empty context ids should be sorted
4034 // before those with empty context ids.
4035 if (A->ContextIds.empty())
4036 // Either B ContextIds are non-empty (in which case we
4037 // should return false because B < A), or B ContextIds
4038 // are empty, in which case they are equal, and we
4039 // should maintain the original relative ordering.
4040 return false;
4041 if (B->ContextIds.empty())
4042 return true;
4043
4044 if (A->AllocTypes == B->AllocTypes)
4045 // Use the first context id for each edge as a
4046 // tie-breaker.
4047 return *A->ContextIds.begin() < *B->ContextIds.begin();
4048 return AllocTypeCloningPriority[A->AllocTypes] <
4049 AllocTypeCloningPriority[B->AllocTypes];
4050 });
4051
4052 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4053
4054 DenseSet<uint32_t> RecursiveContextIds;
4055 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
4056 // If we are allowing recursive callsites, but have also disabled recursive
4057 // contexts, look for context ids that show up in multiple caller edges.
4058 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
4059 DenseSet<uint32_t> AllCallerContextIds;
4060 for (auto &CE : Node->CallerEdges) {
4061 // Reserve space for the largest set of caller context ids, since we know
4062 // the final set will be at least that large.
4063 AllCallerContextIds.reserve(CE->getContextIds().size());
4064 for (auto Id : CE->getContextIds())
4065 if (!AllCallerContextIds.insert(Id).second)
4066 RecursiveContextIds.insert(Id);
4067 }
4068 }
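// For example (hypothetical ids), if context id 7 shows up on two different
// caller edges of Node because the context passes through Node twice via a
// recursive cycle, the second insert above fails and id 7 lands in
// RecursiveContextIds; it is then subtracted from the per-caller-edge context
// sets considered for cloning below.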
4069
4070 // Iterate until we find no more opportunities for disambiguating the alloc
4071 // types via cloning. In most cases this loop will terminate once the Node
4072 // has a single allocation type, in which case no more cloning is needed.
4073 // Iterate over a copy of Node's caller edges, since we may need to remove
4074 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4075 // makes it less error-prone.
4076 auto CallerEdges = Node->CallerEdges;
4077 for (auto &CallerEdge : CallerEdges) {
4078 // Skip any that have been removed by an earlier recursive call.
4079 if (CallerEdge->isRemoved()) {
4080 assert(!is_contained(Node->CallerEdges, CallerEdge));
4081 continue;
4082 }
4083 assert(CallerEdge->Callee == Node);
4084
4085 // See if cloning the prior caller edge left this node with a single alloc
4086 // type or a single caller. In that case no more cloning of Node is needed.
4087 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4088 break;
4089
4090 // If the caller was not successfully matched to a call in the IR/summary,
4091 // there is no point in trying to clone for it as we can't update that call.
4092 if (!CallerEdge->Caller->hasCall())
4093 continue;
4094
4095 // Only need to process the ids along this edge pertaining to the given
4096 // allocation.
4097 auto CallerEdgeContextsForAlloc =
4098 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4099 if (!RecursiveContextIds.empty())
4100 CallerEdgeContextsForAlloc =
4101 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4102 if (CallerEdgeContextsForAlloc.empty())
4103 continue;
4104
4105 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4106
4107 // Compute the node callee edge alloc types corresponding to the context ids
4108 // for this caller edge.
4109 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4110 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
4111 for (auto &CalleeEdge : Node->CalleeEdges)
4112 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4113 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4114
4115 // Don't clone if doing so will not disambiguate any alloc types amongst
4116 // caller edges (including the callee edges that would be cloned).
4117 // Otherwise we will simply move all edges to the clone.
4118 //
4119 // First check if by cloning we will disambiguate the caller allocation
4120 // type from node's allocation type. Query allocTypeToUse so that we don't
4121 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4122 // neither of these should be None type.
4123 //
4124 // Then check if by cloning node at least one of the callee edges will be
4125 // disambiguated by splitting out different context ids.
4126 //
4127 // However, always do the cloning if this is a backedge, in which case we
4128 // have not yet cloned along this caller edge.
4129 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4130 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4131 if (!CallerEdge->IsBackedge &&
4132 allocTypeToUse(CallerAllocTypeForAlloc) ==
4133 allocTypeToUse(Node->AllocTypes) &&
4134 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4135 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4136 continue;
4137 }
4138
4139 if (CallerEdge->IsBackedge) {
4140 // We should only mark these if cloning recursive contexts, where we
4141 // need to do this deferral.
4142 assert(CloneRecursiveContexts);
4143 DeferredBackedges++;
4144 }
4145
4146 // If this is a backedge, we now do recursive cloning starting from its
4147 // caller since we may have moved unambiguous caller contexts to a clone
4148 // of this Node in a previous iteration of the current loop, giving more
4149 // opportunity for cloning through the backedge. Because we sorted the
4150 // caller edges earlier so that cold caller edges are first, we would have
4151 // visited and cloned this node for any unambiguously cold non-recursive
4152 // callers before any ambiguous backedge callers. Note that we don't do this
4153 // if the caller is already cloned or visited during cloning (e.g. via a
4154 // different context path from the allocation).
4155 // TODO: Can we do better in the case where the caller was already visited?
4156 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4157 !Visited.count(CallerEdge->Caller)) {
4158 const auto OrigIdCount = CallerEdge->getContextIds().size();
4159 // Now do the recursive cloning of this backedge's caller, which was
4160 // deferred earlier.
4161 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4162 removeNoneTypeCalleeEdges(CallerEdge->Caller);
4163 // See if the recursive call to identifyClones moved the context ids to a
4164 // new edge from this node to a clone of caller, and switch to looking at
4165 // that new edge so that we clone Node for the new caller clone.
4166 bool UpdatedEdge = false;
4167 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4168 for (auto E : Node->CallerEdges) {
4169 // Only interested in clones of the current edge's caller.
4170 if (E->Caller->CloneOf != CallerEdge->Caller)
4171 continue;
4172 // See if this edge contains any of the context ids originally on the
4173 // current caller edge.
4174 auto CallerEdgeContextsForAllocNew =
4175 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4176 if (CallerEdgeContextsForAllocNew.empty())
4177 continue;
4178 // Make sure we don't pick a previously existing caller edge of this
4179 // Node, which would be processed on a different iteration of the
4180 // outer loop over the saved CallerEdges.
4181 if (llvm::is_contained(CallerEdges, E))
4182 continue;
4183 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4184 // are updated further below for all cases where we just invoked
4185 // identifyClones recursively.
4186 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4187 CallerEdge = E;
4188 UpdatedEdge = true;
4189 break;
4190 }
4191 }
4192 // If cloning removed this edge (and we didn't update it to a new edge
4193 // above), we're done with this edge. It's possible we moved all of the
4194 // context ids to an existing clone, in which case there's no need to do
4195 // further processing for them.
4196 if (CallerEdge->isRemoved())
4197 continue;
4198
4199 // Now we need to update the information used for the cloning decisions
4200 // further below, as we may have modified edges and their context ids.
4201
4202 // Note if we changed the CallerEdge above we would have already updated
4203 // the context ids.
4204 if (!UpdatedEdge) {
4205 CallerEdgeContextsForAlloc = set_intersection(
4206 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4207 if (CallerEdgeContextsForAlloc.empty())
4208 continue;
4209 }
4210 // Update the other information that depends on the edges and on the now
4211 // updated CallerEdgeContextsForAlloc.
4212 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4213 CalleeEdgeAllocTypesForCallerEdge.clear();
4214 for (auto &CalleeEdge : Node->CalleeEdges) {
4215 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4216 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4217 }
4218 }
4219
4220 // First see if we can use an existing clone. Check each clone and its
4221 // callee edges for matching alloc types.
4222 ContextNode *Clone = nullptr;
4223 for (auto *CurClone : Node->Clones) {
4224 if (allocTypeToUse(CurClone->AllocTypes) !=
4225 allocTypeToUse(CallerAllocTypeForAlloc))
4226 continue;
4227
4228 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4229 hasSingleAllocType(CallerAllocTypeForAlloc);
4230 // The above check means that if both have single alloc types, they must
4231 // be equal.
4232 assert(!BothSingleAlloc ||
4233 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4234
4235 // If either both have a single alloc type (which are the same), or if the
4236 // clone's callee edges have the same alloc types as those for the current
4237 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4238 // then we can reuse this clone.
4239 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4240 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4241 Clone = CurClone;
4242 break;
4243 }
4244 }
4245
4246 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4247 if (Clone)
4248 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
4249 CallerEdgeContextsForAlloc);
4250 else
4251 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
4252
4253 // Sanity check that no alloc types on clone or its edges are None.
4254 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4255 }
4256
4257 // We should still have some context ids on the original Node.
4258 assert(!Node->emptyContextIds());
4259
4260 // Sanity check that no alloc types on node or edges are None.
4261 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4262
4263 if (VerifyNodes)
4264 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4265}
4266
4267void ModuleCallsiteContextGraph::updateAllocationCall(
4268 CallInfo &Call, AllocationType AllocType) {
4269 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
4270 removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
4271 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
4272 "memprof", AllocTypeString);
4273 cast<CallBase>(Call.call())->addFnAttr(A);
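// The attribute added above surfaces in the IR as a string function attribute
// on the call site, roughly (illustrative only):
//   %call = call ptr @_Znwm(i64 8) #1   ; attributes #1 = { "memprof"="cold" }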
4274 OREGetter(Call.call()->getFunction())
4275 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4276 << ore::NV("AllocationCall", Call.call()) << " in clone "
4277 << ore::NV("Caller", Call.call()->getFunction())
4278 << " marked with memprof allocation attribute "
4279 << ore::NV("Attribute", AllocTypeString));
4280}
4281
4282void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4283 AllocationType AllocType) {
4284 auto *AI = cast<AllocInfo *>(Call.call());
4285 assert(AI);
4286 assert(AI->Versions.size() > Call.cloneNo());
4287 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4288}
4289
4290AllocationType
4291ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4292 const auto *CB = cast<CallBase>(Call.call());
4293 if (!CB->getAttributes().hasFnAttr("memprof"))
4294 return AllocationType::None;
4295 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4296 ? AllocationType::Cold
4297 : AllocationType::NotCold;
4298}
4299
4300AllocationType
4301IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4302 const auto *AI = cast<AllocInfo *>(Call.call());
4303 assert(AI->Versions.size() > Call.cloneNo());
4304 return (AllocationType)AI->Versions[Call.cloneNo()];
4305}
4306
4307void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4308 FuncInfo CalleeFunc) {
4309 auto *CurF = getCalleeFunc(CallerCall.call());
4310 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4311 if (isMemProfClone(*CurF)) {
4312 // If we already assigned this callsite to call a specific non-default
4313 // clone (i.e. not the original function which is clone 0), ensure that we
4314 // aren't trying to now update it to call a different clone, which is
4315 // indicative of a bug in the graph or function assignment.
4316 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4317 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4318 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4319 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4320 << "\n");
4321 MismatchedCloneAssignments++;
4322 }
4323 }
4324 if (NewCalleeCloneNo > 0)
4325 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4326 OREGetter(CallerCall.call()->getFunction())
4327 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4328 << ore::NV("Call", CallerCall.call()) << " in clone "
4329 << ore::NV("Caller", CallerCall.call()->getFunction())
4330 << " assigned to call function clone "
4331 << ore::NV("Callee", CalleeFunc.func()));
4332}
4333
4334void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4335 FuncInfo CalleeFunc) {
4336 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4337 assert(CI &&
4338 "Caller cannot be an allocation which should not have profiled calls");
4339 assert(CI->Clones.size() > CallerCall.cloneNo());
4340 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4341 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4342 // If we already assigned this callsite to call a specific non-default
4343 // clone (i.e. not the original function which is clone 0), ensure that we
4344 // aren't trying to now update it to call a different clone, which is
4345 // indicative of a bug in the graph or function assignment.
4346 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4347 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4348 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4349 << "\n");
4350 MismatchedCloneAssignments++;
4351 }
4352 CurCalleeCloneNo = NewCalleeCloneNo;
4353}
4354
4355// Update the debug information attached to NewFunc to use the clone Name. Note
4356 // this needs to be done both for any existing DISubprogram for the
4357 // definition and for any separate declaration DISubprogram.
4358static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4359 assert(Name == NewFunc->getName());
4360 auto *SP = NewFunc->getSubprogram();
4361 if (!SP)
4362 return;
4363 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4364 SP->replaceLinkageName(MDName);
4365 DISubprogram *Decl = SP->getDeclaration();
4366 if (!Decl)
4367 return;
4368 TempDISubprogram NewDecl = Decl->clone();
4369 NewDecl->replaceLinkageName(MDName);
4370 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4371}
4372
4373CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4374 Instruction *>::FuncInfo
4375ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4376 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4377 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4378 // Use existing LLVM facilities for cloning and obtaining Call in clone
4379 ValueToValueMapTy VMap;
4380 auto *NewFunc = CloneFunction(Func.func(), VMap);
4381 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4382 assert(!Func.func()->getParent()->getFunction(Name));
4383 NewFunc->setName(Name);
4384 updateSubprogramLinkageName(NewFunc, Name);
4385 for (auto &Inst : CallsWithMetadataInFunc) {
4386 // This map always has the initial version in it.
4387 assert(Inst.cloneNo() == 0);
4388 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4389 }
4390 OREGetter(Func.func())
4391 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4392 << "created clone " << ore::NV("NewFunction", NewFunc));
4393 return {NewFunc, CloneNo};
4394}
4395
4396CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4397 IndexCall>::FuncInfo
4398IndexCallsiteContextGraph::cloneFunctionForCallsite(
4399 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4400 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4401 // Check how many clones we have of Call (and therefore function).
4402 // The next clone number is the current size of versions array.
4403 // Confirm this matches the CloneNo provided by the caller, which is based on
4404 // the number of function clones we have.
4405 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4406 ? cast<AllocInfo *>(Call.call())->Versions.size()
4407 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4408 // Walk all the instructions in this function. Create a new version for
4409 // each (by adding an entry to the Versions/Clones summary array), and copy
4410 // over the version being called for the function clone being cloned here.
4411 // Additionally, add an entry to the CallMap for the new function clone,
4412 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4413 // to the new call clone.
4414 for (auto &Inst : CallsWithMetadataInFunc) {
4415 // This map always has the initial version in it.
4416 assert(Inst.cloneNo() == 0);
4417 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4418 assert(AI->Versions.size() == CloneNo);
4419 // We assign the allocation type later (in updateAllocationCall), just add
4420 // an entry for it here.
4421 AI->Versions.push_back(0);
4422 } else {
4423 auto *CI = cast<CallsiteInfo *>(Inst.call());
4424 assert(CI && CI->Clones.size() == CloneNo);
4425 // We assign the clone number later (in updateCall), just add an entry for
4426 // it here.
4427 CI->Clones.push_back(0);
4428 }
4429 CallMap[Inst] = {Inst.call(), CloneNo};
4430 }
4431 return {Func.func(), CloneNo};
4432}
4433
4434// We perform cloning for each allocation node separately. However, this
4435// sometimes results in a situation where the same node calls multiple
4436// clones of the same callee, created for different allocations. This
4437// causes issues when assigning functions to these clones, as each node can
4438// in reality only call a single callee clone.
4439//
4440// To address this, before assigning functions, merge callee clone nodes as
4441// needed using a post order traversal from the allocations. We attempt to
4442// use existing clones as the merge node when legal, and to share them
4443// among callers with the same properties (callers calling the same set of
4444// callee clone nodes for the same allocations).
4445//
4446// Without this fix, in some cases incorrect function assignment will lead
4447// to calling the wrong allocation clone.
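// For example (hypothetical nodes), if callsite node X ends up with callee
// edges to two clones of the same callee, one created while cloning for
// allocation A1 and one for allocation A2, the call recorded on X can only be
// redirected to a single function clone. Merging those callee clones (or
// moving X's edges onto one merge node) restores the invariant that X calls
// exactly one clone of that callee.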
4448template <typename DerivedCCG, typename FuncTy, typename CallTy>
4449void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4450 if (!MergeClones)
4451 return;
4452
4453 // Generate a map from context id to the associated allocation node for use
4454 // when merging clones.
4455 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4456 for (auto &Entry : AllocationCallToContextNodeMap) {
4457 auto *Node = Entry.second;
4458 for (auto Id : Node->getContextIds())
4459 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4460 for (auto *Clone : Node->Clones) {
4461 for (auto Id : Clone->getContextIds())
4462 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4463 }
4464 }
4465
4466 // Post order traversal starting from allocations to ensure each callsite
4467 // calls a single clone of its callee. Callee nodes that are clones of each
4468 // other are merged (via new merge nodes if needed) to achieve this.
4469 DenseSet<const ContextNode *> Visited;
4470 for (auto &Entry : AllocationCallToContextNodeMap) {
4471 auto *Node = Entry.second;
4472
4473 mergeClones(Node, Visited, ContextIdToAllocationNode);
4474
4475 // Make a copy so the recursive post order traversal that may create new
4476 // clones doesn't mess up iteration. Note that the recursive traversal
4477 // itself does not call mergeClones on any of these nodes, which are all
4478 // (clones of) allocations.
4479 auto Clones = Node->Clones;
4480 for (auto *Clone : Clones)
4481 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4482 }
4483
4484 if (DumpCCG) {
4485 dbgs() << "CCG after merging:\n";
4486 dbgs() << *this;
4487 }
4488 if (ExportToDot)
4489 exportToDot("aftermerge");
4490
4491 if (VerifyCCG) {
4492 check();
4493 }
4494}
4495
4496// Recursive helper for above mergeClones method.
4497template <typename DerivedCCG, typename FuncTy, typename CallTy>
4498void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4499 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4500 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4501 auto Inserted = Visited.insert(Node);
4502 if (!Inserted.second)
4503 return;
4504
4505 // Iteratively perform merging on this node to handle new caller nodes created
4506 // during the recursive traversal. We could do something more elegant such as
4507 // maintain a worklist, but this is a simple approach that doesn't cause a
4508 // measurable compile time effect, as most nodes don't have many caller
4509 // edges to check.
4510 bool FoundUnvisited = true;
4511 unsigned Iters = 0;
4512 while (FoundUnvisited) {
4513 Iters++;
4514 FoundUnvisited = false;
4515 // Make a copy since the recursive call may move a caller edge to a new
4516 // callee, messing up the iterator.
4517 auto CallerEdges = Node->CallerEdges;
4518 for (auto CallerEdge : CallerEdges) {
4519 // Skip any caller edge moved onto a different callee during recursion.
4520 if (CallerEdge->Callee != Node)
4521 continue;
4522 // If we found an unvisited caller, note that we should check the caller
4523 // edges again as mergeClones may add or change caller nodes.
4524 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4525 FoundUnvisited = true;
4526 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4527 }
4528 }
4529
4530 TotalMergeInvokes++;
4531 TotalMergeIters += Iters;
4532 if (Iters > MaxMergeIters)
4533 MaxMergeIters = Iters;
4534
4535 // Merge for this node after we handle its callers.
4536 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4537}
4538
4539template <typename DerivedCCG, typename FuncTy, typename CallTy>
4540void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4541 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4542 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4543 // Ignore Node if we moved all of its contexts to clones.
4544 if (Node->emptyContextIds())
4545 return;
4546
4547 // First identify groups of clones among Node's callee edges, by building
4548 // a map from each callee base node to the associated callee edges from Node.
4549 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4550 OrigNodeToCloneEdges;
4551 for (const auto &E : Node->CalleeEdges) {
4552 auto *Callee = E->Callee;
4553 if (!Callee->CloneOf && Callee->Clones.empty())
4554 continue;
4555 ContextNode *Base = Callee->getOrigNode();
4556 OrigNodeToCloneEdges[Base].push_back(E);
4557 }
4558
4559 // Helper for callee edge sorting below. Return true if A's callee has fewer
4560 // caller edges than B, or if A is a clone and B is not, or if A's first
4561 // context id is smaller than B's.
4562 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4563 const std::shared_ptr<ContextEdge> &B) {
4564 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4565 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4566 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4567 return true;
4568 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4569 return false;
4570 // Use the first context id for each edge as a
4571 // tie-breaker.
4572 return *A->ContextIds.begin() < *B->ContextIds.begin();
4573 };
4574
4575 // Process each set of callee clones called by Node, performing the needed
4576 // merging.
4577 for (auto Entry : OrigNodeToCloneEdges) {
4578 // CalleeEdges is the set of edges from Node reaching callees that are
4579 // mutual clones of each other.
4580 auto &CalleeEdges = Entry.second;
4581 auto NumCalleeClones = CalleeEdges.size();
4582 // A single edge means there is no merging needed.
4583 if (NumCalleeClones == 1)
4584 continue;
4585 // Sort the CalleeEdges calling this group of clones in ascending order of
4586 // their caller edge counts, putting the original non-clone node first in
4587 // cases of a tie. This simplifies finding an existing node to use as the
4588 // merge node.
4589 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4590
4591 /// Find other callers of the given set of callee edges that can
4592 /// share the same callee merge node. See the comments at this method
4593 /// definition for details.
4594 DenseSet<ContextNode *> OtherCallersToShareMerge;
4595 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4596 OtherCallersToShareMerge);
4597
4598 // Now do the actual merging. Identify existing or create a new MergeNode
4599 // during the first iteration. Move each callee over, along with edges from
4600 // other callers we've determined above can share the same merge node.
4601 ContextNode *MergeNode = nullptr;
4602 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4603 for (auto CalleeEdge : CalleeEdges) {
4604 auto *OrigCallee = CalleeEdge->Callee;
4605 // If we don't have a MergeNode yet (only happens on the first iteration,
4606 // as a new one will be created when we go to move the first callee edge
4607 // over as needed), see if we can use this callee.
4608 if (!MergeNode) {
4609 // If there are no other callers, simply use this callee.
4610 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4611 MergeNode = OrigCallee;
4612 NonNewMergedNodes++;
4613 continue;
4614 }
4615 // Otherwise, if we have identified other caller nodes that can share
4616 // the merge node with Node, see if all of OrigCallee's callers are
4617 // going to share the same merge node. In that case we can use callee
4618 // (since all of its callers would move to the new merge node).
4619 if (!OtherCallersToShareMerge.empty()) {
4620 bool MoveAllCallerEdges = true;
4621 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4622 if (CalleeCallerE == CalleeEdge)
4623 continue;
4624 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4625 MoveAllCallerEdges = false;
4626 break;
4627 }
4628 }
4629 // If we are going to move all callers over, we can use this callee as
4630 // the MergeNode.
4631 if (MoveAllCallerEdges) {
4632 MergeNode = OrigCallee;
4633 NonNewMergedNodes++;
4634 continue;
4635 }
4636 }
4637 }
4638 // Move this callee edge, creating a new merge node if necessary.
4639 if (MergeNode) {
4640 assert(MergeNode != OrigCallee);
4641 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4642 /*NewClone*/ false);
4643 } else {
4644 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4645 NewMergedNodes++;
4646 }
4647 // Now move all identified edges from other callers over to the merge node
4648 // as well.
4649 if (!OtherCallersToShareMerge.empty()) {
4650 // Make and iterate over a copy of OrigCallee's caller edges because
4651 // some of these will be moved off of the OrigCallee and that would mess
4652 // up the iteration from OrigCallee.
4653 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4654 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4655 if (CalleeCallerE == CalleeEdge)
4656 continue;
4657 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4658 continue;
4659 CallerToMoveCount[CalleeCallerE->Caller]++;
4660 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4661 /*NewClone*/ false);
4662 }
4663 }
4664 removeNoneTypeCalleeEdges(OrigCallee);
4665 removeNoneTypeCalleeEdges(MergeNode);
4666 }
4667 }
4668}
4669
4670// Look for other nodes that have edges to the same set of callee
4671// clones as the current Node. Those can share the eventual merge node
4672// (reducing cloning and binary size overhead) iff:
4673// - they have edges to the same set of callee clones
4674// - each callee edge reaches a subset of the same allocations as Node's
4675// corresponding edge to the same callee clone.
4676// The second requirement is to ensure that we don't undo any of the
4677// necessary cloning to distinguish contexts with different allocation
4678// behavior.
4679// FIXME: This is somewhat conservative, as we really just need to ensure
4680// that they don't reach the same allocations as contexts on edges from Node
4681// going to any of the *other* callee clones being merged. However, that
4682// requires more tracking and checking to get right.
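// For example (hypothetical nodes), if Node and another caller Y both have
// edges to the same two callee clones, and the contexts on each of Y's edges
// only reach allocations also reached by Node's corresponding edge, then Y
// can be redirected to the merge node created for Node rather than requiring
// its own merge clone.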
4683template <typename DerivedCCG, typename FuncTy, typename CallTy>
4684void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4685 findOtherCallersToShareMerge(
4686 ContextNode *Node,
4687 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4688 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4689 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4690 auto NumCalleeClones = CalleeEdges.size();
4691 // This map counts how many edges to the same callee clone exist for other
4692 // caller nodes of each callee clone.
4693 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4694 // Counts the number of other caller nodes that have edges to all callee
4695 // clones that don't violate the allocation context checking.
4696 unsigned PossibleOtherCallerNodes = 0;
4697
4698 // We only need to look at other Caller nodes if the first callee edge has
4699 // multiple callers (recall they are sorted in ascending order above).
4700 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4701 return;
4702
4703 // For each callee edge:
4704 // - Collect the count of other caller nodes calling the same callees.
4705 // - Collect the alloc nodes reached by contexts on each callee edge.
4706 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4707 for (auto CalleeEdge : CalleeEdges) {
4708 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4709 // For each other caller of the same callee, increment the count of
4710 // edges reaching the same callee clone.
4711 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4712 if (CalleeCallerEdges->Caller == Node) {
4713 assert(CalleeCallerEdges == CalleeEdge);
4714 continue;
4715 }
4716 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4717 // If this caller edge now reaches all of the same callee clones,
4718 // increment the count of candidate other caller nodes.
4719 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4720 NumCalleeClones)
4721 PossibleOtherCallerNodes++;
4722 }
4723 // Collect the alloc nodes reached by contexts on each callee edge, for
4724 // later analysis.
4725 for (auto Id : CalleeEdge->getContextIds()) {
4726 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4727 if (!Alloc) {
4728 // FIXME: unclear why this happens occasionally, presumably
4729 // imperfect graph updates possibly with recursion.
4730 MissingAllocForContextId++;
4731 continue;
4732 }
4733 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4734 }
4735 }
4736
4737 // Now walk the callee edges again, and make sure that for each candidate
4738 // caller node all of its edges to the callees reach the same allocs (or
4739 // a subset) as those along the corresponding callee edge from Node.
4740 for (auto CalleeEdge : CalleeEdges) {
4741 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4742 // Stop if we do not have any (more) candidate other caller nodes.
4743 if (!PossibleOtherCallerNodes)
4744 break;
4745 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4746 // Check each other caller of this callee clone.
4747 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4748 // Not interested in the callee edge from Node itself.
4749 if (CalleeCallerE == CalleeEdge)
4750 continue;
4751 // Skip any callers that didn't have callee edges to all the same
4752 // callee clones.
4753 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4754 NumCalleeClones)
4755 continue;
4756 // Make sure that each context along edge from candidate caller node
4757 // reaches an allocation also reached by this callee edge from Node.
4758 for (auto Id : CalleeCallerE->getContextIds()) {
4759 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4760 if (!Alloc)
4761 continue;
4762 // If not, simply reset the map entry to 0 so caller is ignored, and
4763 // reduce the count of candidate other caller nodes.
4764 if (!CurCalleeAllocNodes.contains(Alloc)) {
4765 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4766 PossibleOtherCallerNodes--;
4767 break;
4768 }
4769 }
4770 }
4771 }
4772
4773 if (!PossibleOtherCallerNodes)
4774 return;
4775
4776 // Build the set of other caller nodes that can use the same callee merge
4777 // node.
4778 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4779 if (Count != NumCalleeClones)
4780 continue;
4781 OtherCallersToShareMerge.insert(OtherCaller);
4782 }
4783}
4784
4785// This method assigns cloned callsites to functions, cloning the functions as
4786// needed. The assignment is greedy and proceeds roughly as follows:
4787//
4788// For each function Func:
4789// For each call with graph Node having clones:
4790// Initialize ClonesWorklist to Node and its clones
4791// Initialize NodeCloneCount to 0
4792// While ClonesWorklist is not empty:
4793// Clone = pop front ClonesWorklist
4794// NodeCloneCount++
4795// If Func has been cloned less than NodeCloneCount times:
4796// If NodeCloneCount is 1:
4797// Assign Clone to original Func
4798// Continue
4799// Create a new function clone
4800// If other callers not assigned to call a function clone yet:
4801// Assign them to call new function clone
4802// Continue
4803// Assign any other caller calling the cloned version to new clone
4804//
4805// For each caller of Clone:
4806// If caller is assigned to call a specific function clone:
4807// If we cannot assign Clone to that function clone:
4808// Create new callsite Clone NewClone
4809// Add NewClone to ClonesWorklist
4810// Continue
4811// Assign Clone to existing caller's called function clone
4812// Else:
4813// If Clone not already assigned to a function clone:
4814// Assign to first function clone without assignment
4815// Assign caller to selected function clone
4816// For each call with graph Node having clones:
4817// If number func clones > number call's callsite Node clones:
4818// Record func CallInfo clones without Node clone in UnassignedCallClones
4819// For callsite Nodes in DFS order from allocations:
4820// If IsAllocation:
4821// Update allocation with alloc type
4822// Else:
4823 // For Call, all MatchingCalls, and associated UnassignedCallClones:
4824// Update call to call recorded callee clone
4825//
4826template <typename DerivedCCG, typename FuncTy, typename CallTy>
4827bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4828 bool Changed = false;
4829
4830 mergeClones();
4831
4832 // Keep track of the assignment of nodes (callsites) to function clones they
4833 // call.
4834 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4835
4836 // Update caller node to call function version CalleeFunc, by recording the
4837 // assignment in CallsiteToCalleeFuncCloneMap.
4838 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4839 const FuncInfo &CalleeFunc) {
4840 assert(Caller->hasCall());
4841 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4842 };
4843
4844 // Information for a single clone of this Func.
4845 struct FuncCloneInfo {
4846 // The function clone.
4847 FuncInfo FuncClone;
4848 // Remappings of each call of interest (from original uncloned call to the
4849 // corresponding cloned call in this function clone).
4850 DenseMap<CallInfo, CallInfo> CallMap;
4851 };
4852
4853 // Map to keep track of information needed to update calls in function clones
4854 // when their corresponding callsite node was not itself cloned for that
4855 // function clone. Because of call context pruning (i.e. we only keep as much
4856 // caller information as needed to distinguish hot vs cold), we may not have
4857 // caller edges coming to each callsite node from all possible function
4858 // callers. A function clone may get created for other callsites in the
4859 // function for which there are caller edges that were not pruned. Any other
4860 // callsites in that function clone, which were not themselves cloned for
4861 // that function clone, should get updated the same way as the corresponding
4862 // callsite in the original function (which may call a clone of its callee).
4863 //
4864 // We build this map after completing function cloning for each function, so
4865 // that we can record the information from its call maps before they are
4866 // destructed. The map will be used as we update calls to update any still
4867 // unassigned call clones. Note that we may create new node clones as we clone
4868 // other functions, so later on we check which node clones were still not
4869 // created. To this end, the inner map is a map from function clone number to
4870 // the list of calls cloned for that function (can be more than one due to the
4871 // Node's MatchingCalls array).
4872 //
4873 // The alternative is creating new callsite clone nodes below as we clone the
4874 // function, but that is trickier to get right and likely more overhead.
4875 //
4876 // Inner map is a std::map so sorted by key (clone number), in order to get
4877 // ordered remarks in the full LTO case.
4878 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4879 UnassignedCallClones;
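// For example, if OrigFunc ends up with three clones because of some other
// callsite, but a given callsite node only needed two node clones, the copy of
// that call in the extra function clone is recorded here and later updated the
// same way as the corresponding call in the original function.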
4880
4881 // Walk all functions for which we saw calls with memprof metadata, and handle
4882 // cloning for each of their calls.
4883 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4884 FuncInfo OrigFunc(Func);
4885 // Map from each clone number of OrigFunc to information about that function
4886 // clone (the function clone FuncInfo and call remappings). The index into
4887 // the vector is the clone number, as function clones are created and
4888 // numbered sequentially.
4889 std::vector<FuncCloneInfo> FuncCloneInfos;
4890 for (auto &Call : CallsWithMetadata) {
4891 ContextNode *Node = getNodeForInst(Call);
4892 // Skip call if we do not have a node for it (all uses of its stack ids
4893 // were either on inlined chains or pruned from the MIBs), or if we did
4894 // not create any clones for it.
4895 if (!Node || Node->Clones.empty())
4896 continue;
4897 assert(Node->hasCall() &&
4898 "Not having a call should have prevented cloning");
4899
4900 // Track the assignment of function clones to clones of the current
4901 // callsite Node being handled.
4902 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4903
4904 // Assign callsite version CallsiteClone to function version FuncClone,
4905 // and also assign (possibly cloned) Call to CallsiteClone.
4906 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4907 CallInfo &Call,
4908 ContextNode *CallsiteClone,
4909 bool IsAlloc) {
4910 // Record the clone of callsite node assigned to this function clone.
4911 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4912
4913 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4914 DenseMap<CallInfo, CallInfo> &CallMap =
4915 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4916 CallInfo CallClone(Call);
4917 if (auto It = CallMap.find(Call); It != CallMap.end())
4918 CallClone = It->second;
4919 CallsiteClone->setCall(CallClone);
4920 // Need to do the same for all matching calls.
4921 for (auto &MatchingCall : Node->MatchingCalls) {
4922 CallInfo CallClone(MatchingCall);
4923 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4924 CallClone = It->second;
4925 // Updates the call in the list.
4926 MatchingCall = CallClone;
4927 }
4928 };
4929
4930 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4931 // performs the necessary fixups (removing none type edges, and
4932 // importantly, propagating any function call assignment of the original
4933 // node to the new clone).
4934 auto MoveEdgeToNewCalleeCloneAndSetUp =
4935 [&](const std::shared_ptr<ContextEdge> &Edge) {
4936 ContextNode *OrigCallee = Edge->Callee;
4937 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4938 removeNoneTypeCalleeEdges(NewClone);
4939 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4940 // If the original Callee was already assigned to call a specific
4941 // function version, make sure its new clone is assigned to call
4942 // that same function clone.
4943 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4944 RecordCalleeFuncOfCallsite(
4945 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4946 return NewClone;
4947 };
4948
4949 // Keep track of the clones of callsite Node that need to be assigned to
4950 // function clones. This list may be expanded in the loop body below if we
4951 // find additional cloning is required.
4952 std::deque<ContextNode *> ClonesWorklist;
4953 // Ignore original Node if we moved all of its contexts to clones.
4954 if (!Node->emptyContextIds())
4955 ClonesWorklist.push_back(Node);
4956 llvm::append_range(ClonesWorklist, Node->Clones);
4957
4958 // Now walk through all of the clones of this callsite Node that we need,
4959 // and determine the assignment to a corresponding clone of the current
4960 // function (creating new function clones as needed).
4961 unsigned NodeCloneCount = 0;
4962 while (!ClonesWorklist.empty()) {
4963 ContextNode *Clone = ClonesWorklist.front();
4964 ClonesWorklist.pop_front();
4965 NodeCloneCount++;
4966 if (VerifyNodes)
4967 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4968
4969 // Need to create a new function clone if we have more callsite clones
4970 // than existing function clones, which would have been assigned to an
4971 // earlier clone in the list (we assign callsite clones to function
4972 // clones greedily).
4973 if (FuncCloneInfos.size() < NodeCloneCount) {
4974 // If this is the first callsite copy, assign to original function.
4975 if (NodeCloneCount == 1) {
4976 // Since FuncCloneInfos is empty in this case, no clones have
4977 // been created for this function yet, and no callers should have
4978 // been assigned a function clone for this callee node yet.
4979 assert(llvm::none_of(
4980 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4981 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4982 }));
4983 // Initialize with empty call map, assign Clone to original function
4984 // and its callers, and skip to the next clone.
4985 FuncCloneInfos.push_back(
4986 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4987 AssignCallsiteCloneToFuncClone(
4988 OrigFunc, Call, Clone,
4989 AllocationCallToContextNodeMap.count(Call));
4990 for (auto &CE : Clone->CallerEdges) {
4991 // Ignore any caller that does not have a recorded callsite Call.
4992 if (!CE->Caller->hasCall())
4993 continue;
4994 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4995 }
4996 continue;
4997 }
4998
4999 // First locate which copy of OrigFunc to clone again. If a caller
5000 // of this callsite clone was already assigned to call a particular
5001 // function clone, we need to redirect all of those callers to the
5002 // new function clone, and update their other callees within this
5003 // function.
5004 FuncInfo PreviousAssignedFuncClone;
5005 auto EI = llvm::find_if(
5006 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5007 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5008 });
5009 bool CallerAssignedToCloneOfFunc = false;
5010 if (EI != Clone->CallerEdges.end()) {
5011 const std::shared_ptr<ContextEdge> &Edge = *EI;
5012 PreviousAssignedFuncClone =
5013 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5014 CallerAssignedToCloneOfFunc = true;
5015 }
5016
5017 // Clone function and save it along with the CallInfo map created
5018 // during cloning in the FuncCloneInfos.
5019 DenseMap<CallInfo, CallInfo> NewCallMap;
5020 unsigned CloneNo = FuncCloneInfos.size();
5021 assert(CloneNo > 0 && "Clone 0 is the original function, which "
5022 "should already exist in the map");
5023 FuncInfo NewFuncClone = cloneFunctionForCallsite(
5024 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
5025 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
5026 FunctionClonesAnalysis++;
5027 Changed = true;
5028
5029 // If no caller callsites were already assigned to a clone of this
5030 // function, we can simply assign this clone to the new func clone
5031 // and update all callers to it, then skip to the next clone.
5032 if (!CallerAssignedToCloneOfFunc) {
5033 AssignCallsiteCloneToFuncClone(
5034 NewFuncClone, Call, Clone,
5035 AllocationCallToContextNodeMap.count(Call));
5036 for (auto &CE : Clone->CallerEdges) {
5037 // Ignore any caller that does not have a recorded callsite Call.
5038 if (!CE->Caller->hasCall())
5039 continue;
5040 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5041 }
5042 continue;
5043 }
5044
5045 // We may need to do additional node cloning in this case.
5046 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5047 // that were previously assigned to call PreviousAssignedFuncClone,
5048 // to record that they now call NewFuncClone.
5049 // The none type edge removal may remove some of this Clone's caller
5050 // edges, if it is reached via another of its caller's callees.
5051 // Iterate over a copy and skip any that were removed.
5052 auto CallerEdges = Clone->CallerEdges;
5053 for (auto CE : CallerEdges) {
5054 // Skip any that have been removed on an earlier iteration.
5055 if (CE->isRemoved()) {
5056 assert(!is_contained(Clone->CallerEdges, CE));
5057 continue;
5058 }
5059 assert(CE);
5060 // Ignore any caller that does not have a recorded callsite Call.
5061 if (!CE->Caller->hasCall())
5062 continue;
5063
5064 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5065 // We subsequently fall through to later handling that
5066 // will perform any additional cloning required for
5067 // callers that were calling other function clones.
5068 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5069 PreviousAssignedFuncClone)
5070 continue;
5071
5072 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5073
5074 // If we are cloning a function that was already assigned to some
5075 // callers, then essentially we are creating new callsite clones
5076 // of the other callsites in that function that are reached by those
5077 // callers. Clone the other callees of the current callsite's caller
5078 // that were already assigned to PreviousAssignedFuncClone
5079 // accordingly. This is important since we subsequently update the
5080 // calls from the nodes in the graph and their assignments to callee
5081 // functions recorded in CallsiteToCalleeFuncCloneMap.
5082 // The none type edge removal may remove some of this caller's
5083 // callee edges, if it is reached via another of its callees.
5084 // Iterate over a copy and skip any that were removed.
5085 auto CalleeEdges = CE->Caller->CalleeEdges;
5086 for (auto CalleeEdge : CalleeEdges) {
5087 // Skip any that have been removed on an earlier iteration when
5088 // cleaning up newly None type callee edges.
5089 if (CalleeEdge->isRemoved()) {
5090 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5091 continue;
5092 }
5093 assert(CalleeEdge);
5094 ContextNode *Callee = CalleeEdge->Callee;
5095 // Skip the current callsite, we are looking for other
5096 // callsites Caller calls, as well as any that does not have a
5097 // recorded callsite Call.
5098 if (Callee == Clone || !Callee->hasCall())
5099 continue;
5100 // Skip direct recursive calls. We don't need/want to clone the
5101 // caller node again, and this loop will not behave as expected if
5102 // we tried.
5103 if (Callee == CalleeEdge->Caller)
5104 continue;
5105 ContextNode *NewClone =
5106 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5107 // Moving the edge may have resulted in some none type
5108 // callee edges on the original Callee.
5109 removeNoneTypeCalleeEdges(Callee);
5110 // Update NewClone with the new Call clone of this callsite's Call
5111 // created for the new function clone created earlier.
5112 // Recall that we have already ensured when building the graph
5113 // that each caller can only call callsites within the same
5114 // function, so we are guaranteed that Callee Call is in the
5115 // current OrigFunc.
5116 // CallMap is set up as indexed by original Call at clone 0.
5117 CallInfo OrigCall(Callee->getOrigNode()->Call);
5118 OrigCall.setCloneNo(0);
5119 DenseMap<CallInfo, CallInfo> &CallMap =
5120 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5121 assert(CallMap.count(OrigCall));
5122 CallInfo NewCall(CallMap[OrigCall]);
5123 assert(NewCall);
5124 NewClone->setCall(NewCall);
5125 // Need to do the same for all matching calls.
5126 for (auto &MatchingCall : NewClone->MatchingCalls) {
5127 CallInfo OrigMatchingCall(MatchingCall);
5128 OrigMatchingCall.setCloneNo(0);
5129 assert(CallMap.count(OrigMatchingCall));
5130 CallInfo NewCall(CallMap[OrigMatchingCall]);
5131 assert(NewCall);
5132 // Updates the call in the list.
5133 MatchingCall = NewCall;
5134 }
5135 }
5136 }
5137 // Fall through to handling below to perform the recording of the
5138 // function for this callsite clone. This enables handling of cases
5139 // where the callers were assigned to different clones of a function.
5140 }
5141
5142 auto FindFirstAvailFuncClone = [&]() {
5143 // Find first function in FuncCloneInfos without an assigned
5144 // clone of this callsite Node. We should always have one
5145 // available at this point due to the earlier cloning when the
5146 // FuncCloneInfos size was smaller than the clone number.
5147 for (auto &CF : FuncCloneInfos) {
5148 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5149 return CF.FuncClone;
5150 }
5151 llvm_unreachable(
5152 "Expected an available func clone for this callsite clone");
5153 };
5154
5155 // See if we can use existing function clone. Walk through
5156 // all caller edges to see if any have already been assigned to
5157 // a clone of this callsite's function. If we can use it, do so. If not,
5158 // because that function clone is already assigned to a different clone
5159 // of this callsite, then we need to clone again.
5160 // Basically, this checking is needed to handle the case where different
5161 // caller functions/callsites may need versions of this function
5162 // containing different mixes of callsite clones across the different
5163 // callsites within the function. If that happens, we need to create
5164 // additional function clones to handle the various combinations.
5165 //
5166 // Keep track of any new clones of this callsite created by the
5167 // following loop, as well as any existing clone that we decided to
5168 // assign this clone to.
5169 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5170 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5171 // Iterate over a copy of Clone's caller edges, since we may need to
5172 // remove edges in the moveEdgeTo* methods, and this simplifies the
5173 // handling and makes it less error-prone.
5174 auto CloneCallerEdges = Clone->CallerEdges;
5175 for (auto &Edge : CloneCallerEdges) {
5176 // Skip removed edges (due to direct recursive edges updated when
5177 // updating callee edges when moving an edge and subsequently
5178 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5179 if (Edge->isRemoved())
5180 continue;
5181 // Ignore any caller that does not have a recorded callsite Call.
5182 if (!Edge->Caller->hasCall())
5183 continue;
5184 // If this caller already assigned to call a version of OrigFunc, need
5185 // to ensure we can assign this callsite clone to that function clone.
5186 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5187 FuncInfo FuncCloneCalledByCaller =
5188 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5189 // First we need to confirm that this function clone is available
5190 // for use by this callsite node clone.
5191 //
5192 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5193 // its callsite clones, one of those callsite clones X could have
5194 // been assigned to the same function clone called by Edge's caller
5195 // - if Edge's caller calls another callsite within Node's original
5196 // function, and that callsite has another caller reaching clone X.
5197 // We need to clone Node again in this case.
5198 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5199 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5200 Clone) ||
5201 // Detect when we have multiple callers of this callsite that
5202 // have already been assigned to specific, and different, clones
5203 // of OrigFunc (due to other unrelated callsites in Func they
5204 // reach via call contexts). Is this Clone of callsite Node
5205 // assigned to a different clone of OrigFunc? If so, clone Node
5206 // again.
5207 (FuncCloneAssignedToCurCallsiteClone &&
5208 FuncCloneAssignedToCurCallsiteClone !=
5209 FuncCloneCalledByCaller)) {
5210 // We need to use a different newly created callsite clone, in
5211 // order to assign it to another new function clone on a
5212 // subsequent iteration over the Clones array (adjusted below).
5213 // Note we specifically do not reset the
5214 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5215 // when this new clone is processed later we know which version of
5216 // the function to copy (so that other callsite clones we have
5217 // assigned to that function clone are properly cloned over). See
5218 // comments in the function cloning handling earlier.
5219
5220 // Check if we already have cloned this callsite again while
5221 // walking through caller edges, for a caller calling the same
5222 // function clone. If so, we can move this edge to that new clone
5223 // rather than creating yet another new clone.
5224 if (FuncCloneToNewCallsiteCloneMap.count(
5225 FuncCloneCalledByCaller)) {
5226 ContextNode *NewClone =
5227 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5228                moveEdgeToExistingCalleeClone(Edge, NewClone);
5229                // Cleanup any none type edges cloned over.
5230                removeNoneTypeCalleeEdges(NewClone);
5231 } else {
5232 // Create a new callsite clone.
5233 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5234 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5235 NewClone;
5236 // Add to list of clones and process later.
5237 ClonesWorklist.push_back(NewClone);
5238 }
5239 // Moving the caller edge may have resulted in some none type
5240 // callee edges.
5241              removeNoneTypeCalleeEdges(Clone);
5242 // We will handle the newly created callsite clone in a subsequent
5243 // iteration over this Node's Clones.
5244 continue;
5245 }
5246
5247 // Otherwise, we can use the function clone already assigned to this
5248 // caller.
5249 if (!FuncCloneAssignedToCurCallsiteClone) {
5250 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5251 // Assign Clone to FuncCloneCalledByCaller
5252 AssignCallsiteCloneToFuncClone(
5253 FuncCloneCalledByCaller, Call, Clone,
5254 AllocationCallToContextNodeMap.count(Call));
5255 } else
5256 // Don't need to do anything - callsite is already calling this
5257 // function clone.
5258 assert(FuncCloneAssignedToCurCallsiteClone ==
5259 FuncCloneCalledByCaller);
5260
5261 } else {
5262 // We have not already assigned this caller to a version of
5263 // OrigFunc. Do the assignment now.
5264
5265 // First check if we have already assigned this callsite clone to a
5266 // clone of OrigFunc for another caller during this iteration over
5267 // its caller edges.
5268 if (!FuncCloneAssignedToCurCallsiteClone) {
5269 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5270 assert(FuncCloneAssignedToCurCallsiteClone);
5271 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5272 AssignCallsiteCloneToFuncClone(
5273 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5274 AllocationCallToContextNodeMap.count(Call));
5275 } else
5276 assert(FuncCloneToCurNodeCloneMap
5277 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5278 // Update callers to record function version called.
5279 RecordCalleeFuncOfCallsite(Edge->Caller,
5280 FuncCloneAssignedToCurCallsiteClone);
5281 }
5282 }
5283 // If we didn't assign a function clone to this callsite clone yet, e.g.
5284 // none of its callers has a non-null call, do the assignment here.
5285 // We want to ensure that every callsite clone is assigned to some
5286 // function clone, so that the call updates below work as expected.
5287 // In particular if this is the original callsite, we want to ensure it
5288 // is assigned to the original function, otherwise the original function
5289 // will appear available for assignment to other callsite clones,
5290 // leading to unintended effects. For one, the unknown and not updated
5291 // callers will call into cloned paths leading to the wrong hints,
5292 // because they still call the original function (clone 0). Also,
5293 // because all callsites start out as being clone 0 by default, we can't
5294 // easily distinguish between callsites explicitly assigned to clone 0
5295 // vs those never assigned, which can lead to multiple updates of the
5296 // calls when invoking updateCall below, with mismatched clone values.
5297 // TODO: Add a flag to the callsite nodes or some other mechanism to
5298 // better distinguish and identify callsite clones that are not getting
5299 // assigned to function clones as expected.
5300 if (!FuncCloneAssignedToCurCallsiteClone) {
5301 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5302 assert(FuncCloneAssignedToCurCallsiteClone &&
5303 "No available func clone for this callsite clone");
5304 AssignCallsiteCloneToFuncClone(
5305 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5306 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5307 }
5308 }
5309 if (VerifyCCG) {
5310 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5311 for (const auto &PE : Node->CalleeEdges)
5312 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5313 for (const auto &CE : Node->CallerEdges)
5314 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5315 for (auto *Clone : Node->Clones) {
5316 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5317 for (const auto &PE : Clone->CalleeEdges)
5318 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5319 for (const auto &CE : Clone->CallerEdges)
5320 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5321 }
5322 }
5323 }
5324
5325 if (FuncCloneInfos.size() < 2)
5326 continue;
5327
5328 // In this case there is more than just the original function copy.
5329 // Record call clones of any callsite nodes in the function that did not
5330 // themselves get cloned for all of the function clones.
5331 for (auto &Call : CallsWithMetadata) {
5332      ContextNode *Node = getNodeForInst(Call);
5333 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5334 continue;
5335 // If Node has enough clones already to cover all function clones, we can
5336 // skip it. Need to add one for the original copy.
5337 // Use >= in case there were clones that were skipped due to having empty
5338 // context ids
5339 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5340 continue;
5341 // First collect all function clones we cloned this callsite node for.
5342      // They may not be sequential, e.g. due to skipped empty clones.
5343 DenseSet<unsigned> NodeCallClones;
5344 for (auto *C : Node->Clones)
5345 NodeCallClones.insert(C->Call.cloneNo());
5346 unsigned I = 0;
5347 // Now check all the function clones.
5348 for (auto &FC : FuncCloneInfos) {
5349 // Function clones should be sequential.
5350 assert(FC.FuncClone.cloneNo() == I);
5351 // Skip the first clone which got the original call.
5352 // Also skip any other clones created for this Node.
5353        if (++I == 1 || NodeCallClones.contains(I)) {
5354 continue;
5355 }
5356 // Record the call clones created for this callsite in this function
5357 // clone.
5358 auto &CallVector = UnassignedCallClones[Node][I];
5359 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5360 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5361 CallInfo CallClone = It->second;
5362 CallVector.push_back(CallClone);
5363 } else {
5364 // All but the original clone (skipped earlier) should have an entry
5365 // for all calls.
5366 assert(false && "Expected to find call in CallMap");
5367 }
5368 // Need to do the same for all matching calls.
5369 for (auto &MatchingCall : Node->MatchingCalls) {
5370 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5371 CallInfo CallClone = It->second;
5372 CallVector.push_back(CallClone);
5373 } else {
5374 // All but the original clone (skipped earlier) should have an entry
5375 // for all calls.
5376 assert(false && "Expected to find call in CallMap");
5377 }
5378 }
5379 }
5380 }
5381 }
5382
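  // Encodes the "ambiguous" allocation type, i.e. contexts of both cold and
  // not-cold allocations reaching the same node.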
5383 uint8_t BothTypes =
5384 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5385
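  // Recursive lambda that walks from each node up through its clones and
  // caller edges (depth-first, visiting each node once), then updates the
  // node's call: allocation calls get their allocation hint applied, and
  // callsite calls are retargeted to the assigned callee function clone.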
5386 auto UpdateCalls = [&](ContextNode *Node,
5387 DenseSet<const ContextNode *> &Visited,
5388 auto &&UpdateCalls) {
5389 auto Inserted = Visited.insert(Node);
5390 if (!Inserted.second)
5391 return;
5392
5393 for (auto *Clone : Node->Clones)
5394 UpdateCalls(Clone, Visited, UpdateCalls);
5395
5396 for (auto &Edge : Node->CallerEdges)
5397 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5398
5399 // Skip if either no call to update, or if we ended up with no context ids
5400 // (we moved all edges onto other clones).
5401 if (!Node->hasCall() || Node->emptyContextIds())
5402 return;
5403
5404 if (Node->IsAllocation) {
5405 auto AT = allocTypeToUse(Node->AllocTypes);
5406 // If the allocation type is ambiguous, and more aggressive hinting
5407 // has been enabled via the MinClonedColdBytePercent flag, see if this
5408      // allocation should be hinted cold anyway because its fraction of cold
5409      // bytes allocated is at least the given threshold.
5410 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5411 !ContextIdToContextSizeInfos.empty()) {
5412 uint64_t TotalCold = 0;
5413 uint64_t Total = 0;
5414 for (auto Id : Node->getContextIds()) {
5415 auto TypeI = ContextIdToAllocationType.find(Id);
5416 assert(TypeI != ContextIdToAllocationType.end());
5417 auto CSI = ContextIdToContextSizeInfos.find(Id);
5418 if (CSI != ContextIdToContextSizeInfos.end()) {
5419 for (auto &Info : CSI->second) {
5420 Total += Info.TotalSize;
5421 if (TypeI->second == AllocationType::Cold)
5422 TotalCold += Info.TotalSize;
5423 }
5424 }
5425 }
5426 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5427 AT = AllocationType::Cold;
5428 }
5429    updateAllocationCall(Node->Call, AT);
5430 assert(Node->MatchingCalls.empty());
5431 return;
5432 }
5433
5434 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5435 return;
5436
5437 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5438  updateCall(Node->Call, CalleeFunc);
5439  // Update all the matching calls as well.
5440  for (auto &Call : Node->MatchingCalls)
5441    updateCall(Call, CalleeFunc);
5442
5443 // Now update all calls recorded earlier that are still in function clones
5444 // which don't have a clone of this callsite node.
5445 if (!UnassignedCallClones.contains(Node))
5446 return;
5447 DenseSet<unsigned> NodeCallClones;
5448 for (auto *C : Node->Clones)
5449 NodeCallClones.insert(C->Call.cloneNo());
5450 // Note that we already confirmed Node is in this map a few lines above.
5451 auto &ClonedCalls = UnassignedCallClones[Node];
5452 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5453 // Should start at 1 as we never create an entry for original node.
5454 assert(CloneNo > 0);
5455 // If we subsequently created a clone, skip this one.
5456    if (NodeCallClones.contains(CloneNo))
5457      continue;
5458    // Use the original Node's CalleeFunc.
5459    for (auto &Call : CallVector)
5460      updateCall(Call, CalleeFunc);
5461 }
5462 };
5463
5464 // Performs DFS traversal starting from allocation nodes to update calls to
5465 // reflect cloning decisions recorded earlier. For regular LTO this will
5466 // update the actual calls in the IR to call the appropriate function clone
5467 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5468 // are recorded in the summary entries.
5469 DenseSet<const ContextNode *> Visited;
5470 for (auto &Entry : AllocationCallToContextNodeMap)
5471 UpdateCalls(Entry.second, Visited, UpdateCalls);
5472
5473 return Changed;
5474}
5475
5476// Compute a SHA1 hash of the callsite and alloc version information of clone I
5477// in the summary, to use in detection of duplicate clones.
5478uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5479 SHA1 Hasher;
5480 // Update hash with any callsites that call non-default (non-zero) callee
5481 // versions.
5482 for (auto &SN : FS->callsites()) {
5483 // In theory all callsites and allocs in this function should have the same
5484 // number of clone entries, but handle any discrepancies gracefully below
5485 // for NDEBUG builds.
5486 assert(
5487 SN.Clones.size() > I &&
5488 "Callsite summary has fewer entries than other summaries in function");
5489 if (SN.Clones.size() <= I || !SN.Clones[I])
5490 continue;
5491 uint8_t Data[sizeof(SN.Clones[I])];
5492    support::endian::write32le(Data, SN.Clones[I]);
5493 Hasher.update(Data);
5494 }
5495 // Update hash with any allocs that have non-default (non-None) hints.
5496 for (auto &AN : FS->allocs()) {
5497 // In theory all callsites and allocs in this function should have the same
5498 // number of clone entries, but handle any discrepancies gracefully below
5499 // for NDEBUG builds.
5500 assert(AN.Versions.size() > I &&
5501 "Alloc summary has fewer entries than other summaries in function");
5502 if (AN.Versions.size() <= I ||
5503 (AllocationType)AN.Versions[I] == AllocationType::None)
5504 continue;
5505    Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5506 }
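  // Note that we truncate the 160-bit SHA1 digest to its first 8 bytes, which
  // serves as the key used for duplicate clone detection.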
5507  return support::endian::read64le(Hasher.result().data());
5508}
5509
5510static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5511 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5512 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5513 &FuncToAliasMap,
5514 FunctionSummary *FS) {
5515 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5516 // We might have created this when adjusting callsite in another
5517 // function. It should be a declaration.
5518 assert(DeclGV->isDeclaration());
5519    NewGV->takeName(DeclGV);
5520    DeclGV->replaceAllUsesWith(NewGV);
5521 DeclGV->eraseFromParent();
5522 };
5523
5524 // Handle aliases to this function, and create analogous alias clones to the
5525 // provided clone of this function.
5526 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5527    if (!FuncToAliasMap.count(&F))
5528      return;
5529    for (auto *A : FuncToAliasMap[&F]) {
5530      std::string AliasName = getMemProfFuncName(A->getName(), I);
5531      auto *PrevA = M.getNamedAlias(AliasName);
5532      auto *NewA = GlobalAlias::create(A->getValueType(),
5533                                       A->getType()->getPointerAddressSpace(),
5534                                       A->getLinkage(), AliasName, NewF);
5535      NewA->copyAttributesFrom(A);
5536 if (PrevA)
5537 TakeDeclNameAndReplace(PrevA, NewA);
5538 }
5539 };
5540
5541 // The first "clone" is the original copy, we should only call this if we
5542 // needed to create new clones.
5543 assert(NumClones > 1);
5544 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5545  VMaps.reserve(NumClones - 1);
5546 FunctionsClonedThinBackend++;
5547
5548 // Map of hash of callsite/alloc versions to the instantiated function clone
5549 // (possibly the original) implementing those calls. Used to avoid
5550 // instantiating duplicate function clones.
5551 // FIXME: Ideally the thin link would not generate such duplicate clones to
5552 // start with, but right now it happens due to phase ordering in the function
5553  // assignment and the new clones that it may produce. We simply make each
5554 // duplicate an alias to the matching instantiated clone recorded in the map
5555 // (except for available_externally which are made declarations as they would
5556 // be aliases in the prevailing module, and available_externally aliases are
5557 // not well supported right now).
5558 DenseMap<uint64_t, Function *> HashToFunc;
5559
5560 // Save the hash of the original function version.
5561  HashToFunc[ComputeHash(FS, 0)] = &F;
5562
5563 for (unsigned I = 1; I < NumClones; I++) {
5564    VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5565    std::string Name = getMemProfFuncName(F.getName(), I);
5566 auto Hash = ComputeHash(FS, I);
5567 // If this clone would duplicate a previously seen clone, don't generate the
5568 // duplicate clone body, just make an alias to satisfy any (potentially
5569 // cross-module) references.
5570    if (HashToFunc.contains(Hash)) {
5571 FunctionCloneDuplicatesThinBackend++;
5572 auto *Func = HashToFunc[Hash];
5573 if (Func->hasAvailableExternallyLinkage()) {
5574 // Skip these as EliminateAvailableExternallyPass does not handle
5575 // available_externally aliases correctly and we end up with an
5576 // available_externally alias to a declaration. Just create a
5577 // declaration for now as we know we will have a definition in another
5578 // module.
5579        auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5580        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5581 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5582 continue;
5583 }
5584 auto *PrevF = M.getFunction(Name);
5585      auto *Alias = GlobalAlias::create(Name, Func);
5586      if (PrevF)
5587        TakeDeclNameAndReplace(PrevF, Alias);
5588      ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5589 << "created clone alias " << ore::NV("Alias", Alias));
5590
5591 // Now handle aliases to this function, and clone those as well.
5592 CloneFuncAliases(Func, I);
5593 continue;
5594 }
5595    auto *NewF = CloneFunction(&F, *VMaps.back());
5596 HashToFunc[Hash] = NewF;
5597 FunctionClonesThinBackend++;
5598 // Strip memprof and callsite metadata from clone as they are no longer
5599 // needed.
5600 for (auto &BB : *NewF) {
5601 for (auto &Inst : BB) {
5602        Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5603        Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5604 }
5605 }
5606 auto *PrevF = M.getFunction(Name);
5607 if (PrevF)
5608 TakeDeclNameAndReplace(PrevF, NewF);
5609 else
5610 NewF->setName(Name);
5611    updateSubprogramLinkageName(NewF, Name);
5612    ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5613 << "created clone " << ore::NV("NewFunction", NewF));
5614
5615 // Now handle aliases to this function, and clone those as well.
5616 CloneFuncAliases(NewF, I);
5617 }
5618 return VMaps;
5619}
5620
5621// Locate the summary for F. This is complicated by the fact that it might
5622// have been internalized or promoted.
5623static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5624 const ModuleSummaryIndex *ImportSummary,
5625 const Function *CallingFunc = nullptr) {
5626 // FIXME: Ideally we would retain the original GUID in some fashion on the
5627 // function (e.g. as metadata), but for now do our best to locate the
5628 // summary without that information.
5629  ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5630  if (!TheFnVI)
5631    // See if the function was internalized, by checking index directly with
5632    // original name (this avoids the name adjustment done by getGUID() for
5633    // internal symbols).
5634    TheFnVI = ImportSummary->getValueInfo(
5635        GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
5636 if (TheFnVI)
5637 return TheFnVI;
5638 // Now query with the original name before any promotion was performed.
5639 StringRef OrigName =
5640      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5641 // When this pass is enabled, we always add thinlto_src_file provenance
5642 // metadata to imported function definitions, which allows us to recreate the
5643 // original internal symbol's GUID.
5644  auto SrcFileMD = F.getMetadata("thinlto_src_file");
5645 // If this is a call to an imported/promoted local for which we didn't import
5646 // the definition, the metadata will not exist on the declaration. However,
5647 // since we are doing this early, before any inlining in the LTO backend, we
5648 // can simply look at the metadata on the calling function which must have
5649 // been from the same module if F was an internal symbol originally.
5650 if (!SrcFileMD && F.isDeclaration()) {
5651 // We would only call this for a declaration for a direct callsite, in which
5652 // case the caller would have provided the calling function pointer.
5653 assert(CallingFunc);
5654    SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5655 // If this is a promoted local (OrigName != F.getName()), since this is a
5656 // declaration, it must be imported from a different module and therefore we
5657 // should always find the metadata on its calling function. Any call to a
5658 // promoted local that came from this module should still be a definition.
5659 assert(SrcFileMD || OrigName == F.getName());
5660 }
5661 StringRef SrcFile = M.getSourceFileName();
5662 if (SrcFileMD)
5663    SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5664  std::string OrigId = GlobalValue::getGlobalIdentifier(
5665      OrigName, GlobalValue::InternalLinkage, SrcFile);
5666  TheFnVI = ImportSummary->getValueInfo(
5667      GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5668 // Internal func in original module may have gotten a numbered suffix if we
5669 // imported an external function with the same name. This happens
5670 // automatically during IR linking for naming conflicts. It would have to
5671 // still be internal in that case (otherwise it would have been renamed on
5672 // promotion in which case we wouldn't have a naming conflict).
5673 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5674      F.getName().contains('.')) {
5675    OrigName = F.getName().rsplit('.').first;
5676    OrigId = GlobalValue::getGlobalIdentifier(
5677        OrigName, GlobalValue::InternalLinkage, SrcFile);
5678    TheFnVI = ImportSummary->getValueInfo(
5679        GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5680 }
5681 // The only way we may not have a VI is if this is a declaration created for
5682 // an imported reference. For distributed ThinLTO we may not have a VI for
5683 // such declarations in the distributed summary.
5684 assert(TheFnVI || F.isDeclaration());
5685 return TheFnVI;
5686}
5687
5688bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5689 Module &M) {
5690 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5691 Symtab = std::make_unique<InstrProfSymtab>();
5692  // Don't add canonical names, to avoid adding multiple functions to the
5693  // symtab when they share the same root name with "." suffixes stripped.
5694 // If we pick the wrong one then this could lead to incorrect ICP and calling
5695 // a memprof clone that we don't actually create (resulting in linker unsats).
5696 // What this means is that the GUID of the function (or its PGOFuncName
5697 // metadata) *must* match that in the VP metadata to allow promotion.
5698 // In practice this should not be a limitation, since local functions should
5699 // have PGOFuncName metadata and global function names shouldn't need any
5700 // special handling (they should not get the ".llvm.*" suffix that the
5701 // canonicalization handling is attempting to strip).
5702 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5703    std::string SymtabFailure = toString(std::move(E));
5704    M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5705 return false;
5706 }
5707 return true;
5708}
5709
5710#ifndef NDEBUG
5711// Sanity check that the MIB stack ids match between the summary and
5712// instruction metadata.
5713static void checkAllocContextIds(
5714 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5715 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5716 const ModuleSummaryIndex *ImportSummary) {
5717 auto MIBIter = AllocNode.MIBs.begin();
5718 for (auto &MDOp : MemProfMD->operands()) {
5719 assert(MIBIter != AllocNode.MIBs.end());
5720 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5721 auto *MIBMD = cast<const MDNode>(MDOp);
5722 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5723 assert(StackMDNode);
5724 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5725 auto ContextIterBegin =
5726 StackContext.beginAfterSharedPrefix(CallsiteContext);
5727 // Skip the checking on the first iteration.
5728 uint64_t LastStackContextId =
5729 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5730 : 0;
5731 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5732 ++ContextIter) {
5733 // If this is a direct recursion, simply skip the duplicate
5734 // entries, to be consistent with how the summary ids were
5735 // generated during ModuleSummaryAnalysis.
5736 if (LastStackContextId == *ContextIter)
5737 continue;
5738 LastStackContextId = *ContextIter;
5739 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5740 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5741 *ContextIter);
5742 StackIdIndexIter++;
5743 }
5744 MIBIter++;
5745 }
5746}
5747#endif
5748
5749bool MemProfContextDisambiguation::applyImport(Module &M) {
5750 assert(ImportSummary);
5751 bool Changed = false;
5752
5753 // We also need to clone any aliases that reference cloned functions, because
5754 // the modified callsites may invoke via the alias. Keep track of the aliases
5755 // for each function.
5756 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5757 FuncToAliasMap;
5758 for (auto &A : M.aliases()) {
5759 auto *Aliasee = A.getAliaseeObject();
5760    if (auto *F = dyn_cast<Function>(Aliasee))
5761      FuncToAliasMap[F].insert(&A);
5762 }
5763
5764 if (!initializeIndirectCallPromotionInfo(M))
5765 return false;
5766
5767 for (auto &F : M) {
5768 if (F.isDeclaration() || isMemProfClone(F))
5769 continue;
5770
5771 OptimizationRemarkEmitter ORE(&F);
5772
5773 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5774 bool ClonesCreated = false;
5775 unsigned NumClonesCreated = 0;
5776 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5777 // We should at least have version 0 which is the original copy.
5778 assert(NumClones > 0);
5779 // If only one copy needed use original.
5780 if (NumClones == 1)
5781 return;
5782 // If we already performed cloning of this function, confirm that the
5783 // requested number of clones matches (the thin link should ensure the
5784 // number of clones for each constituent callsite is consistent within
5785 // each function), before returning.
5786 if (ClonesCreated) {
5787 assert(NumClonesCreated == NumClones);
5788 return;
5789 }
5790 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5791 // The first "clone" is the original copy, which doesn't have a VMap.
5792 assert(VMaps.size() == NumClones - 1);
5793 Changed = true;
5794 ClonesCreated = true;
5795 NumClonesCreated = NumClones;
5796 };
5797
5798 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5799 Function *CalledFunction, FunctionSummary *FS) {
5800 // Perform cloning if not yet done.
5801 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5802
5803 assert(!isMemProfClone(*CalledFunction));
5804
5805 // Because we update the cloned calls by calling setCalledOperand (see
5806 // comment below), out of an abundance of caution make sure the called
5807 // function was actually the called operand (or its aliasee). We also
5808 // strip pointer casts when looking for calls (to match behavior during
5809 // summary generation), however, with opaque pointers in theory this
5810 // should not be an issue. Note we still clone the current function
5811 // (containing this call) above, as that could be needed for its callers.
5812      auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5813 if (CalledFunction != CB->getCalledOperand() &&
5814 (!GA || CalledFunction != GA->getAliaseeObject())) {
5815 SkippedCallsCloning++;
5816 return;
5817 }
5818 // Update the calls per the summary info.
5819 // Save orig name since it gets updated in the first iteration
5820 // below.
5821 auto CalleeOrigName = CalledFunction->getName();
5822 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5823 // If the VMap is empty, this clone was a duplicate of another and was
5824 // created as an alias or a declaration.
5825 if (J > 0 && VMaps[J - 1]->empty())
5826 continue;
5827 // Do nothing if this version calls the original version of its
5828 // callee.
5829 if (!StackNode.Clones[J])
5830 continue;
5831 auto NewF = M.getOrInsertFunction(
5832            getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5833            CalledFunction->getFunctionType());
5834 CallBase *CBClone;
5835 // Copy 0 is the original function.
5836 if (!J)
5837 CBClone = CB;
5838 else
5839          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5840 // Set the called operand directly instead of calling setCalledFunction,
5841 // as the latter mutates the function type on the call. In rare cases
5842 // we may have a slightly different type on a callee function
5843 // declaration due to it being imported from a different module with
5844 // incomplete types. We really just want to change the name of the
5845 // function to the clone, and not make any type changes.
5846 CBClone->setCalledOperand(NewF.getCallee());
5847        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5848 << ore::NV("Call", CBClone) << " in clone "
5849 << ore::NV("Caller", CBClone->getFunction())
5850 << " assigned to call function clone "
5851 << ore::NV("Callee", NewF.getCallee()));
5852 }
5853 };
5854
5855 // Locate the summary for F.
5856 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5857 // If not found, this could be an imported local (see comment in
5858 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5859 // module (where it would have been promoted to global scope so should
5860 // satisfy any reference in this module).
5861 if (!TheFnVI)
5862 continue;
5863
5864 auto *GVSummary =
5865        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5866    if (!GVSummary) {
5867      // Must have been imported, use the summary which matches the definition
5868      // (might be multiple if this was a linkonce_odr).
5869      auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5870      assert(SrcModuleMD &&
5871             "enable-import-metadata is needed to emit thinlto_src_module");
5872      StringRef SrcModule =
5873          dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5874 for (auto &GVS : TheFnVI.getSummaryList()) {
5875 if (GVS->modulePath() == SrcModule) {
5876 GVSummary = GVS.get();
5877 break;
5878 }
5879 }
5880 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5881 }
5882
5883 // If this was an imported alias skip it as we won't have the function
5884 // summary, and it should be cloned in the original module.
5885    if (isa<AliasSummary>(GVSummary))
5886      continue;
5887
5888    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5889
5890 if (FS->allocs().empty() && FS->callsites().empty())
5891 continue;
5892
5893 auto SI = FS->callsites().begin();
5894 auto AI = FS->allocs().begin();
5895
5896 // To handle callsite infos synthesized for tail calls which have missing
5897 // frames in the profiled context, map callee VI to the synthesized callsite
5898 // info.
5899 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5900 // Iterate the callsites for this function in reverse, since we place all
5901 // those synthesized for tail calls at the end.
5902 for (auto CallsiteIt = FS->callsites().rbegin();
5903 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5904 auto &Callsite = *CallsiteIt;
5905 // Stop as soon as we see a non-synthesized callsite info (see comment
5906 // above loop). All the entries added for discovered tail calls have empty
5907 // stack ids.
5908 if (!Callsite.StackIdIndices.empty())
5909 break;
5910      MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5911 }
5912
5913 // Keeps track of needed ICP for the function.
5914 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5915
5916 // Assume for now that the instructions are in the exact same order
5917 // as when the summary was created, but confirm this is correct by
5918 // matching the stack ids.
5919 for (auto &BB : F) {
5920 for (auto &I : BB) {
5921        auto *CB = dyn_cast<CallBase>(&I);
5922 // Same handling as when creating module summary.
5923 if (!mayHaveMemprofSummary(CB))
5924 continue;
5925
5926 auto *CalledValue = CB->getCalledOperand();
5927 auto *CalledFunction = CB->getCalledFunction();
5928 if (CalledValue && !CalledFunction) {
5929 CalledValue = CalledValue->stripPointerCasts();
5930 // Stripping pointer casts can reveal a called function.
5931          CalledFunction = dyn_cast<Function>(CalledValue);
5932 }
5933 // Check if this is an alias to a function. If so, get the
5934 // called aliasee for the checks below.
5935        if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5936          assert(!CalledFunction &&
5937                 "Expected null called function in callsite for alias");
5938          CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5939 }
5940
5941 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5942            I.getMetadata(LLVMContext::MD_callsite));
5943        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5944
5945 // Include allocs that were already assigned a memprof function
5946 // attribute in the statistics. Only do this for those that do not have
5947 // memprof metadata, since we add an "ambiguous" memprof attribute by
5948 // default.
5949        if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5950          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5951 ? AllocTypeColdThinBackend++
5952 : AllocTypeNotColdThinBackend++;
5953 OrigAllocsThinBackend++;
5954 AllocVersionsThinBackend++;
5955 if (!MaxAllocVersionsThinBackend)
5956 MaxAllocVersionsThinBackend = 1;
5957 continue;
5958 }
5959
5960 if (MemProfMD) {
5961 // Consult the next alloc node.
5962 assert(AI != FS->allocs().end());
5963 auto &AllocNode = *(AI++);
5964
5965#ifndef NDEBUG
5966 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5967 ImportSummary);
5968#endif
5969
5970 // Perform cloning if not yet done.
5971 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5972
5973 OrigAllocsThinBackend++;
5974 AllocVersionsThinBackend += AllocNode.Versions.size();
5975 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5976 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5977
5978 // If there is only one version that means we didn't end up
5979 // considering this function for cloning, and in that case the alloc
5980 // will still be none type or should have gotten the default NotCold.
5981 // Skip that after calling clone helper since that does some sanity
5982 // checks that confirm we haven't decided yet that we need cloning.
5983 // We might have a single version that is cold due to the
5984 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
5985 // case.
5986 if (AllocNode.Versions.size() == 1 &&
5987 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5988 assert((AllocationType)AllocNode.Versions[0] ==
5989 AllocationType::NotCold ||
5990 (AllocationType)AllocNode.Versions[0] ==
5991 AllocationType::None);
5992 UnclonableAllocsThinBackend++;
5993 continue;
5994 }
5995
5996 // All versions should have a singular allocation type.
5997 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5998 return Type == ((uint8_t)AllocationType::NotCold |
5999 (uint8_t)AllocationType::Cold);
6000 }));
6001
6002 // Update the allocation types per the summary info.
6003 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
6004 // If the VMap is empty, this clone was a duplicate of another and
6005 // was created as an alias or a declaration.
6006 if (J > 0 && VMaps[J - 1]->empty())
6007 continue;
6008 // Ignore any that didn't get an assigned allocation type.
6009 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
6010 continue;
6011 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
6012 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
6013 : AllocTypeNotColdThinBackend++;
6014            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
6015            auto A = llvm::Attribute::get(F.getContext(), "memprof",
6016                                          AllocTypeString);
6017 CallBase *CBClone;
6018 // Copy 0 is the original function.
6019 if (!J)
6020 CBClone = CB;
6021 else
6022 // Since VMaps are only created for new clones, we index with
6023 // clone J-1 (J==0 is the original clone and does not have a VMaps
6024 // entry).
6025              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6026            removeAnyExistingAmbiguousAttribute(CBClone);
6027            CBClone->addFnAttr(A);
6028            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
6029 << ore::NV("AllocationCall", CBClone) << " in clone "
6030 << ore::NV("Caller", CBClone->getFunction())
6031 << " marked with memprof allocation attribute "
6032 << ore::NV("Attribute", AllocTypeString));
6033 }
6034 } else if (!CallsiteContext.empty()) {
6035 if (!CalledFunction) {
6036#ifndef NDEBUG
6037 // We should have skipped inline assembly calls.
6038 auto *CI = dyn_cast<CallInst>(CB);
6039 assert(!CI || !CI->isInlineAsm());
6040#endif
6041 // We should have skipped direct calls via a Constant.
6042 assert(CalledValue && !isa<Constant>(CalledValue));
6043
6044 // This is an indirect call, see if we have profile information and
6045 // whether any clones were recorded for the profiled targets (that
6046 // we synthesized CallsiteInfo summary records for when building the
6047 // index).
6048 auto NumClones =
6049              recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
6050
6051 // Perform cloning if not yet done. This is done here in case
6052 // we don't need to do ICP, but might need to clone this
6053 // function as it is the target of other cloned calls.
6054 if (NumClones)
6055 CloneFuncIfNeeded(NumClones, FS);
6056 }
6057
6058 else {
6059 // Consult the next callsite node.
6060 assert(SI != FS->callsites().end());
6061 auto &StackNode = *(SI++);
6062
6063#ifndef NDEBUG
6064 // Sanity check that the stack ids match between the summary and
6065 // instruction metadata.
6066 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6067 for (auto StackId : CallsiteContext) {
6068 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6069 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6070 StackId);
6071 StackIdIndexIter++;
6072 }
6073#endif
6074
6075 CloneCallsite(StackNode, CB, CalledFunction, FS);
6076 }
6077 } else if (CB->isTailCall() && CalledFunction) {
6078 // Locate the synthesized callsite info for the callee VI, if any was
6079 // created, and use that for cloning.
6080 ValueInfo CalleeVI =
6081              findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
6082          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
6083            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
6084 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6085 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6086 }
6087 }
6088 }
6089 }
6090
6091 // Now do any promotion required for cloning.
6092    performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6093 }
6094
6095 // We skip some of the functions and instructions above, so remove all the
6096 // metadata in a single sweep here.
6097 for (auto &F : M) {
6098 // We can skip memprof clones because createFunctionClones already strips
6099 // the metadata from the newly created clones.
6100 if (F.isDeclaration() || isMemProfClone(F))
6101 continue;
6102 for (auto &BB : F) {
6103 for (auto &I : BB) {
6104        if (!isa<CallBase>(I))
6105          continue;
6106        I.setMetadata(LLVMContext::MD_memprof, nullptr);
6107        I.setMetadata(LLVMContext::MD_callsite, nullptr);
6108 }
6109 }
6110 }
6111
6112 return Changed;
6113}
6114
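// Check whether this indirect call has value profile information, and if so,
// walk the CallsiteInfo summary records synthesized for its profiled targets
// (advancing SI past them). Records the candidates for later promotion if any
// clone of the callsite should call a memprof clone of a target, and returns
// the number of callsite clones (0 when there is no profile data).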
6115unsigned MemProfContextDisambiguation::recordICPInfo(
6116 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6117 ArrayRef<CallsiteInfo>::iterator &SI,
6118 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6119 // First see if we have profile information for this indirect call.
6120 uint32_t NumCandidates;
6121 uint64_t TotalCount;
6122 auto CandidateProfileData =
6123 ICallAnalysis->getPromotionCandidatesForInstruction(
6124          CB, TotalCount, NumCandidates, MaxSummaryIndirectEdges);
6125 if (CandidateProfileData.empty())
6126 return 0;
6127
6128 // Iterate through all of the candidate profiled targets along with the
6129 // CallsiteInfo summary records synthesized for them when building the index,
6130 // and see if any are cloned and/or refer to clones.
6131 bool ICPNeeded = false;
6132 unsigned NumClones = 0;
6133  size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
6134 for (const auto &Candidate : CandidateProfileData) {
6135#ifndef NDEBUG
6136 auto CalleeValueInfo =
6137#endif
6138      ImportSummary->getValueInfo(Candidate.Value);
6139 // We might not have a ValueInfo if this is a distributed
6140 // ThinLTO backend and decided not to import that function.
6141 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6142 assert(SI != AllCallsites.end());
6143 auto &StackNode = *(SI++);
6144 // See if any of the clones of the indirect callsite for this
6145 // profiled target should call a cloned version of the profiled
6146 // target. We only need to do the ICP here if so.
6147    ICPNeeded |= llvm::any_of(StackNode.Clones,
6148                              [](unsigned CloneNo) { return CloneNo != 0; });
6149 // Every callsite in the same function should have been cloned the same
6150 // number of times.
6151 assert(!NumClones || NumClones == StackNode.Clones.size());
6152 NumClones = StackNode.Clones.size();
6153 }
6154 if (!ICPNeeded)
6155 return NumClones;
6156 // Save information for ICP, which is performed later to avoid messing up the
6157 // current function traversal.
6158  ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
6159                               TotalCount, CallsiteInfoStartIndex});
6160 return NumClones;
6161}
6162
6163void MemProfContextDisambiguation::performICP(
6164 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6165 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6166 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6167 OptimizationRemarkEmitter &ORE) {
6168 // Now do any promotion required for cloning. Specifically, for each
6169 // recorded ICP candidate (which was only recorded because one clone of that
6170 // candidate should call a cloned target), we perform ICP (speculative
6171 // devirtualization) for each clone of the callsite, and update its callee
6172 // to the appropriate clone. Note that the ICP compares against the original
6173 // version of the target, which is what is in the vtable.
6174 for (auto &Info : ICallAnalysisInfo) {
6175 auto *CB = Info.CB;
6176 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6177 auto TotalCount = Info.TotalCount;
6178 unsigned NumPromoted = 0;
6179 unsigned NumClones = 0;
6180
6181 for (auto &Candidate : Info.CandidateProfileData) {
6182 auto &StackNode = AllCallsites[CallsiteIndex++];
6183
6184 // All calls in the same function must have the same number of clones.
6185 assert(!NumClones || NumClones == StackNode.Clones.size());
6186 NumClones = StackNode.Clones.size();
6187
6188 // See if the target is in the module. If it wasn't imported, it is
6189 // possible that this profile could have been collected on a different
6190 // target (or version of the code), and we need to be conservative
6191 // (similar to what is done in the ICP pass).
6192      Function *TargetFunction = Symtab->getFunction(Candidate.Value);
6193 if (TargetFunction == nullptr ||
6194 // Any ThinLTO global dead symbol removal should have already
6195 // occurred, so it should be safe to promote when the target is a
6196 // declaration.
6197 // TODO: Remove internal option once more fully tested.
6198 (MemProfRequireDefinitionForPromotion &&
6199 TargetFunction->isDeclaration())) {
6200        ORE.emit([&]() {
6201 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6202 << "Memprof cannot promote indirect call: target with md5sum "
6203 << ore::NV("target md5sum", Candidate.Value) << " not found";
6204 });
6205 // FIXME: See if we can use the new declaration importing support to
6206 // at least get the declarations imported for this case. Hot indirect
6207 // targets should have been imported normally, however.
6208 continue;
6209 }
6210
6211 // Check if legal to promote
6212 const char *Reason = nullptr;
6213      if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
6214        ORE.emit([&]() {
6215 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6216 << "Memprof cannot promote indirect call to "
6217 << ore::NV("TargetFunction", TargetFunction)
6218 << " with count of " << ore::NV("TotalCount", TotalCount)
6219 << ": " << Reason;
6220 });
6221 continue;
6222 }
6223
6224 assert(!isMemProfClone(*TargetFunction));
6225
6226 // Handle each call clone, applying ICP so that each clone directly
6227 // calls the specified callee clone, guarded by the appropriate ICP
6228 // check.
6229 CallBase *CBClone = CB;
6230 for (unsigned J = 0; J < NumClones; J++) {
6231 // If the VMap is empty, this clone was a duplicate of another and was
6232 // created as an alias or a declaration.
6233 if (J > 0 && VMaps[J - 1]->empty())
6234 continue;
6235 // Copy 0 is the original function.
6236 if (J > 0)
6237          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6238 // We do the promotion using the original name, so that the comparison
6239 // is against the name in the vtable. Then just below, change the new
6240 // direct call to call the cloned function.
6241 auto &DirectCall =
6242            pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
6243                                     TotalCount, isSamplePGO, &ORE);
6244 auto *TargetToUse = TargetFunction;
6245 // Call original if this version calls the original version of its
6246 // callee.
6247 if (StackNode.Clones[J]) {
6248 TargetToUse =
6249              cast<Function>(M.getOrInsertFunction(
6250                                 getMemProfFuncName(TargetFunction->getName(),
6251                                                    StackNode.Clones[J]),
6252                                 TargetFunction->getFunctionType())
6253                     .getCallee());
6254 }
6255 DirectCall.setCalledFunction(TargetToUse);
6256 // During matching we generate synthetic VP metadata for indirect calls
6257 // not already having any, from the memprof profile's callee GUIDs. If
6258 // we subsequently promote and inline those callees, we currently lose
6259 // the ability to generate this synthetic VP metadata. Optionally apply
6260 // a noinline attribute to promoted direct calls, where the threshold is
6261 // set to capture synthetic VP metadata targets which get a count of 1.
6262 if (MemProfICPNoInlineThreshold &&
6263 Candidate.Count < MemProfICPNoInlineThreshold)
6264 DirectCall.setIsNoInline();
6265        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6266 << ore::NV("Call", CBClone) << " in clone "
6267 << ore::NV("Caller", CBClone->getFunction())
6268 << " promoted and assigned to call function clone "
6269 << ore::NV("Callee", TargetToUse));
6270 }
6271
6272 // Update TotalCount (all clones should get same count above)
6273 TotalCount -= Candidate.Count;
6274 NumPromoted++;
6275 }
6276 // Adjust the MD.prof metadata for all clones, now that we have the new
6277 // TotalCount and the number promoted.
6278 CallBase *CBClone = CB;
6279 for (unsigned J = 0; J < NumClones; J++) {
6280 // If the VMap is empty, this clone was a duplicate of another and was
6281 // created as an alias or a declaration.
6282 if (J > 0 && VMaps[J - 1]->empty())
6283 continue;
6284 // Copy 0 is the original function.
6285 if (J > 0)
6286        CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6287      // First delete the old one.
6288      CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6289 // If all promoted, we don't need the MD.prof metadata.
6290      // Otherwise we need to update it with the un-promoted records.
6291      if (TotalCount != 0)
6292        annotateValueSite(
6293            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6294            TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6295 }
6296 }
6297}
6298
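// Top-level driver for the graph-based cloning: optionally dump and verify the
// graph, identify the callsite/allocation clones needed to disambiguate cold
// contexts, and then assign those clones to function clones.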
6299template <typename DerivedCCG, typename FuncTy, typename CallTy>
6300bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6301 if (DumpCCG) {
6302 dbgs() << "CCG before cloning:\n";
6303 dbgs() << *this;
6304 }
6305 if (ExportToDot)
6306    exportToDot("postbuild");
6307
6308 if (VerifyCCG) {
6309 check();
6310 }
6311
6312 identifyClones();
6313
6314 if (VerifyCCG) {
6315 check();
6316 }
6317
6318 if (DumpCCG) {
6319 dbgs() << "CCG after cloning:\n";
6320 dbgs() << *this;
6321 }
6322 if (ExportToDot)
6323    exportToDot("cloned");
6324
6325 bool Changed = assignFunctions();
6326
6327 if (DumpCCG) {
6328 dbgs() << "CCG after assigning function clones:\n";
6329 dbgs() << *this;
6330 }
6331 if (ExportToDot)
6332    exportToDot("clonefuncassign");
6333
6334  if (MemProfReportHintedSizes)
6335    printTotalSizes(errs());
6336
6337 return Changed;
6338}
6339
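// Entry point for IR-level handling (regular LTO or ThinLTO backend): apply
// index-based cloning decisions if an import summary is present, otherwise
// build and process the module's callsite context graph directly.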
6340bool MemProfContextDisambiguation::processModule(
6341 Module &M,
6342 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6343
6344 // If we have an import summary, then the cloning decisions were made during
6345 // the thin link on the index. Apply them and return.
6346 if (ImportSummary)
6347 return applyImport(M);
6348
6349 // TODO: If/when other types of memprof cloning are enabled beyond just for
6350 // hot and cold, we will need to change this to individually control the
6351 // AllocationType passed to addStackNodesForMIB during CCG construction.
6352 // Note that we specifically check this after applying imports above, so that
6353 // the option isn't needed to be passed to distributed ThinLTO backend
6354 // clang processes, which won't necessarily have visibility into the linker
6355 // dependences. Instead the information is communicated from the LTO link to
6356 // the backends via the combined summary index.
6357 if (!SupportsHotColdNew)
6358 return false;
6359
6360 ModuleCallsiteContextGraph CCG(M, OREGetter);
6361 return CCG.process();
6362}
6363
6364MemProfContextDisambiguation::MemProfContextDisambiguation(
6365 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6366 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6367 // Check the dot graph printing options once here, to make sure we have valid
6368 // and expected combinations.
6369 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6370 llvm::report_fatal_error(
6371        "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6372 if (DotGraphScope == DotScope::Context &&
6373 !ContextIdForDot.getNumOccurrences())
6374 llvm::report_fatal_error(
6375        "-memprof-dot-scope=context requires -memprof-dot-context-id");
6376 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6377 ContextIdForDot.getNumOccurrences())
6378 llvm::report_fatal_error(
6379        "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6380 "-memprof-dot-context-id");
6381 if (ImportSummary) {
6382 // The MemProfImportSummary should only be used for testing ThinLTO
6383 // distributed backend handling via opt, in which case we don't have a
6384 // summary from the pass pipeline.
6385 assert(MemProfImportSummary.empty());
6386 return;
6387 }
6388 if (MemProfImportSummary.empty())
6389 return;
6390
6391 auto ReadSummaryFile =
6392      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6393  if (!ReadSummaryFile) {
6394    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6395                          "Error loading file '" + MemProfImportSummary +
6396                              "': ");
6397    return;
6398  }
6399  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6400  if (!ImportSummaryForTestingOrErr) {
6401    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6402                          "Error parsing file '" + MemProfImportSummary +
6403                              "': ");
6404 return;
6405 }
6406 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6407 ImportSummary = ImportSummaryForTesting.get();
6408}
6409
6410PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6411 ModuleAnalysisManager &AM) {
6412  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6413  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6414    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6415 };
6416 if (!processModule(M, OREGetter))
6417 return PreservedAnalyses::all();
6418 return PreservedAnalyses::none();
6419}
6420
6421void MemProfContextDisambiguation::run(
6422 ModuleSummaryIndex &Index,
6423 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6424 isPrevailing) {
6425 // TODO: If/when other types of memprof cloning are enabled beyond just for
6426 // hot and cold, we will need to change this to individually control the
6427 // AllocationType passed to addStackNodesForMIB during CCG construction.
6428 // The index was set from the option, so these should be in sync.
6429 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6430 if (!SupportsHotColdNew)
6431 return;
6432
6433 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6434 CCG.process();
6435}
6436
6437// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6438// when we don't have an index that has recorded that we are linking with
6439// allocation libraries containing the necessary APIs for downstream
6440// transformations.
6441PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) {
6442 // The profile matcher applies hotness attributes directly for allocations,
6443 // and those will cause us to generate calls to the hot/cold interfaces
6444 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6445 // link then assume we don't want these calls (e.g. not linking with
6446 // the appropriate library, or otherwise trying to disable this behavior).
6447 bool Changed = false;
6448 for (auto &F : M) {
6449 for (auto &BB : F) {
6450 for (auto &I : BB) {
6451        auto *CI = dyn_cast<CallBase>(&I);
6452        if (!CI)
6453          continue;
6454        if (CI->hasFnAttr("memprof")) {
6455          CI->removeFnAttr("memprof");
6456          Changed = true;
6457        }
6458        if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
6459 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6460 continue;
6461 }
6462 // Strip off all memprof metadata as it is no longer needed.
6463 // Importantly, this avoids the addition of new memprof attributes
6464 // after inlining propagation.
6465        CI->setMetadata(LLVMContext::MD_memprof, nullptr);
6466        CI->setMetadata(LLVMContext::MD_callsite, nullptr);
6467 Changed = true;
6468 }
6469 }
6470 }
6471 if (!Changed)
6472 return PreservedAnalyses::all();
6473 return PreservedAnalyses::none();
6474}
6475