1 | //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements support for context disambiguation of allocation |
10 | // calls for profile guided heap optimization. Specifically, it uses Memprof |
11 | // profiles which indicate context specific allocation behavior (currently |
12 | // distinguishing cold vs hot memory allocations). Cloning is performed to |
13 | // expose the cold allocation call contexts, and the allocation calls are |
14 | // subsequently annotated with an attribute for later transformation. |
15 | // |
16 | // The transformations can be performed either directly on IR (regular LTO), or |
17 | // on a ThinLTO index (and later applied to the IR during the ThinLTO backend). |
// Both types of LTO operate on the same base graph representation, which
19 | // uses CRTP to support either IR or Index formats. |
20 | // |
21 | //===----------------------------------------------------------------------===// |
22 | |
23 | #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" |
24 | #include "llvm/ADT/DenseMap.h" |
25 | #include "llvm/ADT/DenseSet.h" |
26 | #include "llvm/ADT/MapVector.h" |
27 | #include "llvm/ADT/SetOperations.h" |
28 | #include "llvm/ADT/SmallPtrSet.h" |
29 | #include "llvm/ADT/SmallSet.h" |
30 | #include "llvm/ADT/SmallVector.h" |
31 | #include "llvm/ADT/Statistic.h" |
32 | #include "llvm/Analysis/MemoryProfileInfo.h" |
33 | #include "llvm/Analysis/ModuleSummaryAnalysis.h" |
34 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
35 | #include "llvm/Bitcode/BitcodeReader.h" |
36 | #include "llvm/IR/Instructions.h" |
37 | #include "llvm/IR/Module.h" |
38 | #include "llvm/IR/ModuleSummaryIndex.h" |
39 | #include "llvm/Pass.h" |
40 | #include "llvm/Support/CommandLine.h" |
41 | #include "llvm/Support/GraphWriter.h" |
42 | #include "llvm/Support/InterleavedRange.h" |
43 | #include "llvm/Support/raw_ostream.h" |
44 | #include "llvm/Transforms/IPO.h" |
45 | #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
46 | #include "llvm/Transforms/Utils/Cloning.h" |
47 | #include "llvm/Transforms/Utils/Instrumentation.h" |
48 | #include <deque> |
49 | #include <sstream> |
50 | #include <unordered_map> |
51 | #include <vector> |
52 | using namespace llvm; |
53 | using namespace llvm::memprof; |
54 | |
55 | #define DEBUG_TYPE "memprof-context-disambiguation" |
56 | |
STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");
STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
STATISTIC(MissingAllocForContextId,
          "Number of missing alloc nodes for context ids");
98 | |
static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

// How much of the graph to export to dot.
enum DotScope {
  All,     // The full CCG graph.
  Alloc,   // Only contexts for the specified allocation.
  Context, // Only the specified context.
};

static cl::opt<DotScope> DotGraphScope(
    "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
    cl::Hidden, cl::init(DotScope::All),
    cl::values(
        clEnumValN(DotScope::All, "all", "Export full callsite graph"),
        clEnumValN(DotScope::Alloc, "alloc",
                   "Export only nodes with contexts feeding given "
                   "-memprof-dot-alloc-id"),
        clEnumValN(DotScope::Context, "context",
                   "Export only nodes with given -memprof-dot-context-id")));

static cl::opt<unsigned>
    AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
                  cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
                           "or to highlight if -memprof-dot-scope=all"));

static cl::opt<unsigned> ContextIdForDot(
    "memprof-dot-context-id", cl::init(0), cl::Hidden,
    cl::desc("Id of context to export if -memprof-dot-scope=context or to "
             "highlight otherwise"));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

// Optionally enable cloning of callsites involved with recursive cycles.
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

static cl::opt<bool> CloneRecursiveContexts(
    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));

// Generally this is needed for correct assignment of allocation clones to
// function clones; however, allow it to be disabled for debugging while the
// functionality is new and being tested more widely.
static cl::opt<bool>
    MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
                cl::desc("Merge clones before assigning functions"));

// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling requires a little bit of
// compile-time overhead and doesn't affect correctness; it will just inflate
// the cold hinted bytes reporting a bit when -memprof-report-hinted-sizes is
// enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts having recursive cycles"));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

static cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));
199 | } // namespace llvm |
200 | |
201 | extern cl::opt<bool> MemProfReportHintedSizes; |
202 | extern cl::opt<unsigned> MinClonedColdBytePercent; |
203 | |
204 | namespace { |
205 | /// CRTP base for graphs built from either IR or ThinLTO summary index. |
206 | /// |
207 | /// The graph represents the call contexts in all memprof metadata on allocation |
208 | /// calls, with nodes for the allocations themselves, as well as for the calls |
209 | /// in each context. The graph is initially built from the allocation memprof |
210 | /// metadata (or summary) MIBs. It is then updated to match calls with callsite |
211 | /// metadata onto the nodes, updating it to reflect any inlining performed on |
212 | /// those calls. |
213 | /// |
214 | /// Each MIB (representing an allocation's call context with allocation |
215 | /// behavior) is assigned a unique context id during the graph build. The edges |
216 | /// and nodes in the graph are decorated with the context ids they carry. This |
217 | /// is used to correctly update the graph when cloning is performed so that we |
218 | /// can uniquify the context for a single (possibly cloned) allocation. |
219 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
220 | class CallsiteContextGraph { |
221 | public: |
222 | CallsiteContextGraph() = default; |
223 | CallsiteContextGraph(const CallsiteContextGraph &) = default; |
224 | CallsiteContextGraph(CallsiteContextGraph &&) = default; |
225 | |
226 | /// Main entry point to perform analysis and transformations on graph. |
227 | bool process(); |
228 | |
229 | /// Perform cloning on the graph necessary to uniquely identify the allocation |
230 | /// behavior of an allocation based on its context. |
231 | void identifyClones(); |
232 | |
233 | /// Assign callsite clones to functions, cloning functions as needed to |
234 | /// accommodate the combinations of their callsite clones reached by callers. |
235 | /// For regular LTO this clones functions and callsites in the IR, but for |
236 | /// ThinLTO the cloning decisions are noted in the summaries and later applied |
237 | /// in applyImport. |
238 | bool assignFunctions(); |
239 | |
240 | void dump() const; |
241 | void print(raw_ostream &OS) const; |
242 | void printTotalSizes(raw_ostream &OS) const; |
243 | |
244 | friend raw_ostream &operator<<(raw_ostream &OS, |
245 | const CallsiteContextGraph &CCG) { |
246 | CCG.print(OS); |
247 | return OS; |
248 | } |
249 | |
250 | friend struct GraphTraits< |
251 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
252 | friend struct DOTGraphTraits< |
253 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
254 | |
255 | void exportToDot(std::string Label) const; |
256 | |
257 | /// Represents a function clone via FuncTy pointer and clone number pair. |
258 | struct FuncInfo final |
259 | : public std::pair<FuncTy *, unsigned /*Clone number*/> { |
260 | using Base = std::pair<FuncTy *, unsigned>; |
261 | FuncInfo(const Base &B) : Base(B) {} |
262 | FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} |
263 | explicit operator bool() const { return this->first != nullptr; } |
264 | FuncTy *func() const { return this->first; } |
265 | unsigned cloneNo() const { return this->second; } |
266 | }; |
267 | |
268 | /// Represents a callsite clone via CallTy and clone number pair. |
269 | struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> { |
270 | using Base = std::pair<CallTy, unsigned>; |
271 | CallInfo(const Base &B) : Base(B) {} |
272 | CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) |
273 | : Base(Call, CloneNo) {} |
274 | explicit operator bool() const { return (bool)this->first; } |
275 | CallTy call() const { return this->first; } |
276 | unsigned cloneNo() const { return this->second; } |
277 | void setCloneNo(unsigned N) { this->second = N; } |
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
291 | friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { |
292 | Call.print(OS); |
293 | return OS; |
294 | } |
295 | }; |
296 | |
297 | struct ContextEdge; |
298 | |
299 | /// Node in the Callsite Context Graph |
300 | struct ContextNode { |
    // Keep this for now: even in the IR case, where we have an Instruction *,
    // whether the call is an allocation is not as immediately discoverable.
    // Used for printing richer information when dumping the graph.
304 | bool IsAllocation; |
305 | |
306 | // Keeps track of when the Call was reset to null because there was |
307 | // recursion. |
308 | bool Recursive = false; |
309 | |
310 | // This will be formed by ORing together the AllocationType enum values |
311 | // for contexts including this node. |
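    // For example, a node reached by both cold and not-cold contexts will have
    // AllocTypes == ((uint8_t)AllocationType::NotCold |
    //                (uint8_t)AllocationType::Cold).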
312 | uint8_t AllocTypes = 0; |
313 | |
314 | // The corresponding allocation or interior call. This is the primary call |
315 | // for which we have created this node. |
316 | CallInfo Call; |
317 | |
318 | // List of other calls that can be treated the same as the primary call |
319 | // through cloning. I.e. located in the same function and have the same |
320 | // (possibly pruned) stack ids. They will be updated the same way as the |
321 | // primary call when assigning to function clones. |
322 | SmallVector<CallInfo, 0> MatchingCalls; |
323 | |
324 | // For alloc nodes this is a unique id assigned when constructed, and for |
325 | // callsite stack nodes it is the original stack id when the node is |
326 | // constructed from the memprof MIB metadata on the alloc nodes. Note that |
327 | // this is only used when matching callsite metadata onto the stack nodes |
328 | // created when processing the allocation memprof MIBs, and for labeling |
329 | // nodes in the dot graph. Therefore we don't bother to assign a value for |
330 | // clones. |
331 | uint64_t OrigStackOrAllocId = 0; |
332 | |
333 | // Edges to all callees in the profiled call stacks. |
334 | // TODO: Should this be a map (from Callee node) for more efficient lookup? |
335 | std::vector<std::shared_ptr<ContextEdge>> CalleeEdges; |
336 | |
337 | // Edges to all callers in the profiled call stacks. |
338 | // TODO: Should this be a map (from Caller node) for more efficient lookup? |
339 | std::vector<std::shared_ptr<ContextEdge>> CallerEdges; |
340 | |
    // Returns true if we need to look at the caller edges, in addition to the
    // callee edges, when determining the node's context ids and allocation
    // type.
343 | bool useCallerEdgesForContextInfo() const { |
344 | // Typically if the callee edges are empty either the caller edges are |
345 | // also empty, or this is an allocation (leaf node). However, if we are |
346 | // allowing recursive callsites and contexts this will be violated for |
347 | // incompletely cloned recursive cycles. |
348 | assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation || |
349 | (AllowRecursiveCallsites && AllowRecursiveContexts)); |
      // When cloning for a recursive context we might be in the midst of
      // cloning for a recurrence and have moved context ids off of a caller
      // edge onto the clone, but not yet off of the incoming caller (back)
      // edge. If we don't look at those we miss the fact that this node still
      // has context ids of interest.
355 | return IsAllocation || CloneRecursiveContexts; |
356 | } |
357 | |
358 | // Compute the context ids for this node from the union of its edge context |
359 | // ids. |
360 | DenseSet<uint32_t> getContextIds() const { |
361 | unsigned Count = 0; |
362 | // Compute the number of ids for reserve below. In general we only need to |
363 | // look at one set of edges, typically the callee edges, since other than |
364 | // allocations and in some cases during recursion cloning, all the context |
365 | // ids on the callers should also flow out via callee edges. |
366 | for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges) |
367 | Count += Edge->getContextIds().size(); |
368 | DenseSet<uint32_t> ContextIds; |
      ContextIds.reserve(Count);
370 | auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
371 | CalleeEdges, useCallerEdgesForContextInfo() |
372 | ? CallerEdges |
373 | : std::vector<std::shared_ptr<ContextEdge>>()); |
374 | for (const auto &Edge : Edges) |
375 | ContextIds.insert_range(Edge->getContextIds()); |
376 | return ContextIds; |
377 | } |
378 | |
379 | // Compute the allocation type for this node from the OR of its edge |
380 | // allocation types. |
381 | uint8_t computeAllocType() const { |
382 | uint8_t BothTypes = |
383 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
384 | uint8_t AllocType = (uint8_t)AllocationType::None; |
385 | auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
386 | CalleeEdges, useCallerEdgesForContextInfo() |
387 | ? CallerEdges |
388 | : std::vector<std::shared_ptr<ContextEdge>>()); |
389 | for (const auto &Edge : Edges) { |
390 | AllocType |= Edge->AllocTypes; |
391 | // Bail early if alloc type reached both, no further refinement. |
392 | if (AllocType == BothTypes) |
393 | return AllocType; |
394 | } |
395 | return AllocType; |
396 | } |
397 | |
398 | // The context ids set for this node is empty if its edge context ids are |
399 | // also all empty. |
400 | bool emptyContextIds() const { |
401 | auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
402 | CalleeEdges, useCallerEdgesForContextInfo() |
403 | ? CallerEdges |
404 | : std::vector<std::shared_ptr<ContextEdge>>()); |
405 | for (const auto &Edge : Edges) { |
406 | if (!Edge->getContextIds().empty()) |
407 | return false; |
408 | } |
409 | return true; |
410 | } |
411 | |
412 | // List of clones of this ContextNode, initially empty. |
413 | std::vector<ContextNode *> Clones; |
414 | |
415 | // If a clone, points to the original uncloned node. |
416 | ContextNode *CloneOf = nullptr; |
417 | |
418 | ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} |
419 | |
420 | ContextNode(bool IsAllocation, CallInfo C) |
421 | : IsAllocation(IsAllocation), Call(C) {} |
422 | |
423 | void addClone(ContextNode *Clone) { |
424 | if (CloneOf) { |
425 | CloneOf->Clones.push_back(Clone); |
426 | Clone->CloneOf = CloneOf; |
427 | } else { |
428 | Clones.push_back(Clone); |
429 | assert(!Clone->CloneOf); |
430 | Clone->CloneOf = this; |
431 | } |
432 | } |
433 | |
434 | ContextNode *getOrigNode() { |
435 | if (!CloneOf) |
436 | return this; |
437 | return CloneOf; |
438 | } |
439 | |
440 | void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
441 | unsigned int ContextId); |
442 | |
443 | ContextEdge *findEdgeFromCallee(const ContextNode *Callee); |
444 | ContextEdge *findEdgeFromCaller(const ContextNode *Caller); |
445 | void eraseCalleeEdge(const ContextEdge *Edge); |
446 | void eraseCallerEdge(const ContextEdge *Edge); |
447 | |
448 | void setCall(CallInfo C) { Call = C; } |
449 | |
450 | bool hasCall() const { return (bool)Call.call(); } |
451 | |
452 | void printCall(raw_ostream &OS) const { Call.print(OS); } |
453 | |
454 | // True if this node was effectively removed from the graph, in which case |
455 | // it should have an allocation type of None and empty context ids. |
456 | bool isRemoved() const { |
457 | // Typically if the callee edges are empty either the caller edges are |
458 | // also empty, or this is an allocation (leaf node). However, if we are |
459 | // allowing recursive callsites and contexts this will be violated for |
460 | // incompletely cloned recursive cycles. |
461 | assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
462 | (AllocTypes == (uint8_t)AllocationType::None) == |
463 | emptyContextIds()); |
464 | return AllocTypes == (uint8_t)AllocationType::None; |
465 | } |
466 | |
467 | void dump() const; |
468 | void print(raw_ostream &OS) const; |
469 | |
470 | friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { |
471 | Node.print(OS); |
472 | return OS; |
473 | } |
474 | }; |
475 | |
476 | /// Edge in the Callsite Context Graph from a ContextNode N to a caller or |
477 | /// callee. |
478 | struct ContextEdge { |
479 | ContextNode *Callee; |
480 | ContextNode *Caller; |
481 | |
482 | // This will be formed by ORing together the AllocationType enum values |
483 | // for contexts including this edge. |
484 | uint8_t AllocTypes = 0; |
485 | |
486 | // Set just before initiating cloning when cloning of recursive contexts is |
487 | // enabled. Used to defer cloning of backedges until we have done cloning of |
488 | // the callee node for non-backedge caller edges. This exposes cloning |
489 | // opportunities through the backedge of the cycle. |
490 | // TODO: Note that this is not updated during cloning, and it is unclear |
491 | // whether that would be needed. |
492 | bool IsBackedge = false; |
493 | |
494 | // The set of IDs for contexts including this edge. |
495 | DenseSet<uint32_t> ContextIds; |
496 | |
497 | ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, |
498 | DenseSet<uint32_t> ContextIds) |
499 | : Callee(Callee), Caller(Caller), AllocTypes(AllocType), |
500 | ContextIds(std::move(ContextIds)) {} |
501 | |
502 | DenseSet<uint32_t> &getContextIds() { return ContextIds; } |
503 | |
504 | // Helper to clear the fields of this edge when we are removing it from the |
505 | // graph. |
506 | inline void clear() { |
507 | ContextIds.clear(); |
508 | AllocTypes = (uint8_t)AllocationType::None; |
509 | Caller = nullptr; |
510 | Callee = nullptr; |
511 | } |
512 | |
513 | // Check if edge was removed from the graph. This is useful while iterating |
514 | // over a copy of edge lists when performing operations that mutate the |
515 | // graph in ways that might remove one of the edges. |
516 | inline bool isRemoved() const { |
517 | if (Callee || Caller) |
518 | return false; |
519 | // Any edges that have been removed from the graph but are still in a |
520 | // shared_ptr somewhere should have all fields null'ed out by clear() |
521 | // above. |
522 | assert(AllocTypes == (uint8_t)AllocationType::None); |
523 | assert(ContextIds.empty()); |
524 | return true; |
525 | } |
526 | |
527 | void dump() const; |
528 | void print(raw_ostream &OS) const; |
529 | |
530 | friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { |
531 | Edge.print(OS); |
532 | return OS; |
533 | } |
534 | }; |
535 | |
536 | /// Helpers to remove edges that have allocation type None (due to not |
537 | /// carrying any context ids) after transformations. |
538 | void removeNoneTypeCalleeEdges(ContextNode *Node); |
539 | void removeNoneTypeCallerEdges(ContextNode *Node); |
540 | void |
541 | recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node, |
542 | DenseSet<const ContextNode *> &Visited); |
543 | |
544 | protected: |
545 | /// Get a list of nodes corresponding to the stack ids in the given callsite |
546 | /// context. |
547 | template <class NodeT, class IteratorT> |
548 | std::vector<uint64_t> |
549 | getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext); |
550 | |
551 | /// Adds nodes for the given allocation and any stack ids on its memprof MIB |
552 | /// metadata (or summary). |
553 | ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); |
554 | |
555 | /// Adds nodes for the given MIB stack ids. |
556 | template <class NodeT, class IteratorT> |
557 | void addStackNodesForMIB(ContextNode *AllocNode, |
558 | CallStack<NodeT, IteratorT> &StackContext, |
559 | CallStack<NodeT, IteratorT> &CallsiteContext, |
560 | AllocationType AllocType, |
561 | ArrayRef<ContextTotalSize> ContextSizeInfo); |
562 | |
563 | /// Matches all callsite metadata (or summary) to the nodes created for |
564 | /// allocation memprof MIB metadata, synthesizing new nodes to reflect any |
565 | /// inlining performed on those callsite instructions. |
566 | void updateStackNodes(); |
567 | |
568 | /// Update graph to conservatively handle any callsite stack nodes that target |
569 | /// multiple different callee target functions. |
570 | void handleCallsitesWithMultipleTargets(); |
571 | |
572 | /// Mark backedges via the standard DFS based backedge algorithm. |
573 | void markBackedges(); |
574 | |
575 | /// Merge clones generated during cloning for different allocations but that |
576 | /// are called by the same caller node, to ensure proper function assignment. |
577 | void mergeClones(); |
578 | |
579 | // Try to partition calls on the given node (already placed into the AllCalls |
580 | // array) by callee function, creating new copies of Node as needed to hold |
581 | // calls with different callees, and moving the callee edges appropriately. |
582 | // Returns true if partitioning was successful. |
583 | bool partitionCallsByCallee( |
584 | ContextNode *Node, ArrayRef<CallInfo> AllCalls, |
585 | std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode); |
586 | |
587 | /// Save lists of calls with MemProf metadata in each function, for faster |
588 | /// iteration. |
589 | MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata; |
590 | |
591 | /// Map from callsite node to the enclosing caller function. |
592 | std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc; |
593 | |
594 | // When exporting to dot, and an allocation id is specified, contains the |
595 | // context ids on that allocation. |
596 | DenseSet<uint32_t> DotAllocContextIds; |
597 | |
598 | private: |
599 | using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator; |
600 | |
601 | // Structure to keep track of information for each call as we are matching |
602 | // non-allocation callsites onto context nodes created from the allocation |
603 | // call metadata / summary contexts. |
604 | struct CallContextInfo { |
605 | // The callsite we're trying to match. |
606 | CallTy Call; |
    // The callsite's stack ids that have a context node in the graph.
608 | std::vector<uint64_t> StackIds; |
609 | // The function containing this callsite. |
610 | const FuncTy *Func; |
611 | // Initially empty, if needed this will be updated to contain the context |
612 | // ids for use in a new context node created for this callsite. |
613 | DenseSet<uint32_t> ContextIds; |
614 | }; |
615 | |
616 | /// Helper to remove edge from graph, updating edge iterator if it is provided |
617 | /// (in which case CalleeIter indicates which edge list is being iterated). |
618 | /// This will also perform the necessary clearing of the ContextEdge members |
619 | /// to enable later checking if the edge has been removed (since we may have |
620 | /// other copies of the shared_ptr in existence, and in fact rely on this to |
621 | /// enable removal while iterating over a copy of a node's edge list). |
622 | void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr, |
623 | bool CalleeIter = true); |
624 | |
625 | /// Assigns the given Node to calls at or inlined into the location with |
626 | /// the Node's stack id, after post order traversing and processing its |
627 | /// caller nodes. Uses the call information recorded in the given |
628 | /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences |
629 | /// as needed. Called by updateStackNodes which sets up the given |
630 | /// StackIdToMatchingCalls map. |
631 | void assignStackNodesPostOrder( |
632 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
633 | DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls, |
634 | DenseMap<CallInfo, CallInfo> &CallToMatchingCall); |
635 | |
636 | /// Duplicates the given set of context ids, updating the provided |
637 | /// map from each original id with the newly generated context ids, |
638 | /// and returning the new duplicated id set. |
639 | DenseSet<uint32_t> duplicateContextIds( |
640 | const DenseSet<uint32_t> &StackSequenceContextIds, |
641 | DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
642 | |
643 | /// Propagates all duplicated context ids across the graph. |
644 | void propagateDuplicateContextIds( |
645 | const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
646 | |
647 | /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, |
648 | /// else to its callers. Also updates OrigNode's edges to remove any context |
649 | /// ids moved to the newly created edge. |
650 | void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, |
651 | bool TowardsCallee, |
652 | DenseSet<uint32_t> RemainingContextIds); |
653 | |
654 | /// Get the stack id corresponding to the given Id or Index (for IR this will |
655 | /// return itself, for a summary index this will return the id recorded in the |
656 | /// index for that stack id index value). |
657 | uint64_t getStackId(uint64_t IdOrIndex) const { |
658 | return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex); |
659 | } |
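
  // Note: getStackId and several of the helpers below dispatch through the
  // CRTP DerivedCCG parameter to the derived graph class (the IR-based
  // ModuleCallsiteContextGraph or the summary-based IndexCallsiteContextGraph
  // defined later in this file), which provides the format-specific
  // implementation.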
660 | |
661 | /// Returns true if the given call targets the callee of the given edge, or if |
662 | /// we were able to identify the call chain through intermediate tail calls. |
663 | /// In the latter case new context nodes are added to the graph for the |
664 | /// identified tail calls, and their synthesized nodes are added to |
665 | /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for |
666 | /// the updated edges and to prepare it for an increment in the caller. |
667 | bool |
668 | calleesMatch(CallTy Call, EdgeIter &EI, |
669 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap); |
670 | |
671 | // Return the callee function of the given call, or nullptr if it can't be |
672 | // determined |
673 | const FuncTy *getCalleeFunc(CallTy Call) { |
674 | return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call); |
675 | } |
676 | |
677 | /// Returns true if the given call targets the given function, or if we were |
678 | /// able to identify the call chain through intermediate tail calls (in which |
679 | /// case FoundCalleeChain will be populated). |
680 | bool calleeMatchesFunc( |
681 | CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc, |
682 | std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) { |
683 | return static_cast<DerivedCCG *>(this)->calleeMatchesFunc( |
684 | Call, Func, CallerFunc, FoundCalleeChain); |
685 | } |
686 | |
687 | /// Returns true if both call instructions have the same callee. |
688 | bool sameCallee(CallTy Call1, CallTy Call2) { |
689 | return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2); |
690 | } |
691 | |
692 | /// Get a list of nodes corresponding to the stack ids in the given |
693 | /// callsite's context. |
694 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { |
695 | return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall( |
696 | Call); |
697 | } |
698 | |
699 | /// Get the last stack id in the context for callsite. |
700 | uint64_t getLastStackId(CallTy Call) { |
701 | return static_cast<DerivedCCG *>(this)->getLastStackId(Call); |
702 | } |
703 | |
704 | /// Update the allocation call to record type of allocated memory. |
705 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { |
706 | AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; |
707 | static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType); |
708 | } |
709 | |
710 | /// Get the AllocationType assigned to the given allocation instruction clone. |
711 | AllocationType getAllocationCallType(const CallInfo &Call) const { |
712 | return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call); |
713 | } |
714 | |
715 | /// Update non-allocation call to invoke (possibly cloned) function |
716 | /// CalleeFunc. |
717 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { |
718 | static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc); |
719 | } |
720 | |
  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
723 | /// Assigns new clones to clone number CloneNo. |
724 | FuncInfo cloneFunctionForCallsite( |
725 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
726 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
727 | return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( |
728 | Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); |
729 | } |
730 | |
731 | /// Gets a label to use in the dot graph for the given call clone in the given |
732 | /// function. |
733 | std::string getLabel(const FuncTy *Func, const CallTy Call, |
734 | unsigned CloneNo) const { |
735 | return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo); |
736 | } |
737 | |
738 | // Create and return a new ContextNode. |
739 | ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr, |
740 | CallInfo C = CallInfo()) { |
741 | NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C)); |
742 | auto *NewNode = NodeOwner.back().get(); |
743 | if (F) |
744 | NodeToCallingFunc[NewNode] = F; |
745 | return NewNode; |
746 | } |
747 | |
748 | /// Helpers to find the node corresponding to the given call or stackid. |
749 | ContextNode *getNodeForInst(const CallInfo &C); |
750 | ContextNode *getNodeForAlloc(const CallInfo &C); |
751 | ContextNode *getNodeForStackId(uint64_t StackId); |
752 | |
753 | /// Computes the alloc type corresponding to the given context ids, by |
754 | /// unioning their recorded alloc types. |
755 | uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const; |
756 | |
757 | /// Returns the allocation type of the intersection of the contexts of two |
758 | /// nodes (based on their provided context id sets), optimized for the case |
759 | /// when Node1Ids is smaller than Node2Ids. |
760 | uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids, |
761 | const DenseSet<uint32_t> &Node2Ids) const; |
762 | |
763 | /// Returns the allocation type of the intersection of the contexts of two |
764 | /// nodes (based on their provided context id sets). |
765 | uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids, |
766 | const DenseSet<uint32_t> &Node2Ids) const; |
767 | |
768 | /// Create a clone of Edge's callee and move Edge to that new callee node, |
769 | /// performing the necessary context id and allocation type updates. |
770 | /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are |
771 | /// moved to an edge to the new callee. |
772 | ContextNode * |
773 | moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
774 | DenseSet<uint32_t> ContextIdsToMove = {}); |
775 | |
776 | /// Change the callee of Edge to existing callee clone NewCallee, performing |
777 | /// the necessary context id and allocation type updates. |
778 | /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are |
779 | /// moved to an edge to the new callee. |
780 | void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
781 | ContextNode *NewCallee, |
782 | bool NewClone = false, |
783 | DenseSet<uint32_t> ContextIdsToMove = {}); |
784 | |
785 | /// Change the caller of the edge at the given callee edge iterator to be |
786 | /// NewCaller, performing the necessary context id and allocation type |
787 | /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but |
788 | /// a simplified version of it as we always move the given edge and all of its |
789 | /// context ids. |
790 | void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge, |
791 | ContextNode *NewCaller); |
792 | |
793 | /// Recursive helper for marking backedges via DFS. |
794 | void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
795 | DenseSet<const ContextNode *> &CurrentStack); |
796 | |
797 | /// Recursive helper for merging clones. |
798 | void |
799 | mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
800 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode); |
801 | /// Main worker for merging callee clones for a given node. |
802 | void mergeNodeCalleeClones( |
803 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
804 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode); |
805 | /// Helper to find other callers of the given set of callee edges that can |
806 | /// share the same callee merge node. |
807 | void findOtherCallersToShareMerge( |
808 | ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges, |
809 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode, |
810 | DenseSet<ContextNode *> &OtherCallersToShareMerge); |
811 | |
812 | /// Recursively perform cloning on the graph for the given Node and its |
813 | /// callers, in order to uniquely identify the allocation behavior of an |
814 | /// allocation given its context. The context ids of the allocation being |
815 | /// processed are given in AllocContextIds. |
816 | void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
817 | const DenseSet<uint32_t> &AllocContextIds); |
818 | |
819 | /// Map from each context ID to the AllocationType assigned to that context. |
820 | DenseMap<uint32_t, AllocationType> ContextIdToAllocationType; |
821 | |
822 | /// Map from each contextID to the profiled full contexts and their total |
823 | /// sizes (there may be more than one due to context trimming), |
824 | /// optionally populated when requested (via MemProfReportHintedSizes or |
825 | /// MinClonedColdBytePercent). |
826 | DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos; |
827 | |
828 | /// Identifies the context node created for a stack id when adding the MIB |
829 | /// contexts to the graph. This is used to locate the context nodes when |
830 | /// trying to assign the corresponding callsites with those stack ids to these |
831 | /// nodes. |
832 | DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; |
833 | |
834 | /// Maps to track the calls to their corresponding nodes in the graph. |
835 | MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; |
836 | MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; |
837 | |
838 | /// Owner of all ContextNode unique_ptrs. |
839 | std::vector<std::unique_ptr<ContextNode>> NodeOwner; |
840 | |
841 | /// Perform sanity checks on graph when requested. |
842 | void check() const; |
843 | |
844 | /// Keeps track of the last unique context id assigned. |
845 | unsigned int LastContextId = 0; |
846 | }; |
847 | |
848 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
849 | using ContextNode = |
850 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode; |
851 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
852 | using ContextEdge = |
853 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge; |
854 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
855 | using FuncInfo = |
856 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo; |
857 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
858 | using CallInfo = |
859 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo; |
860 | |
861 | /// CRTP derived class for graphs built from IR (regular LTO). |
862 | class ModuleCallsiteContextGraph |
863 | : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
864 | Instruction *> { |
865 | public: |
866 | ModuleCallsiteContextGraph( |
867 | Module &M, |
868 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); |
869 | |
870 | private: |
871 | friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
872 | Instruction *>; |
873 | |
874 | uint64_t getStackId(uint64_t IdOrIndex) const; |
875 | const Function *getCalleeFunc(Instruction *Call); |
876 | bool calleeMatchesFunc( |
877 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
878 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain); |
879 | bool sameCallee(Instruction *Call1, Instruction *Call2); |
880 | bool findProfiledCalleeThroughTailCalls( |
881 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
882 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
883 | bool &FoundMultipleCalleeChains); |
884 | uint64_t getLastStackId(Instruction *Call); |
885 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); |
886 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
887 | AllocationType getAllocationCallType(const CallInfo &Call) const; |
888 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
889 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
890 | Instruction *>::FuncInfo |
891 | cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
892 | std::map<CallInfo, CallInfo> &CallMap, |
893 | std::vector<CallInfo> &CallsWithMetadataInFunc, |
894 | unsigned CloneNo); |
895 | std::string getLabel(const Function *Func, const Instruction *Call, |
896 | unsigned CloneNo) const; |
897 | |
898 | const Module &Mod; |
899 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; |
900 | }; |
901 | |
902 | /// Represents a call in the summary index graph, which can either be an |
903 | /// allocation or an interior callsite node in an allocation's context. |
904 | /// Holds a pointer to the corresponding data structure in the index. |
905 | struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { |
906 | IndexCall() : PointerUnion() {} |
907 | IndexCall(std::nullptr_t) : IndexCall() {} |
908 | IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} |
909 | IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} |
910 | IndexCall(PointerUnion PT) : PointerUnion(PT) {} |
911 | |
912 | IndexCall *operator->() { return this; } |
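  // The operator-> above lets shared template code treat an IndexCall like the
  // Instruction * used by the IR graph (e.g. call()->print(OS) in
  // CallInfo::print), since CallTy is IndexCall for the index-based graph.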
913 | |
914 | void print(raw_ostream &OS) const { |
915 | PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this; |
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
920 | assert(CI); |
921 | OS << *CI; |
922 | } |
923 | } |
924 | }; |
925 | } // namespace |
926 | |
927 | namespace llvm { |
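// Allow the cast machinery (e.g. dyn_cast_if_present) to look through an
// IndexCall to its underlying PointerUnion<CallsiteInfo *, AllocInfo *>.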
928 | template <> struct simplify_type<IndexCall> { |
929 | using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>; |
930 | static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; } |
931 | }; |
932 | template <> struct simplify_type<const IndexCall> { |
933 | using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>; |
934 | static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; } |
935 | }; |
936 | } // namespace llvm |
937 | |
938 | namespace { |
939 | /// CRTP derived class for graphs built from summary index (ThinLTO). |
940 | class IndexCallsiteContextGraph |
941 | : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
942 | IndexCall> { |
943 | public: |
944 | IndexCallsiteContextGraph( |
945 | ModuleSummaryIndex &Index, |
946 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
947 | isPrevailing); |
948 | |
949 | ~IndexCallsiteContextGraph() { |
950 | // Now that we are done with the graph it is safe to add the new |
951 | // CallsiteInfo structs to the function summary vectors. The graph nodes |
952 | // point into locations within these vectors, so we don't want to add them |
953 | // any earlier. |
954 | for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) { |
955 | auto *FS = I.first; |
956 | for (auto &Callsite : I.second) |
        FS->addCallsite(*Callsite.second);
958 | } |
959 | } |
960 | |
961 | private: |
962 | friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
963 | IndexCall>; |
964 | |
965 | uint64_t getStackId(uint64_t IdOrIndex) const; |
966 | const FunctionSummary *getCalleeFunc(IndexCall &Call); |
967 | bool calleeMatchesFunc( |
968 | IndexCall &Call, const FunctionSummary *Func, |
969 | const FunctionSummary *CallerFunc, |
970 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain); |
971 | bool sameCallee(IndexCall &Call1, IndexCall &Call2); |
972 | bool findProfiledCalleeThroughTailCalls( |
973 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
974 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
975 | bool &FoundMultipleCalleeChains); |
976 | uint64_t getLastStackId(IndexCall &Call); |
977 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); |
978 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
979 | AllocationType getAllocationCallType(const CallInfo &Call) const; |
980 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
981 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
982 | IndexCall>::FuncInfo |
983 | cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
984 | std::map<CallInfo, CallInfo> &CallMap, |
985 | std::vector<CallInfo> &CallsWithMetadataInFunc, |
986 | unsigned CloneNo); |
987 | std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, |
988 | unsigned CloneNo) const; |
989 | |
  // Saves the mapping from function summaries containing memprof records back
  // to their VIs, for use in checking and debugging.
992 | std::map<const FunctionSummary *, ValueInfo> FSToVIMap; |
993 | |
994 | const ModuleSummaryIndex &Index; |
995 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
996 | isPrevailing; |
997 | |
998 | // Saves/owns the callsite info structures synthesized for missing tail call |
999 | // frames that we discover while building the graph. |
1000 | // It maps from the summary of the function making the tail call, to a map |
1001 | // of callee ValueInfo to corresponding synthesized callsite info. |
1002 | std::unordered_map<FunctionSummary *, |
1003 | std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>> |
1004 | FunctionCalleesToSynthesizedCallsiteInfos; |
1005 | }; |
1006 | } // namespace |
1007 | |
1008 | namespace llvm { |
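// Specialize DenseMapInfo so that CallInfo (for both graph types) and
// IndexCall can be used as DenseMap / MapVector keys, e.g. in the
// call-to-context-node maps in the graph class above.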
1009 | template <> |
1010 | struct DenseMapInfo<typename CallsiteContextGraph< |
1011 | ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> |
1012 | : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; |
1013 | template <> |
1014 | struct DenseMapInfo<typename CallsiteContextGraph< |
1015 | IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> |
1016 | : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; |
1017 | template <> |
1018 | struct DenseMapInfo<IndexCall> |
1019 | : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; |
1020 | } // end namespace llvm |
1021 | |
1022 | namespace { |
1023 | |
1024 | // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc |
1025 | // type we should actually use on the corresponding allocation. |
1026 | // If we can't clone a node that has NotCold+Cold alloc type, we will fall |
1027 | // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold |
1028 | // from NotCold. |
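// For example, allocTypeToUse(NotCold | Cold) and allocTypeToUse(NotCold) both
// return AllocationType::NotCold, while allocTypeToUse(Cold) returns
// AllocationType::Cold.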
1029 | AllocationType allocTypeToUse(uint8_t AllocTypes) { |
1030 | assert(AllocTypes != (uint8_t)AllocationType::None); |
1031 | if (AllocTypes == |
1032 | ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
1033 | return AllocationType::NotCold; |
1034 | else |
1035 | return (AllocationType)AllocTypes; |
1036 | } |
1037 | |
1038 | // Helper to check if the alloc types for all edges recorded in the |
1039 | // InAllocTypes vector match the alloc types for all edges in the Edges |
1040 | // vector. |
1041 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1042 | bool allocTypesMatch( |
1043 | const std::vector<uint8_t> &InAllocTypes, |
1044 | const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>> |
1045 | &Edges) { |
1046 | // This should be called only when the InAllocTypes vector was computed for |
1047 | // this set of Edges. Make sure the sizes are the same. |
1048 | assert(InAllocTypes.size() == Edges.size()); |
1049 | return std::equal( |
1050 | InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(), |
1051 | [](const uint8_t &l, |
1052 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) { |
1053 | // Can share if one of the edges is None type - don't |
1054 | // care about the type along that edge as it doesn't |
1055 | // exist for those context ids. |
1056 | if (l == (uint8_t)AllocationType::None || |
1057 | r->AllocTypes == (uint8_t)AllocationType::None) |
1058 | return true; |
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1060 | }); |
1061 | } |
1062 | |
1063 | // Helper to check if the alloc types for all edges recorded in the |
1064 | // InAllocTypes vector match the alloc types for callee edges in the given |
1065 | // clone. Because the InAllocTypes were computed from the original node's callee |
1066 | // edges, and other cloning could have happened after this clone was created, we |
1067 | // need to find the matching clone callee edge, which may or may not exist. |
1068 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1069 | bool allocTypesMatchClone( |
1070 | const std::vector<uint8_t> &InAllocTypes, |
1071 | const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) { |
1072 | const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf; |
1073 | assert(Node); |
1074 | // InAllocTypes should have been computed for the original node's callee |
1075 | // edges. |
1076 | assert(InAllocTypes.size() == Node->CalleeEdges.size()); |
1077 | // First create a map of the clone callee edge callees to the edge alloc type. |
1078 | DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t> |
1079 | EdgeCalleeMap; |
1080 | for (const auto &E : Clone->CalleeEdges) { |
1081 | assert(!EdgeCalleeMap.contains(E->Callee)); |
1082 | EdgeCalleeMap[E->Callee] = E->AllocTypes; |
1083 | } |
1084 | // Next, walk the original node's callees, and look for the corresponding |
1085 | // clone edge to that callee. |
1086 | for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) { |
1087 | auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee); |
1088 | // Not found is ok, we will simply add an edge if we use this clone. |
1089 | if (Iter == EdgeCalleeMap.end()) |
1090 | continue; |
1091 | // Can share if one of the edges is None type - don't |
1092 | // care about the type along that edge as it doesn't |
1093 | // exist for those context ids. |
1094 | if (InAllocTypes[I] == (uint8_t)AllocationType::None || |
1095 | Iter->second == (uint8_t)AllocationType::None) |
1096 | continue; |
    if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1098 | return false; |
1099 | } |
1100 | return true; |
1101 | } |
1102 | |
1103 | } // end anonymous namespace |
1104 | |
1105 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1106 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1107 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst( |
1108 | const CallInfo &C) { |
1109 | ContextNode *Node = getNodeForAlloc(C); |
1110 | if (Node) |
1111 | return Node; |
1112 | |
1113 | return NonAllocationCallToContextNodeMap.lookup(C); |
1114 | } |
1115 | |
1116 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1117 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1118 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc( |
1119 | const CallInfo &C) { |
1120 | return AllocationCallToContextNodeMap.lookup(C); |
1121 | } |
1122 | |
1123 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1124 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1125 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId( |
1126 | uint64_t StackId) { |
1127 | auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); |
1128 | if (StackEntryNode != StackEntryIdToContextNodeMap.end()) |
1129 | return StackEntryNode->second; |
1130 | return nullptr; |
1131 | } |
1132 | |
1133 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1134 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1135 | addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
1136 | unsigned int ContextId) { |
1137 | for (auto &Edge : CallerEdges) { |
1138 | if (Edge->Caller == Caller) { |
1139 | Edge->AllocTypes |= (uint8_t)AllocType; |
1140 | Edge->getContextIds().insert(ContextId); |
1141 | return; |
1142 | } |
1143 | } |
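  // No existing edge to this Caller: create a new one and register it with
  // both endpoints (the shared_ptr edge is held by both this node's
  // CallerEdges and the Caller's CalleeEdges).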
1144 | std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>( |
1145 | this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId})); |
1146 | CallerEdges.push_back(Edge); |
1147 | Caller->CalleeEdges.push_back(Edge); |
1148 | } |
1149 | |
1150 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1151 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph( |
1152 | ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) { |
1153 | assert(!EI || (*EI)->get() == Edge); |
1154 | assert(!Edge->isRemoved()); |
1155 | // Save the Caller and Callee pointers so we can erase Edge from their edge |
1156 | // lists after clearing Edge below. We do the clearing first in case it is |
1157 | // destructed after removing from the edge lists (if those were the last |
1158 | // shared_ptr references to Edge). |
1159 | auto *Callee = Edge->Callee; |
1160 | auto *Caller = Edge->Caller; |
1161 | |
1162 | // Make sure the edge fields are cleared out so we can properly detect |
1163 | // removed edges if Edge is not destructed because there is still a shared_ptr |
1164 | // reference. |
1165 | Edge->clear(); |
1166 | |
1167 | #ifndef NDEBUG |
1168 | auto CalleeCallerCount = Callee->CallerEdges.size(); |
1169 | auto CallerCalleeCount = Caller->CalleeEdges.size(); |
1170 | #endif |
1171 | if (!EI) { |
1172 | Callee->eraseCallerEdge(Edge); |
1173 | Caller->eraseCalleeEdge(Edge); |
1174 | } else if (CalleeIter) { |
1175 | Callee->eraseCallerEdge(Edge); |
1176 | *EI = Caller->CalleeEdges.erase(*EI); |
1177 | } else { |
1178 | Caller->eraseCalleeEdge(Edge); |
1179 | *EI = Callee->CallerEdges.erase(*EI); |
1180 | } |
1181 | assert(Callee->CallerEdges.size() < CalleeCallerCount); |
1182 | assert(Caller->CalleeEdges.size() < CallerCalleeCount); |
1183 | } |
1184 | |
1185 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1186 | void CallsiteContextGraph< |
1187 | DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) { |
1188 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { |
1189 | auto Edge = *EI; |
1190 | if (Edge->AllocTypes == (uint8_t)AllocationType::None) { |
1191 | assert(Edge->ContextIds.empty()); |
      removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1193 | } else |
1194 | ++EI; |
1195 | } |
1196 | } |
1197 | |
1198 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1199 | void CallsiteContextGraph< |
1200 | DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) { |
1201 | for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) { |
1202 | auto Edge = *EI; |
1203 | if (Edge->AllocTypes == (uint8_t)AllocationType::None) { |
1204 | assert(Edge->ContextIds.empty()); |
1205 | Edge->Caller->eraseCalleeEdge(Edge.get()); |
1206 | EI = Node->CallerEdges.erase(EI); |
1207 | } else |
1208 | ++EI; |
1209 | } |
1210 | } |
1211 | |
1212 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1213 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
1214 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1215 | findEdgeFromCallee(const ContextNode *Callee) { |
1216 | for (const auto &Edge : CalleeEdges) |
1217 | if (Edge->Callee == Callee) |
1218 | return Edge.get(); |
1219 | return nullptr; |
1220 | } |
1221 | |
1222 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1223 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
1224 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1225 | findEdgeFromCaller(const ContextNode *Caller) { |
1226 | for (const auto &Edge : CallerEdges) |
1227 | if (Edge->Caller == Caller) |
1228 | return Edge.get(); |
1229 | return nullptr; |
1230 | } |
1231 | |
1232 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1233 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1234 | eraseCalleeEdge(const ContextEdge *Edge) { |
1235 | auto EI = llvm::find_if( |
1236 | CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) { |
1237 | return CalleeEdge.get() == Edge; |
1238 | }); |
1239 | assert(EI != CalleeEdges.end()); |
1240 | CalleeEdges.erase(EI); |
1241 | } |
1242 | |
1243 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1244 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1245 | eraseCallerEdge(const ContextEdge *Edge) { |
1246 | auto EI = llvm::find_if( |
1247 | CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) { |
1248 | return CallerEdge.get() == Edge; |
1249 | }); |
1250 | assert(EI != CallerEdges.end()); |
1251 | CallerEdges.erase(EI); |
1252 | } |
1253 | |
1254 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1255 | uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType( |
1256 | DenseSet<uint32_t> &ContextIds) const { |
1257 | uint8_t BothTypes = |
1258 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
1259 | uint8_t AllocType = (uint8_t)AllocationType::None; |
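  // Union the allocation types of all contexts in the given id set.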
1260 | for (auto Id : ContextIds) { |
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1262 | // Bail early if alloc type reached both, no further refinement. |
1263 | if (AllocType == BothTypes) |
1264 | return AllocType; |
1265 | } |
1266 | return AllocType; |
1267 | } |
1268 | |
1269 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1270 | uint8_t |
1271 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl( |
1272 | const DenseSet<uint32_t> &Node1Ids, |
1273 | const DenseSet<uint32_t> &Node2Ids) const { |
1274 | uint8_t BothTypes = |
1275 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
1276 | uint8_t AllocType = (uint8_t)AllocationType::None; |
1277 | for (auto Id : Node1Ids) { |
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1281 | // Bail early if alloc type reached both, no further refinement. |
1282 | if (AllocType == BothTypes) |
1283 | return AllocType; |
1284 | } |
1285 | return AllocType; |
1286 | } |
1287 | |
1288 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1289 | uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes( |
1290 | const DenseSet<uint32_t> &Node1Ids, |
1291 | const DenseSet<uint32_t> &Node2Ids) const { |
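  // Iterate over the smaller set when computing the intersection, for
  // efficiency; the result is symmetric in the two id sets.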
1292 | if (Node1Ids.size() < Node2Ids.size()) |
1293 | return intersectAllocTypesImpl(Node1Ids, Node2Ids); |
1294 | else |
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1296 | } |
1297 | |
1298 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1299 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1300 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode( |
1301 | CallInfo Call, const FuncTy *F) { |
1302 | assert(!getNodeForAlloc(Call)); |
  ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
  AllocationCallToContextNodeMap[Call] = AllocNode;
  // Use LastContextId as a unique id for MIB allocation nodes.
1306 | AllocNode->OrigStackOrAllocId = LastContextId; |
1307 | // Alloc type should be updated as we add in the MIBs. We should assert |
1308 | // afterwards that it is not still None. |
1309 | AllocNode->AllocTypes = (uint8_t)AllocationType::None; |
1310 | |
1311 | return AllocNode; |
1312 | } |
1313 | |
1314 | static std::string getAllocTypeString(uint8_t AllocTypes) { |
1315 | if (!AllocTypes) |
1316 | return "None" ; |
1317 | std::string Str; |
1318 | if (AllocTypes & (uint8_t)AllocationType::NotCold) |
1319 | Str += "NotCold" ; |
1320 | if (AllocTypes & (uint8_t)AllocationType::Cold) |
1321 | Str += "Cold" ; |
1322 | return Str; |
1323 | } |
1324 | |
1325 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1326 | template <class NodeT, class IteratorT> |
1327 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( |
1328 | ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, |
1329 | CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, |
1330 | ArrayRef<ContextTotalSize> ContextSizeInfo) { |
  // Treat the hot alloc type as NotCold before the disambiguation for "hot"
  // is done.
1333 | if (AllocType == AllocationType::Hot) |
1334 | AllocType = AllocationType::NotCold; |
1335 | |
1336 | ContextIdToAllocationType[++LastContextId] = AllocType; |
1337 | |
1338 | if (!ContextSizeInfo.empty()) { |
1339 | auto &Entry = ContextIdToContextSizeInfos[LastContextId]; |
    Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1341 | } |
1342 | |
1343 | // Update alloc type and context ids for this MIB. |
1344 | AllocNode->AllocTypes |= (uint8_t)AllocType; |
1345 | |
1346 | // Now add or update nodes for each stack id in alloc's context. |
1347 | // Later when processing the stack ids on non-alloc callsites we will adjust |
1348 | // for any inlining in the context. |
1349 | ContextNode *PrevNode = AllocNode; |
1350 | // Look for recursion (direct recursion should have been collapsed by |
1351 | // module summary analysis, here we should just be detecting mutual |
1352 | // recursion). Mark these nodes so we don't try to clone. |
1353 | SmallSet<uint64_t, 8> StackIdSet; |
1354 | // Skip any on the allocation call (inlining). |
1355 | for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); |
1356 | ContextIter != StackContext.end(); ++ContextIter) { |
    auto StackId = getStackId(*ContextIter);
1358 | ContextNode *StackNode = getNodeForStackId(StackId); |
1359 | if (!StackNode) { |
1360 | StackNode = createNewNode(/*IsAllocation=*/false); |
1361 | StackEntryIdToContextNodeMap[StackId] = StackNode; |
1362 | StackNode->OrigStackOrAllocId = StackId; |
1363 | } |
1364 | // Marking a node recursive will prevent its cloning completely, even for |
1365 | // non-recursive contexts flowing through it. |
1366 | if (!AllowRecursiveCallsites) { |
1367 | auto Ins = StackIdSet.insert(StackId); |
1368 | if (!Ins.second) |
1369 | StackNode->Recursive = true; |
1370 | } |
1371 | StackNode->AllocTypes |= (uint8_t)AllocType; |
1372 | PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); |
1373 | PrevNode = StackNode; |
1374 | } |
1375 | } |
1376 | |
1377 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1378 | DenseSet<uint32_t> |
1379 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds( |
1380 | const DenseSet<uint32_t> &StackSequenceContextIds, |
1381 | DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
1382 | DenseSet<uint32_t> NewContextIds; |
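  // For each original id, assign a fresh context id with the same allocation
  // type and record the old-to-new mapping for later propagation.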
1383 | for (auto OldId : StackSequenceContextIds) { |
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
    if (DotAllocContextIds.contains(OldId))
      DotAllocContextIds.insert(LastContextId);
1391 | } |
1392 | return NewContextIds; |
1393 | } |
1394 | |
1395 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1396 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
1397 | propagateDuplicateContextIds( |
1398 | const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
1399 | // Build a set of duplicated context ids corresponding to the input id set. |
1400 | auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) { |
1401 | DenseSet<uint32_t> NewIds; |
1402 | for (auto Id : ContextIds) |
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert_range(NewId->second);
1406 | return NewIds; |
1407 | }; |
1408 | |
1409 | // Recursively update context ids sets along caller edges. |
1410 | auto UpdateCallers = [&](ContextNode *Node, |
1411 | DenseSet<const ContextEdge *> &Visited, |
1412 | auto &&UpdateCallers) -> void { |
1413 | for (const auto &Edge : Node->CallerEdges) { |
1414 | auto Inserted = Visited.insert(Edge.get()); |
1415 | if (!Inserted.second) |
1416 | continue; |
1417 | ContextNode *NextNode = Edge->Caller; |
1418 | DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds()); |
1419 | // Only need to recursively iterate to NextNode via this caller edge if |
1420 | // it resulted in any added ids to NextNode. |
1421 | if (!NewIdsToAdd.empty()) { |
1422 | Edge->getContextIds().insert_range(NewIdsToAdd); |
1423 | UpdateCallers(NextNode, Visited, UpdateCallers); |
1424 | } |
1425 | } |
1426 | }; |
1427 | |
1428 | DenseSet<const ContextEdge *> Visited; |
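  // Kick off the propagation from each allocation (leaf) node.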
1429 | for (auto &Entry : AllocationCallToContextNodeMap) { |
1430 | auto *Node = Entry.second; |
1431 | UpdateCallers(Node, Visited, UpdateCallers); |
1432 | } |
1433 | } |
1434 | |
1435 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1436 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( |
1437 | ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee, |
1438 | // This must be passed by value to make a copy since it will be adjusted |
1439 | // as ids are moved. |
1440 | DenseSet<uint32_t> RemainingContextIds) { |
1441 | auto &OrigEdges = |
1442 | TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; |
1443 | DenseSet<uint32_t> RecursiveContextIds; |
1444 | DenseSet<uint32_t> AllCallerContextIds; |
1445 | if (AllowRecursiveCallsites) { |
1446 | // Identify which context ids are recursive which is needed to properly |
1447 | // update the RemainingContextIds set. The relevant recursive context ids |
1448 | // are those that are in multiple edges. |
1449 | for (auto &CE : OrigEdges) { |
      AllCallerContextIds.reserve(CE->getContextIds().size());
1451 | for (auto Id : CE->getContextIds()) |
1452 | if (!AllCallerContextIds.insert(Id).second) |
1453 | RecursiveContextIds.insert(Id); |
1454 | } |
1455 | } |
1456 | // Increment iterator in loop so that we can remove edges as needed. |
1457 | for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { |
1458 | auto Edge = *EI; |
1459 | DenseSet<uint32_t> NewEdgeContextIds; |
1460 | DenseSet<uint32_t> NotFoundContextIds; |
    // Remove any matching context ids from Edge; those that were found and
    // removed become the new edge's context ids. Also update the remaining
    // (not found) ids.
1464 | set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, |
1465 | NotFoundContextIds); |
1466 | // Update the remaining context ids set for the later edges. This is a |
1467 | // compile time optimization. |
1468 | if (RecursiveContextIds.empty()) { |
1469 | // No recursive ids, so all of the previously remaining context ids that |
1470 | // were not seen on this edge are the new remaining set. |
      RemainingContextIds.swap(NotFoundContextIds);
1472 | } else { |
1473 | // Keep the recursive ids in the remaining set as we expect to see those |
1474 | // on another edge. We can remove the non-recursive remaining ids that |
1475 | // were seen on this edge, however. We already have the set of remaining |
1476 | // ids that were on this edge (in NewEdgeContextIds). Figure out which are |
1477 | // non-recursive and only remove those. Note that despite the higher |
1478 | // overhead of updating the remaining context ids set when recursion |
1479 | // handling is enabled, it was found to be at worst performance neutral |
1480 | // and in one case a clear win. |
      DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
          set_difference(NewEdgeContextIds, RecursiveContextIds);
      set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1484 | } |
1485 | // If no matching context ids for this edge, skip it. |
1486 | if (NewEdgeContextIds.empty()) { |
1487 | ++EI; |
1488 | continue; |
1489 | } |
1490 | if (TowardsCallee) { |
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1492 | auto NewEdge = std::make_shared<ContextEdge>( |
1493 | Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds)); |
1494 | NewNode->CalleeEdges.push_back(NewEdge); |
1495 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
1496 | } else { |
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1498 | auto NewEdge = std::make_shared<ContextEdge>( |
1499 | NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds)); |
1500 | NewNode->CallerEdges.push_back(NewEdge); |
1501 | NewEdge->Caller->CalleeEdges.push_back(NewEdge); |
1502 | } |
1503 | // Remove old edge if context ids empty. |
1504 | if (Edge->getContextIds().empty()) { |
      removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1506 | continue; |
1507 | } |
1508 | ++EI; |
1509 | } |
1510 | } |
1511 | |
1512 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1513 | static void checkEdge( |
1514 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { |
1515 | // Confirm that alloc type is not None and that we have at least one context |
1516 | // id. |
1517 | assert(Edge->AllocTypes != (uint8_t)AllocationType::None); |
1518 | assert(!Edge->ContextIds.empty()); |
1519 | } |
1520 | |
1521 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1522 | static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, |
1523 | bool CheckEdges = true) { |
1524 | if (Node->isRemoved()) |
1525 | return; |
1526 | #ifndef NDEBUG |
1527 | // Compute node's context ids once for use in asserts. |
1528 | auto NodeContextIds = Node->getContextIds(); |
1529 | #endif |
1530 | // Node's context ids should be the union of both its callee and caller edge |
1531 | // context ids. |
1532 | if (Node->CallerEdges.size()) { |
1533 | DenseSet<uint32_t> CallerEdgeContextIds( |
1534 | Node->CallerEdges.front()->ContextIds); |
1535 | for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) { |
1536 | if (CheckEdges) |
1537 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
1538 | set_union(CallerEdgeContextIds, Edge->ContextIds); |
1539 | } |
1540 | // Node can have more context ids than callers if some contexts terminate at |
1541 | // node and some are longer. If we are allowing recursive callsites and |
1542 | // contexts this will be violated for incompletely cloned recursive cycles, |
1543 | // so skip the checking in that case. |
1544 | assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
1545 | NodeContextIds == CallerEdgeContextIds || |
1546 | set_is_subset(CallerEdgeContextIds, NodeContextIds)); |
1547 | } |
1548 | if (Node->CalleeEdges.size()) { |
1549 | DenseSet<uint32_t> CalleeEdgeContextIds( |
1550 | Node->CalleeEdges.front()->ContextIds); |
1551 | for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) { |
1552 | if (CheckEdges) |
1553 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
1554 | set_union(CalleeEdgeContextIds, Edge->getContextIds()); |
1555 | } |
1556 | // If we are allowing recursive callsites and contexts this will be violated |
1557 | // for incompletely cloned recursive cycles, so skip the checking in that |
1558 | // case. |
1559 | assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
1560 | NodeContextIds == CalleeEdgeContextIds); |
1561 | } |
1562 | // FIXME: Since this checking is only invoked under an option, we should |
1563 | // change the error checking from using assert to something that will trigger |
1564 | // an error on a release build. |
1565 | #ifndef NDEBUG |
1566 | // Make sure we don't end up with duplicate edges between the same caller and |
1567 | // callee. |
1568 | DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet; |
1569 | for (const auto &E : Node->CalleeEdges) |
1570 | NodeSet.insert(E->Callee); |
1571 | assert(NodeSet.size() == Node->CalleeEdges.size()); |
1572 | #endif |
1573 | } |
1574 | |
1575 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1576 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
1577 | assignStackNodesPostOrder( |
1578 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
1579 | DenseMap<uint64_t, std::vector<CallContextInfo>> |
1580 | &StackIdToMatchingCalls, |
1581 | DenseMap<CallInfo, CallInfo> &CallToMatchingCall) { |
1582 | auto Inserted = Visited.insert(Node); |
1583 | if (!Inserted.second) |
1584 | return; |
1585 | // Post order traversal. Iterate over a copy since we may add nodes and |
1586 | // therefore new callers during the recursive call, invalidating any |
1587 | // iterator over the original edge vector. We don't need to process these |
1588 | // new nodes as they were already processed on creation. |
1589 | auto CallerEdges = Node->CallerEdges; |
1590 | for (auto &Edge : CallerEdges) { |
1591 | // Skip any that have been removed during the recursion. |
1592 | if (Edge->isRemoved()) { |
1593 | assert(!is_contained(Node->CallerEdges, Edge)); |
1594 | continue; |
1595 | } |
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1597 | CallToMatchingCall); |
1598 | } |
1599 | |
1600 | // If this node's stack id is in the map, update the graph to contain new |
1601 | // nodes representing any inlining at interior callsites. Note we move the |
1602 | // associated context ids over to the new nodes. |
1603 | |
1604 | // Ignore this node if it is for an allocation or we didn't record any |
1605 | // stack id lists ending at it. |
1606 | if (Node->IsAllocation || |
1607 | !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) |
1608 | return; |
1609 | |
1610 | auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; |
1611 | // Handle the simple case first. A single call with a single stack id. |
1612 | // In this case there is no need to create any new context nodes, simply |
1613 | // assign the context node for stack id to this Call. |
1614 | if (Calls.size() == 1) { |
1615 | auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; |
1616 | if (Ids.size() == 1) { |
1617 | assert(SavedContextIds.empty()); |
1618 | // It should be this Node |
1619 | assert(Node == getNodeForStackId(Ids[0])); |
1620 | if (Node->Recursive) |
1621 | return; |
1622 | Node->setCall(Call); |
1623 | NonAllocationCallToContextNodeMap[Call] = Node; |
1624 | NodeToCallingFunc[Node] = Func; |
1625 | return; |
1626 | } |
1627 | } |
1628 | |
1629 | #ifndef NDEBUG |
1630 | // Find the node for the last stack id, which should be the same |
1631 | // across all calls recorded for this id, and is this node's id. |
1632 | uint64_t LastId = Node->OrigStackOrAllocId; |
1633 | ContextNode *LastNode = getNodeForStackId(LastId); |
1634 | // We should only have kept stack ids that had nodes. |
1635 | assert(LastNode); |
1636 | assert(LastNode == Node); |
1637 | #else |
1638 | ContextNode *LastNode = Node; |
1639 | #endif |
1640 | |
1641 | // Compute the last node's context ids once, as it is shared by all calls in |
1642 | // this entry. |
1643 | DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); |
1644 | |
1645 | [[maybe_unused]] bool PrevIterCreatedNode = false; |
1646 | bool CreatedNode = false; |
1647 | for (unsigned I = 0; I < Calls.size(); |
1648 | I++, PrevIterCreatedNode = CreatedNode) { |
1649 | CreatedNode = false; |
1650 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1651 | // Skip any for which we didn't assign any ids, these don't get a node in |
1652 | // the graph. |
1653 | if (SavedContextIds.empty()) { |
1654 | // If this call has a matching call (located in the same function and |
1655 | // having the same stack ids), simply add it to the context node created |
1656 | // for its matching call earlier. These can be treated the same through |
1657 | // cloning and get updated at the same time. |
1658 | if (!CallToMatchingCall.contains(Call)) |
1659 | continue; |
1660 | auto MatchingCall = CallToMatchingCall[Call]; |
1661 | if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) { |
1662 | // This should only happen if we had a prior iteration, and it didn't |
1663 | // create a node because of the below recomputation of context ids |
1664 | // finding none remaining and continuing early. |
1665 | assert(I > 0 && !PrevIterCreatedNode); |
1666 | continue; |
1667 | } |
1668 | NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back( |
1669 | Call); |
1670 | continue; |
1671 | } |
1672 | |
1673 | assert(LastId == Ids.back()); |
1674 | |
1675 | // Recompute the context ids for this stack id sequence (the |
1676 | // intersection of the context ids of the corresponding nodes). |
1677 | // Start with the ids we saved in the map for this call, which could be |
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and removed
    // them already during the post order traversal.
1681 | set_intersect(SavedContextIds, LastNodeContextIds); |
1682 | ContextNode *PrevNode = LastNode; |
1683 | bool Skip = false; |
1684 | // Iterate backwards through the stack Ids, starting after the last Id |
1685 | // in the list, which was handled once outside for all Calls. |
1686 | for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
1687 | auto Id = *IdIter; |
      ContextNode *CurNode = getNodeForStackId(Id);
1689 | // We should only have kept stack ids that had nodes and weren't |
1690 | // recursive. |
1691 | assert(CurNode); |
1692 | assert(!CurNode->Recursive); |
1693 | |
1694 | auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
1695 | if (!Edge) { |
1696 | Skip = true; |
1697 | break; |
1698 | } |
1699 | PrevNode = CurNode; |
1700 | |
1701 | // Update the context ids, which is the intersection of the ids along |
1702 | // all edges in the sequence. |
1703 | set_intersect(SavedContextIds, Edge->getContextIds()); |
1704 | |
1705 | // If we now have no context ids for clone, skip this call. |
1706 | if (SavedContextIds.empty()) { |
1707 | Skip = true; |
1708 | break; |
1709 | } |
1710 | } |
1711 | if (Skip) |
1712 | continue; |
1713 | |
1714 | // Create new context node. |
    ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1716 | NonAllocationCallToContextNodeMap[Call] = NewNode; |
1717 | CreatedNode = true; |
    NewNode->AllocTypes = computeAllocType(SavedContextIds);
1719 | |
    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1721 | assert(FirstNode); |
1722 | |
1723 | // Connect to callees of innermost stack frame in inlined call chain. |
    // This updates context ids for FirstNode's callees to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1727 | |
1728 | // Connect to callers of outermost stack frame in inlined call chain. |
    // This updates context ids for LastNode's callers to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1732 | |
1733 | // Now we need to remove context ids from edges/nodes between First and |
1734 | // Last Node. |
1735 | PrevNode = nullptr; |
1736 | for (auto Id : Ids) { |
      ContextNode *CurNode = getNodeForStackId(Id);
1738 | // We should only have kept stack ids that had nodes. |
1739 | assert(CurNode); |
1740 | |
1741 | // Remove the context ids moved to NewNode from CurNode, and the |
1742 | // edge from the prior node. |
1743 | if (PrevNode) { |
1744 | auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); |
1745 | // If the sequence contained recursion, we might have already removed |
1746 | // some edges during the connectNewNode calls above. |
1747 | if (!PrevEdge) { |
1748 | PrevNode = CurNode; |
1749 | continue; |
1750 | } |
1751 | set_subtract(PrevEdge->getContextIds(), SavedContextIds); |
1752 | if (PrevEdge->getContextIds().empty()) |
          removeEdgeFromGraph(PrevEdge);
1754 | } |
1755 | // Since we update the edges from leaf to tail, only look at the callee |
1756 | // edges. This isn't an alloc node, so if there are no callee edges, the |
1757 | // alloc type is None. |
1758 | CurNode->AllocTypes = CurNode->CalleeEdges.empty() |
1759 | ? (uint8_t)AllocationType::None |
1760 | : CurNode->computeAllocType(); |
1761 | PrevNode = CurNode; |
1762 | } |
1763 | if (VerifyNodes) { |
1764 | checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); |
1765 | for (auto Id : Ids) { |
        ContextNode *CurNode = getNodeForStackId(Id);
1767 | // We should only have kept stack ids that had nodes. |
1768 | assert(CurNode); |
1769 | checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true); |
1770 | } |
1771 | } |
1772 | } |
1773 | } |
1774 | |
1775 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1776 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { |
1777 | // Map of stack id to all calls with that as the last (outermost caller) |
1778 | // callsite id that has a context node (some might not due to pruning |
1779 | // performed during matching of the allocation profile contexts). |
1780 | // The CallContextInfo contains the Call and a list of its stack ids with |
1781 | // ContextNodes, the function containing Call, and the set of context ids |
1782 | // the analysis will eventually identify for use in any new node created |
1783 | // for that callsite. |
1784 | DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; |
1785 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
1786 | for (auto &Call : CallsWithMetadata) { |
1787 | // Ignore allocations, already handled. |
1788 | if (AllocationCallToContextNodeMap.count(Call)) |
1789 | continue; |
1790 | auto StackIdsWithContextNodes = |
          getStackIdsWithContextNodesForCall(Call.call());
1792 | // If there were no nodes created for MIBs on allocs (maybe this was in |
1793 | // the unambiguous part of the MIB stack that was pruned), ignore. |
1794 | if (StackIdsWithContextNodes.empty()) |
1795 | continue; |
1796 | // Otherwise, record this Call along with the list of ids for the last |
1797 | // (outermost caller) stack id with a node. |
1798 | StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( |
1799 | {Call.call(), StackIdsWithContextNodes, Func, {}}); |
1800 | } |
1801 | } |
1802 | |
1803 | // First make a pass through all stack ids that correspond to a call, |
1804 | // as identified in the above loop. Compute the context ids corresponding to |
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
1807 | // there is more than one call with the same stack ids. Their (possibly newly |
1808 | // duplicated) context ids are saved in the StackIdToMatchingCalls map. |
1809 | DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; |
1810 | // Save a map from each call to any that are found to match it. I.e. located |
1811 | // in the same function and have the same (possibly pruned) stack ids. We use |
1812 | // this to avoid creating extra graph nodes as they can be treated the same. |
1813 | DenseMap<CallInfo, CallInfo> CallToMatchingCall; |
1814 | for (auto &It : StackIdToMatchingCalls) { |
1815 | auto &Calls = It.getSecond(); |
1816 | // Skip single calls with a single stack id. These don't need a new node. |
1817 | if (Calls.size() == 1) { |
1818 | auto &Ids = Calls[0].StackIds; |
1819 | if (Ids.size() == 1) |
1820 | continue; |
1821 | } |
1822 | // In order to do the best and maximal matching of inlined calls to context |
1823 | // node sequences we will sort the vectors of stack ids in descending order |
1824 | // of length, and within each length, lexicographically by stack id. The |
1825 | // latter is so that we can specially handle calls that have identical stack |
1826 | // id sequences (either due to cloning or artificially because of the MIB |
1827 | // context pruning). Those with the same Ids are then sorted by function to |
1828 | // facilitate efficiently mapping them to the same context node. |
1829 | // Because the functions are pointers, to ensure a stable sort first assign |
1830 | // each function pointer to its first index in the Calls array, and then use |
1831 | // that to sort by. |
1832 | DenseMap<const FuncTy *, unsigned> FuncToIndex; |
1833 | for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) |
1834 | FuncToIndex.insert({CallCtxInfo.Func, Idx}); |
1835 | llvm::stable_sort( |
1836 | Calls, |
1837 | [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { |
1838 | return A.StackIds.size() > B.StackIds.size() || |
1839 | (A.StackIds.size() == B.StackIds.size() && |
1840 | (A.StackIds < B.StackIds || |
1841 | (A.StackIds == B.StackIds && |
1842 | FuncToIndex[A.Func] < FuncToIndex[B.Func]))); |
1843 | }); |
1844 | |
1845 | // Find the node for the last stack id, which should be the same |
1846 | // across all calls recorded for this id, and is the id for this |
1847 | // entry in the StackIdToMatchingCalls map. |
1848 | uint64_t LastId = It.getFirst(); |
    ContextNode *LastNode = getNodeForStackId(LastId);
1850 | // We should only have kept stack ids that had nodes. |
1851 | assert(LastNode); |
1852 | |
1853 | if (LastNode->Recursive) |
1854 | continue; |
1855 | |
1856 | // Initialize the context ids with the last node's. We will subsequently |
1857 | // refine the context ids by computing the intersection along all edges. |
1858 | DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); |
1859 | assert(!LastNodeContextIds.empty()); |
1860 | |
1861 | #ifndef NDEBUG |
1862 | // Save the set of functions seen for a particular set of the same stack |
1863 | // ids. This is used to ensure that they have been correctly sorted to be |
1864 | // adjacent in the Calls list, since we rely on that to efficiently place |
1865 | // all such matching calls onto the same context node. |
1866 | DenseSet<const FuncTy *> MatchingIdsFuncSet; |
1867 | #endif |
1868 | |
1869 | for (unsigned I = 0; I < Calls.size(); I++) { |
1870 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1871 | assert(SavedContextIds.empty()); |
1872 | assert(LastId == Ids.back()); |
1873 | |
1874 | #ifndef NDEBUG |
1875 | // If this call has a different set of ids than the last one, clear the |
1876 | // set used to ensure they are sorted properly. |
1877 | if (I > 0 && Ids != Calls[I - 1].StackIds) |
1878 | MatchingIdsFuncSet.clear(); |
1879 | #endif |
1880 | |
1881 | // First compute the context ids for this stack id sequence (the |
1882 | // intersection of the context ids of the corresponding nodes). |
1883 | // Start with the remaining saved ids for the last node. |
1884 | assert(!LastNodeContextIds.empty()); |
1885 | DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; |
1886 | |
1887 | ContextNode *PrevNode = LastNode; |
1888 | ContextNode *CurNode = LastNode; |
1889 | bool Skip = false; |
1890 | |
1891 | // Iterate backwards through the stack Ids, starting after the last Id |
1892 | // in the list, which was handled once outside for all Calls. |
1893 | for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
1894 | auto Id = *IdIter; |
        CurNode = getNodeForStackId(Id);
1896 | // We should only have kept stack ids that had nodes. |
1897 | assert(CurNode); |
1898 | |
1899 | if (CurNode->Recursive) { |
1900 | Skip = true; |
1901 | break; |
1902 | } |
1903 | |
1904 | auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
1905 | // If there is no edge then the nodes belong to different MIB contexts, |
1906 | // and we should skip this inlined context sequence. For example, this |
1907 | // particular inlined context may include stack ids A->B, and we may |
1908 | // indeed have nodes for both A and B, but it is possible that they were |
1909 | // never profiled in sequence in a single MIB for any allocation (i.e. |
1910 | // we might have profiled an allocation that involves the callsite A, |
1911 | // but through a different one of its callee callsites, and we might |
1912 | // have profiled an allocation that involves callsite B, but reached |
1913 | // from a different caller callsite). |
1914 | if (!Edge) { |
1915 | Skip = true; |
1916 | break; |
1917 | } |
1918 | PrevNode = CurNode; |
1919 | |
1920 | // Update the context ids, which is the intersection of the ids along |
1921 | // all edges in the sequence. |
1922 | set_intersect(StackSequenceContextIds, Edge->getContextIds()); |
1923 | |
1924 | // If we now have no context ids for clone, skip this call. |
1925 | if (StackSequenceContextIds.empty()) { |
1926 | Skip = true; |
1927 | break; |
1928 | } |
1929 | } |
1930 | if (Skip) |
1931 | continue; |
1932 | |
1933 | // If some of this call's stack ids did not have corresponding nodes (due |
1934 | // to pruning), don't include any context ids for contexts that extend |
1935 | // beyond these nodes. Otherwise we would be matching part of unrelated / |
1936 | // not fully matching stack contexts. To do this, subtract any context ids |
1937 | // found in caller nodes of the last node found above. |
1938 | if (Ids.back() != getLastStackId(Call)) { |
1939 | for (const auto &PE : LastNode->CallerEdges) { |
1940 | set_subtract(StackSequenceContextIds, PE->getContextIds()); |
1941 | if (StackSequenceContextIds.empty()) |
1942 | break; |
1943 | } |
1944 | // If we now have no context ids for clone, skip this call. |
1945 | if (StackSequenceContextIds.empty()) |
1946 | continue; |
1947 | } |
1948 | |
1949 | #ifndef NDEBUG |
1950 | // If the prior call had the same stack ids this set would not be empty. |
1951 | // Check if we already have a call that "matches" because it is located |
1952 | // in the same function. If the Calls list was sorted properly we should |
1953 | // not encounter this situation as all such entries should be adjacent |
1954 | // and processed in bulk further below. |
1955 | assert(!MatchingIdsFuncSet.contains(Func)); |
1956 | |
1957 | MatchingIdsFuncSet.insert(Func); |
1958 | #endif |
1959 | |
1960 | // Check if the next set of stack ids is the same (since the Calls vector |
1961 | // of tuples is sorted by the stack ids we can just look at the next one). |
1962 | // If so, save them in the CallToMatchingCall map so that they get |
1963 | // assigned to the same context node, and skip them. |
1964 | bool DuplicateContextIds = false; |
1965 | for (unsigned J = I + 1; J < Calls.size(); J++) { |
1966 | auto &CallCtxInfo = Calls[J]; |
1967 | auto &NextIds = CallCtxInfo.StackIds; |
1968 | if (NextIds != Ids) |
1969 | break; |
1970 | auto *NextFunc = CallCtxInfo.Func; |
1971 | if (NextFunc != Func) { |
1972 | // We have another Call with the same ids but that cannot share this |
1973 | // node, must duplicate ids for it. |
1974 | DuplicateContextIds = true; |
1975 | break; |
1976 | } |
1977 | auto &NextCall = CallCtxInfo.Call; |
1978 | CallToMatchingCall[NextCall] = Call; |
1979 | // Update I so that it gets incremented correctly to skip this call. |
1980 | I = J; |
1981 | } |
1982 | |
1983 | // If we don't have duplicate context ids, then we can assign all the |
1984 | // context ids computed for the original node sequence to this call. |
1985 | // If there are duplicate calls with the same stack ids then we synthesize |
1986 | // new context ids that are duplicates of the originals. These are |
1987 | // assigned to SavedContextIds, which is a reference into the map entry |
1988 | // for this call, allowing us to access these ids later on. |
      OldToNewContextIds.reserve(OldToNewContextIds.size() +
1990 | StackSequenceContextIds.size()); |
1991 | SavedContextIds = |
1992 | DuplicateContextIds |
1993 | ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) |
1994 | : StackSequenceContextIds; |
1995 | assert(!SavedContextIds.empty()); |
1996 | |
1997 | if (!DuplicateContextIds) { |
1998 | // Update saved last node's context ids to remove those that are |
1999 | // assigned to other calls, so that it is ready for the next call at |
2000 | // this stack id. |
        set_subtract(LastNodeContextIds, StackSequenceContextIds);
2002 | if (LastNodeContextIds.empty()) |
2003 | break; |
2004 | } |
2005 | } |
2006 | } |
2007 | |
2008 | // Propagate the duplicate context ids over the graph. |
2009 | propagateDuplicateContextIds(OldToNewContextIds); |
2010 | |
2011 | if (VerifyCCG) |
2012 | check(); |
2013 | |
2014 | // Now perform a post-order traversal over the graph, starting with the |
2015 | // allocation nodes, essentially processing nodes from callers to callees. |
2016 | // For any that contains an id in the map, update the graph to contain new |
2017 | // nodes representing any inlining at interior callsites. Note we move the |
2018 | // associated context ids over to the new nodes. |
2019 | DenseSet<const ContextNode *> Visited; |
2020 | for (auto &Entry : AllocationCallToContextNodeMap) |
    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2022 | CallToMatchingCall); |
2023 | if (VerifyCCG) |
2024 | check(); |
2025 | } |
2026 | |
2027 | uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { |
2028 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
      Call->getMetadata(LLVMContext::MD_callsite));
2030 | return CallsiteContext.back(); |
2031 | } |
2032 | |
2033 | uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { |
2034 | assert(isa<CallsiteInfo *>(Call)); |
2035 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
  // Need to convert index into stack id.
  return Index.getStackIdAtIndex(CallsiteContext.back());
2039 | } |
2040 | |
static const std::string MemProfCloneSuffix = ".memprof.";
2042 | |
2043 | static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { |
2044 | // We use CloneNo == 0 to refer to the original version, which doesn't get |
2045 | // renamed with a suffix. |
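  // Clone N > 0 is named Base + ".memprof." + N, e.g. "foo.memprof.2".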
2046 | if (!CloneNo) |
2047 | return Base.str(); |
2048 | return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); |
2049 | } |
2050 | |
2051 | static bool isMemProfClone(const Function &F) { |
  return F.getName().contains(MemProfCloneSuffix);
2053 | } |
2054 | |
2055 | std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, |
2056 | const Instruction *Call, |
2057 | unsigned CloneNo) const { |
2058 | return (Twine(Call->getFunction()->getName()) + " -> " + |
          cast<CallBase>(Call)->getCalledFunction()->getName())
2060 | .str(); |
2061 | } |
2062 | |
2063 | std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, |
2064 | const IndexCall &Call, |
2065 | unsigned CloneNo) const { |
  auto VI = FSToVIMap.find(Func);
  assert(VI != FSToVIMap.end());
  if (isa<AllocInfo *>(Call))
    return (VI->second.name() + " -> alloc").str();
  else {
    auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
    return (VI->second.name() + " -> " +
            getMemProfFuncName(Callsite->Callee.name(),
                               Callsite->Clones[CloneNo]))
2075 | .str(); |
2076 | } |
2077 | } |
2078 | |
2079 | std::vector<uint64_t> |
2080 | ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( |
2081 | Instruction *Call) { |
2082 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
      Call->getMetadata(LLVMContext::MD_callsite));
2084 | return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( |
2085 | CallsiteContext); |
2086 | } |
2087 | |
2088 | std::vector<uint64_t> |
2089 | IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { |
2090 | assert(isa<CallsiteInfo *>(Call)); |
2091 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2093 | return getStackIdsWithContextNodes<CallsiteInfo, |
2094 | SmallVector<unsigned>::const_iterator>( |
2095 | CallsiteContext); |
2096 | } |
2097 | |
2098 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2099 | template <class NodeT, class IteratorT> |
2100 | std::vector<uint64_t> |
2101 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( |
2102 | CallStack<NodeT, IteratorT> &CallsiteContext) { |
2103 | std::vector<uint64_t> StackIds; |
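  // Walk the callsite context, stopping at the first stack id that has no
  // context node (contexts beyond that point were pruned).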
2104 | for (auto IdOrIndex : CallsiteContext) { |
2105 | auto StackId = getStackId(IdOrIndex); |
2106 | ContextNode *Node = getNodeForStackId(StackId); |
2107 | if (!Node) |
2108 | break; |
2109 | StackIds.push_back(StackId); |
2110 | } |
2111 | return StackIds; |
2112 | } |
2113 | |
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2115 | Module &M, |
2116 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) |
2117 | : Mod(M), OREGetter(OREGetter) { |
2118 | for (auto &F : M) { |
2119 | std::vector<CallInfo> CallsWithMetadata; |
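    // Collect the calls in this function carrying memprof and/or callsite
    // metadata.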
2120 | for (auto &BB : F) { |
2121 | for (auto &I : BB) { |
        if (!isa<CallBase>(I))
          continue;
        if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
          CallsWithMetadata.push_back(&I);
          auto *AllocNode = addAllocNode(&I, &F);
          auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2128 | assert(CallsiteMD); |
2129 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); |
2130 | // Add all of the MIBs and their stack nodes. |
2131 | for (auto &MDOp : MemProfMD->operands()) { |
            auto *MIBMD = cast<const MDNode>(MDOp);
2133 | std::vector<ContextTotalSize> ContextSizeInfo; |
2134 | // Collect the context size information if it exists. |
2135 | if (MIBMD->getNumOperands() > 2) { |
2136 | for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) { |
2137 | MDNode *ContextSizePair = |
                    dyn_cast<MDNode>(MIBMD->getOperand(I));
                assert(ContextSizePair->getNumOperands() == 2);
                uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
                                           ContextSizePair->getOperand(0))
                                           ->getZExtValue();
                uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
                                         ContextSizePair->getOperand(1))
                                         ->getZExtValue();
                ContextSizeInfo.push_back({FullStackId, TotalSize});
2147 | } |
2148 | } |
            MDNode *StackNode = getMIBStackNode(MIBMD);
2150 | assert(StackNode); |
2151 | CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); |
2152 | addStackNodesForMIB<MDNode, MDNode::op_iterator>( |
2153 | AllocNode, StackContext, CallsiteContext, |
                getMIBAllocType(MIBMD), ContextSizeInfo);
2155 | } |
2156 | // If exporting the graph to dot and an allocation id of interest was |
2157 | // specified, record all the context ids for this allocation node. |
2158 | if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot) |
2159 | DotAllocContextIds = AllocNode->getContextIds(); |
2160 | assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
2161 | // Memprof and callsite metadata on memory allocations no longer |
2162 | // needed. |
          I.setMetadata(LLVMContext::MD_memprof, nullptr);
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
2165 | } |
2166 | // For callsite metadata, add to list for this function for later use. |
        else if (I.getMetadata(LLVMContext::MD_callsite)) {
          CallsWithMetadata.push_back(&I);
2169 | } |
2170 | } |
2171 | } |
2172 | if (!CallsWithMetadata.empty()) |
2173 | FuncToCallsWithMetadata[&F] = CallsWithMetadata; |
2174 | } |
2175 | |
2176 | if (DumpCCG) { |
2177 | dbgs() << "CCG before updating call stack chains:\n" ; |
2178 | dbgs() << *this; |
2179 | } |
2180 | |
2181 | if (ExportToDot) |
2182 | exportToDot(Label: "prestackupdate" ); |
2183 | |
2184 | updateStackNodes(); |
2185 | |
2186 | if (ExportToDot) |
2187 | exportToDot(Label: "poststackupdate" ); |
2188 | |
2189 | handleCallsitesWithMultipleTargets(); |
2190 | |
2191 | markBackedges(); |
2192 | |
2193 | // Strip off remaining callsite metadata, no longer needed. |
2194 | for (auto &FuncEntry : FuncToCallsWithMetadata) |
2195 | for (auto &Call : FuncEntry.second) |
      Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2197 | } |
2198 | |
2199 | IndexCallsiteContextGraph::IndexCallsiteContextGraph( |
2200 | ModuleSummaryIndex &Index, |
2201 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
2202 | isPrevailing) |
2203 | : Index(Index), isPrevailing(isPrevailing) { |
2204 | for (auto &I : Index) { |
    auto VI = Index.getValueInfo(I);
2206 | for (auto &S : VI.getSummaryList()) { |
2207 | // We should only add the prevailing nodes. Otherwise we may try to clone |
2208 | // in a weak copy that won't be linked (and may be different than the |
2209 | // prevailing version). |
2210 | // We only keep the memprof summary on the prevailing copy now when |
2211 | // building the combined index, as a space optimization, however don't |
2212 | // rely on this optimization. The linker doesn't resolve local linkage |
2213 | // values so don't check whether those are prevailing. |
      if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2215 | !isPrevailing(VI.getGUID(), S.get())) |
2216 | continue; |
      auto *FS = dyn_cast<FunctionSummary>(S.get());
2218 | if (!FS) |
2219 | continue; |
2220 | std::vector<CallInfo> CallsWithMetadata; |
2221 | if (!FS->allocs().empty()) { |
2222 | for (auto &AN : FS->mutableAllocs()) { |
2223 | // This can happen because of recursion elimination handling that |
2224 | // currently exists in ModuleSummaryAnalysis. Skip these for now. |
2225 | // We still added them to the summary because we need to be able to |
2226 | // correlate properly in applyImport in the backends. |
2227 | if (AN.MIBs.empty()) |
2228 | continue; |
2229 | IndexCall AllocCall(&AN); |
          CallsWithMetadata.push_back(AllocCall);
          auto *AllocNode = addAllocNode(AllocCall, FS);
2232 | // Pass an empty CallStack to the CallsiteContext (second) |
2233 | // parameter, since for ThinLTO we already collapsed out the inlined |
2234 | // stack ids on the allocation call during ModuleSummaryAnalysis. |
2235 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
2236 | EmptyContext; |
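          // I indexes the per-MIB context size info, which parallels AN.MIBs.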
2237 | unsigned I = 0; |
2238 | assert(!metadataMayIncludeContextSizeInfo() || |
2239 | AN.ContextSizeInfos.size() == AN.MIBs.size()); |
2240 | // Now add all of the MIBs and their stack nodes. |
2241 | for (auto &MIB : AN.MIBs) { |
2242 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
2243 | StackContext(&MIB); |
2244 | std::vector<ContextTotalSize> ContextSizeInfo; |
2245 | if (!AN.ContextSizeInfos.empty()) { |
2246 | for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I]) |
                ContextSizeInfo.push_back({FullStackId, TotalSize});
2248 | } |
2249 | addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( |
              AllocNode, StackContext, EmptyContext, MIB.AllocType,
2251 | ContextSizeInfo); |
2252 | I++; |
2253 | } |
2254 | // If exporting the graph to dot and an allocation id of interest was |
2255 | // specified, record all the context ids for this allocation node. |
2256 | if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot) |
2257 | DotAllocContextIds = AllocNode->getContextIds(); |
2258 | assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
          // Initialize version 0 on the summary alloc node to the current
          // alloc type, unless it has both types, in which case make it
          // default, so that if we are unable to clone, the original version
          // always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2264 | } |
2265 | } |
2266 | // For callsite metadata, add to list for this function for later use. |
2267 | if (!FS->callsites().empty()) |
2268 | for (auto &SN : FS->mutableCallsites()) { |
2269 | IndexCall StackNodeCall(&SN); |
            CallsWithMetadata.push_back(StackNodeCall);
2271 | } |
2272 | |
2273 | if (!CallsWithMetadata.empty()) |
2274 | FuncToCallsWithMetadata[FS] = CallsWithMetadata; |
2275 | |
2276 | if (!FS->allocs().empty() || !FS->callsites().empty()) |
2277 | FSToVIMap[FS] = VI; |
2278 | } |
2279 | } |
2280 | |
2281 | if (DumpCCG) { |
2282 | dbgs() << "CCG before updating call stack chains:\n" ; |
2283 | dbgs() << *this; |
2284 | } |
2285 | |
2286 | if (ExportToDot) |
2287 | exportToDot(Label: "prestackupdate" ); |
2288 | |
2289 | updateStackNodes(); |
2290 | |
2291 | if (ExportToDot) |
2292 | exportToDot(Label: "poststackupdate" ); |
2293 | |
2294 | handleCallsitesWithMultipleTargets(); |
2295 | |
2296 | markBackedges(); |
2297 | } |
2298 | |
2299 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2300 | void CallsiteContextGraph<DerivedCCG, FuncTy, |
2301 | CallTy>::handleCallsitesWithMultipleTargets() { |
2302 | // Look for and workaround callsites that call multiple functions. |
  // This can happen for indirect calls, which need better handling, and in
2304 | // more rare cases (e.g. macro expansion). |
2305 | // TODO: To fix this for indirect calls we will want to perform speculative |
2306 | // devirtualization using either the normal PGO info with ICP, or using the |
2307 | // information in the profiled MemProf contexts. We can do this prior to |
2308 | // this transformation for regular LTO, and for ThinLTO we can simulate that |
2309 | // effect in the summary and perform the actual speculative devirtualization |
2310 | // while cloning in the ThinLTO backend. |
2311 | |
2312 | // Keep track of the new nodes synthesized for discovered tail calls missing |
2313 | // from the profiled contexts. |
2314 | MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; |
2315 | |
2316 | std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode; |
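  // NewCallToNode records calls that become a node's new primary call; the
  // node map entries are added after the loop to avoid invalidating its
  // iteration.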
2317 | for (auto &Entry : NonAllocationCallToContextNodeMap) { |
2318 | auto *Node = Entry.second; |
2319 | assert(Node->Clones.empty()); |
2320 | // Check all node callees and see if in the same function. |
2321 | // We need to check all of the calls recorded in this Node, because in some |
2322 | // cases we may have had multiple calls with the same debug info calling |
2323 | // different callees. This can happen, for example, when an object is |
    // constructed in the parameter list - the destructor call of the object has
2325 | // the same debug info (line/col) as the call the object was passed to. |
2326 | // Here we will prune any that don't match all callee nodes. |
2327 | std::vector<CallInfo> AllCalls; |
2328 | AllCalls.reserve(Node->MatchingCalls.size() + 1); |
2329 | AllCalls.push_back(Node->Call); |
2330 | llvm::append_range(AllCalls, Node->MatchingCalls); |
2331 | |
2332 | // First see if we can partition the calls by callee function, creating new |
2333 | // nodes to host each set of calls calling the same callees. This is |
    // necessary to support indirect calls with ThinLTO, for which we
2335 | // synthesized CallsiteInfo records for each target. They will all have the |
2336 | // same callsite stack ids and would be sharing a context node at this |
2337 | // point. We need to perform separate cloning for each, which will be |
2338 | // applied along with speculative devirtualization in the ThinLTO backends |
2339 | // as needed. Note this does not currently support looking through tail |
2340 | // calls, it is unclear if we need that for indirect call targets. |
2341 | // First partition calls by callee func. Map indexed by func, value is |
2342 | // struct with list of matching calls, assigned node. |
2343 | if (partitionCallsByCallee(Node, AllCalls, NewCallToNode)) |
2344 | continue; |
2345 | |
2346 | auto It = AllCalls.begin(); |
2347 | // Iterate through the calls until we find the first that matches. |
2348 | for (; It != AllCalls.end(); ++It) { |
2349 | auto ThisCall = *It; |
2350 | bool Match = true; |
2351 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); |
2352 | ++EI) { |
2353 | auto Edge = *EI; |
2354 | if (!Edge->Callee->hasCall()) |
2355 | continue; |
2356 | assert(NodeToCallingFunc.count(Edge->Callee)); |
2357 | // Check if the called function matches that of the callee node. |
        if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2359 | Match = false; |
2360 | break; |
2361 | } |
2362 | } |
2363 | // Found a call that matches the callee nodes, we can quit now. |
2364 | if (Match) { |
2365 | // If the first match is not the primary call on the Node, update it |
2366 | // now. We will update the list of matching calls further below. |
2367 | if (Node->Call != ThisCall) { |
2368 | Node->setCall(ThisCall); |
2369 | // We need to update the NonAllocationCallToContextNodeMap, but don't |
2370 | // want to do this during iteration over that map, so save the calls |
2371 | // that need updated entries. |
2372 | NewCallToNode.push_back({ThisCall, Node}); |
2373 | } |
2374 | break; |
2375 | } |
2376 | } |
2377 | // We will update this list below (or leave it cleared if there was no |
2378 | // match found above). |
2379 | Node->MatchingCalls.clear(); |
2380 | // If we hit the end of the AllCalls vector, no call matching the callee |
2381 | // nodes was found, clear the call information in the node. |
2382 | if (It == AllCalls.end()) { |
2383 | RemovedEdgesWithMismatchedCallees++; |
2384 | // Work around by setting Node to have a null call, so it gets |
2385 | // skipped during cloning. Otherwise assignFunctions will assert |
2386 | // because its data structures are not designed to handle this case. |
2387 | Node->setCall(CallInfo()); |
2388 | continue; |
2389 | } |
2390 | // Now add back any matching calls that call the same function as the |
2391 | // matching primary call on Node. |
2392 | for (++It; It != AllCalls.end(); ++It) { |
2393 | auto ThisCall = *It; |
2394 | if (!sameCallee(Node->Call.call(), ThisCall.call()))
2395 | continue; |
2396 | Node->MatchingCalls.push_back(ThisCall); |
2397 | } |
2398 | } |
2399 | |
2400 | // Remove all mismatched nodes identified in the above loop from the node map |
2401 | // (checking whether they have a null call which is set above). For a |
2402 | // MapVector like NonAllocationCallToContextNodeMap it is much more efficient |
2403 | // to do the removal via remove_if than by individually erasing entries above. |
2404 | // Also remove any entries if we updated the node's primary call above. |
2405 | NonAllocationCallToContextNodeMap.remove_if([](const auto &it) { |
2406 | return !it.second->hasCall() || it.second->Call != it.first; |
2407 | }); |
2408 | |
2409 | // Add entries for any new primary calls recorded above. |
2410 | for (auto &[Call, Node] : NewCallToNode) |
2411 | NonAllocationCallToContextNodeMap[Call] = Node; |
2412 | |
2413 | // Add the new nodes after the above loop so that the iteration is not |
2414 | // invalidated. |
2415 | for (auto &[Call, Node] : TailCallToContextNodeMap) |
2416 | NonAllocationCallToContextNodeMap[Call] = Node; |
2417 | } |
2418 | |
2419 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2420 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee( |
2421 | ContextNode *Node, ArrayRef<CallInfo> AllCalls, |
2422 | std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) { |
2423 | // Struct to keep track of all the calls having the same callee function, |
2424 | // and the context node we eventually assign to this group of calls (recorded
2425 | // once it is created or reused below).
2426 | struct CallsWithSameCallee { |
2427 | std::vector<CallInfo> Calls; |
2428 | ContextNode *Node = nullptr; |
2429 | }; |
2430 | |
2431 | // First partition calls by callee function. Build map from each function |
2432 | // to the list of matching calls. |
2433 | DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo; |
2434 | for (auto ThisCall : AllCalls) { |
2435 | auto *F = getCalleeFunc(ThisCall.call());
2436 | if (F) |
2437 | CalleeFuncToCallInfo[F].Calls.push_back(ThisCall); |
2438 | } |
2439 | |
2440 | // Next, walk through all callee edges. For each callee node, get its |
2441 | // containing function and see if it was recorded in the above map (meaning we |
2442 | // have at least one matching call). Build another map from each callee node |
2443 | // with a matching call to the structure instance created above containing all |
2444 | // the calls. |
2445 | DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo; |
2446 | for (const auto &Edge : Node->CalleeEdges) { |
2447 | if (!Edge->Callee->hasCall()) |
2448 | continue; |
2449 | const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
2450 | if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc)) |
2451 | CalleeNodeToCallInfo[Edge->Callee] = |
2452 | &CalleeFuncToCallInfo[ProfiledCalleeFunc]; |
2453 | } |
2454 | |
2455 | // If there are no entries in the second map, then there were no matching
2456 | // calls/callees and nothing to do here. Return so we can go on to the handling
2457 | // that looks through tail calls.
2458 | if (CalleeNodeToCallInfo.empty()) |
2459 | return false; |
2460 | |
2461 | // Walk through all callee edges again. Any and all callee edges that didn't |
2462 | // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a |
2463 | // new caller node (UnmatchedCalleesNode) which gets a null call so that it is |
2464 | // ignored during cloning. If it is in the map, then we use the node recorded |
2465 | // in that entry (creating it if needed), and move the callee edge to it. |
2466 | // The first callee will use the original node instead of creating a new one. |
2467 | // Note that any of the original calls on this node (in AllCalls) that didn't |
2468 | // have a callee function automatically get dropped from the node as part of |
2469 | // this process. |
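// For example (illustrative): if AllCalls contains two calls whose callee is A
// and one whose callee is B, the calls to A can remain on the original node
// (along with the callee edges into A), while the call to B is moved to a new
// node together with the callee edges into B.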
2470 | ContextNode *UnmatchedCalleesNode = nullptr; |
2471 | // Track whether we already assigned original node to a callee. |
2472 | bool UsedOrigNode = false; |
2473 | assert(NodeToCallingFunc[Node]); |
2474 | // Iterate over a copy of Node's callee edges, since we may need to remove |
2475 | // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and |
2476 | // makes it less error-prone. |
2477 | auto CalleeEdges = Node->CalleeEdges; |
2478 | for (auto &Edge : CalleeEdges) { |
2479 | if (!Edge->Callee->hasCall()) |
2480 | continue; |
2481 | |
2482 | // Will be updated below to point to whatever (caller) node this callee edge |
2483 | // should be moved to. |
2484 | ContextNode *CallerNodeToUse = nullptr; |
2485 | |
2486 | // Handle the case where there were no matching calls first. Move this |
2487 | // callee edge to the UnmatchedCalleesNode, creating it if needed. |
2488 | if (!CalleeNodeToCallInfo.contains(Edge->Callee)) { |
2489 | if (!UnmatchedCalleesNode) |
2490 | UnmatchedCalleesNode = |
2491 | createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2492 | CallerNodeToUse = UnmatchedCalleesNode; |
2493 | } else { |
2494 | // Look up the information recorded for this callee node, and use the |
2495 | // recorded caller node (creating it if needed). |
2496 | auto *Info = CalleeNodeToCallInfo[Edge->Callee]; |
2497 | if (!Info->Node) { |
2498 | // If we haven't yet assigned any callees to the original node, use it.
2499 | if (!UsedOrigNode) { |
2500 | Info->Node = Node; |
2501 | // Clear the set of matching calls which will be updated below. |
2502 | Node->MatchingCalls.clear(); |
2503 | UsedOrigNode = true; |
2504 | } else |
2505 | Info->Node = |
2506 | createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2507 | assert(!Info->Calls.empty()); |
2508 | // The first call becomes the primary call for this caller node, and the |
2509 | // rest go in the matching calls list. |
2510 | Info->Node->setCall(Info->Calls.front()); |
2511 | llvm::append_range(Info->Node->MatchingCalls, |
2512 | llvm::drop_begin(Info->Calls)); |
2513 | // Save the primary call to node correspondence so that we can update |
2514 | // the NonAllocationCallToContextNodeMap, which is being iterated in the |
2515 | // caller of this function. |
2516 | NewCallToNode.push_back({Info->Node->Call, Info->Node}); |
2517 | } |
2518 | CallerNodeToUse = Info->Node; |
2519 | } |
2520 | |
2521 | // Don't need to move the edge if we are using the original node.
2522 | if (CallerNodeToUse == Node) |
2523 | continue; |
2524 | |
2525 | moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2526 | } |
2527 | // Now that we are done moving edges, clean up any caller edges that ended |
2528 | // up with no type or context ids. During moveCalleeEdgeToNewCaller all |
2529 | // caller edges from Node are replicated onto the new callers, and it |
2530 | // simplifies the handling to leave them until we have moved all |
2531 | // edges/context ids. |
2532 | for (auto &I : CalleeNodeToCallInfo) |
2533 | removeNoneTypeCallerEdges(I.second->Node);
2534 | if (UnmatchedCalleesNode) |
2535 | removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2536 | removeNoneTypeCallerEdges(Node); |
2537 | |
2538 | return true; |
2539 | } |
2540 | |
2541 | uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
2542 | // In the Module (IR) case this is already the Id. |
2543 | return IdOrIndex; |
2544 | } |
2545 | |
2546 | uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
2547 | // In the Index case this is an index into the stack id list in the summary |
2548 | // index, convert it to an Id. |
2549 | return Index.getStackIdAtIndex(IdOrIndex);
2550 | } |
2551 | |
2552 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2553 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( |
2554 | CallTy Call, EdgeIter &EI, |
2555 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { |
2556 | auto Edge = *EI; |
2557 | const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
2558 | const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; |
2559 | // Will be populated in order of callee to caller if we find a chain of tail |
2560 | // calls between the profiled caller and callee. |
2561 | std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; |
2562 | if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2563 | FoundCalleeChain)) |
2564 | return false; |
2565 | |
2566 | // The usual case where the profiled callee matches that of the IR/summary. |
2567 | if (FoundCalleeChain.empty()) |
2568 | return true; |
2569 | |
2570 | auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { |
2571 | auto *CurEdge = Callee->findEdgeFromCaller(Caller); |
2572 | // If there is already an edge between these nodes, simply update it and |
2573 | // return. |
2574 | if (CurEdge) { |
2575 | CurEdge->ContextIds.insert_range(Edge->ContextIds); |
2576 | CurEdge->AllocTypes |= Edge->AllocTypes; |
2577 | return; |
2578 | } |
2579 | // Otherwise, create a new edge and insert it into the caller and callee |
2580 | // lists. |
2581 | auto NewEdge = std::make_shared<ContextEdge>( |
2582 | Callee, Caller, Edge->AllocTypes, Edge->ContextIds); |
2583 | Callee->CallerEdges.push_back(NewEdge); |
2584 | if (Caller == Edge->Caller) { |
2585 | // If we are inserting the new edge into the current edge's caller, insert |
2586 | // the new edge before the current iterator position, and then increment |
2587 | // back to the current edge. |
2588 | EI = Caller->CalleeEdges.insert(EI, NewEdge); |
2589 | ++EI; |
2590 | assert(*EI == Edge && |
2591 | "Iterator position not restored after insert and increment" ); |
2592 | } else |
2593 | Caller->CalleeEdges.push_back(NewEdge); |
2594 | }; |
2595 | |
2596 | // Create new nodes for each found callee and connect in between the profiled |
2597 | // caller and callee. |
2598 | auto *CurCalleeNode = Edge->Callee; |
2599 | for (auto &[NewCall, Func] : FoundCalleeChain) { |
2600 | ContextNode *NewNode = nullptr; |
2601 | // First check if we have already synthesized a node for this tail call. |
2602 | if (TailCallToContextNodeMap.count(NewCall)) { |
2603 | NewNode = TailCallToContextNodeMap[NewCall]; |
2604 | NewNode->AllocTypes |= Edge->AllocTypes; |
2605 | } else { |
2606 | FuncToCallsWithMetadata[Func].push_back({NewCall}); |
2607 | // Create Node and record node info. |
2608 | NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2609 | TailCallToContextNodeMap[NewCall] = NewNode; |
2610 | NewNode->AllocTypes = Edge->AllocTypes; |
2611 | } |
2612 | |
2613 | // Hook up node to its callee node |
2614 | AddEdge(NewNode, CurCalleeNode); |
2615 | |
2616 | CurCalleeNode = NewNode; |
2617 | } |
2618 | |
2619 | // Hook up edge's original caller to new callee node. |
2620 | AddEdge(Edge->Caller, CurCalleeNode); |
2621 | |
2622 | #ifndef NDEBUG |
2623 | // Save this because Edge's fields get cleared below when removed. |
2624 | auto *Caller = Edge->Caller; |
2625 | #endif |
2626 | |
2627 | // Remove old edge |
2628 | removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2629 | |
2630 | // To simplify the increment of EI in the caller, subtract one from EI. |
2631 | // In the final AddEdge call we would have either added a new callee edge
2632 | // to Edge->Caller, or found an existing one. Either way we are guaranteed
2633 | // that there is at least one callee edge. |
2634 | assert(!Caller->CalleeEdges.empty()); |
2635 | --EI; |
2636 | |
2637 | return true; |
2638 | } |
2639 | |
2640 | bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
2641 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
2642 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
2643 | bool &FoundMultipleCalleeChains) { |
2644 | // Stop recursive search if we have already explored the maximum specified |
2645 | // depth. |
2646 | if (Depth > TailCallSearchDepth) |
2647 | return false; |
2648 | |
2649 | auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { |
2650 | FoundCalleeChain.push_back({Callsite, F});
2651 | }; |
2652 | |
2653 | auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2654 | if (!CalleeFunc) {
2655 | auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2656 | assert(Alias);
2657 | CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2658 | assert(CalleeFunc); |
2659 | } |
2660 | |
2661 | // Look for tail calls in this function, and check if they either call the |
2662 | // profiled callee directly, or indirectly (via a recursive search). |
2663 | // Only succeed if there is a single unique tail call chain found between the |
2664 | // profiled caller and callee, otherwise we could perform incorrect cloning. |
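// For example (illustrative): if the profile recorded caller A calling the
// profiled callee C, but in the IR A actually calls B, which in turn tail
// calls C, then B's tail call to C is recorded in FoundCalleeChain so that the
// caller of this routine can synthesize a context node for it between A's and
// C's nodes.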
2665 | bool FoundSingleCalleeChain = false; |
2666 | for (auto &BB : *CalleeFunc) { |
2667 | for (auto &I : BB) { |
2668 | auto *CB = dyn_cast<CallBase>(&I);
2669 | if (!CB || !CB->isTailCall()) |
2670 | continue; |
2671 | auto *CalledValue = CB->getCalledOperand(); |
2672 | auto *CalledFunction = CB->getCalledFunction(); |
2673 | if (CalledValue && !CalledFunction) { |
2674 | CalledValue = CalledValue->stripPointerCasts(); |
2675 | // Stripping pointer casts can reveal a called function. |
2676 | CalledFunction = dyn_cast<Function>(CalledValue);
2677 | } |
2678 | // Check if this is an alias to a function. If so, get the |
2679 | // called aliasee for the checks below. |
2680 | if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2681 | assert(!CalledFunction && |
2682 | "Expected null called function in callsite for alias" ); |
2683 | CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2684 | } |
2685 | if (!CalledFunction) |
2686 | continue; |
2687 | if (CalledFunction == ProfiledCallee) { |
2688 | if (FoundSingleCalleeChain) { |
2689 | FoundMultipleCalleeChains = true; |
2690 | return false; |
2691 | } |
2692 | FoundSingleCalleeChain = true; |
2693 | FoundProfiledCalleeCount++; |
2694 | FoundProfiledCalleeDepth += Depth; |
2695 | if (Depth > FoundProfiledCalleeMaxDepth) |
2696 | FoundProfiledCalleeMaxDepth = Depth; |
2697 | SaveCallsiteInfo(&I, CalleeFunc); |
2698 | } else if (findProfiledCalleeThroughTailCalls( |
2699 | ProfiledCallee, CalledFunction, Depth + 1,
2700 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2701 | // findProfiledCalleeThroughTailCalls should not have returned |
2702 | // true if FoundMultipleCalleeChains. |
2703 | assert(!FoundMultipleCalleeChains); |
2704 | if (FoundSingleCalleeChain) { |
2705 | FoundMultipleCalleeChains = true; |
2706 | return false; |
2707 | } |
2708 | FoundSingleCalleeChain = true; |
2709 | SaveCallsiteInfo(&I, CalleeFunc); |
2710 | } else if (FoundMultipleCalleeChains) |
2711 | return false; |
2712 | } |
2713 | } |
2714 | |
2715 | return FoundSingleCalleeChain; |
2716 | } |
2717 | |
2718 | const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) { |
2719 | auto *CB = dyn_cast<CallBase>(Call);
2720 | if (!CB->getCalledOperand() || CB->isIndirectCall()) |
2721 | return nullptr; |
2722 | auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
2723 | auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2724 | if (Alias)
2725 | return dyn_cast<Function>(Alias->getAliasee());
2726 | return dyn_cast<Function>(CalleeVal);
2727 | } |
2728 | |
2729 | bool ModuleCallsiteContextGraph::calleeMatchesFunc( |
2730 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
2731 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { |
2732 | auto *CB = dyn_cast<CallBase>(Call);
2733 | if (!CB->getCalledOperand() || CB->isIndirectCall()) |
2734 | return false; |
2735 | auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
2736 | auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2737 | if (CalleeFunc == Func) |
2738 | return true; |
2739 | auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2740 | if (Alias && Alias->getAliasee() == Func) |
2741 | return true; |
2742 | |
2743 | // Recursively search for the profiled callee through tail calls starting with |
2744 | // the actual Callee. The discovered tail call chain is saved in |
2745 | // FoundCalleeChain, and we will fix up the graph to include these callsites
2746 | // after returning. |
2747 | // FIXME: We will currently redo the same recursive walk if we find the same |
2748 | // mismatched callee from another callsite. We can improve this with more |
2749 | // bookkeeping of the created chain of new nodes for each mismatch. |
2750 | unsigned Depth = 1; |
2751 | bool FoundMultipleCalleeChains = false; |
2752 | if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2753 | FoundCalleeChain, |
2754 | FoundMultipleCalleeChains)) { |
2755 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " |
2756 | << Func->getName() << " from " << CallerFunc->getName() |
2757 | << " that actually called " << CalleeVal->getName() |
2758 | << (FoundMultipleCalleeChains |
2759 | ? " (found multiple possible chains)" |
2760 | : "" ) |
2761 | << "\n" ); |
2762 | if (FoundMultipleCalleeChains) |
2763 | FoundProfiledCalleeNonUniquelyCount++; |
2764 | return false; |
2765 | } |
2766 | |
2767 | return true; |
2768 | } |
2769 | |
2770 | bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1, |
2771 | Instruction *Call2) { |
2772 | auto *CB1 = cast<CallBase>(Call1);
2773 | if (!CB1->getCalledOperand() || CB1->isIndirectCall()) |
2774 | return false; |
2775 | auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts(); |
2776 | auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2777 | auto *CB2 = cast<CallBase>(Call2);
2778 | if (!CB2->getCalledOperand() || CB2->isIndirectCall()) |
2779 | return false; |
2780 | auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts(); |
2781 | auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2782 | return CalleeFunc1 == CalleeFunc2; |
2783 | } |
2784 | |
2785 | bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
2786 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
2787 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
2788 | bool &FoundMultipleCalleeChains) { |
2789 | // Stop recursive search if we have already explored the maximum specified |
2790 | // depth. |
2791 | if (Depth > TailCallSearchDepth) |
2792 | return false; |
2793 | |
2794 | auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { |
2795 | // Make a CallsiteInfo for each discovered callee, if one hasn't already |
2796 | // been synthesized. |
2797 | if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2798 | !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2799 | // StackIds is empty (we don't have debug info available in the index for |
2800 | // these callsites) |
2801 | FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] = |
2802 | std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2803 | CallsiteInfo *NewCallsiteInfo = |
2804 | FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get(); |
2805 | FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2806 | }; |
2807 | |
2808 | // Look for tail calls in this function, and check if they either call the |
2809 | // profiled callee directly, or indirectly (via a recursive search). |
2810 | // Only succeed if there is a single unique tail call chain found between the |
2811 | // profiled caller and callee, otherwise we could perform incorrect cloning. |
2812 | bool FoundSingleCalleeChain = false; |
2813 | for (auto &S : CurCallee.getSummaryList()) { |
2814 | if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2815 | !isPrevailing(CurCallee.getGUID(), S.get())) |
2816 | continue; |
2817 | auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2818 | if (!FS) |
2819 | continue; |
2820 | auto FSVI = CurCallee; |
2821 | auto *AS = dyn_cast<AliasSummary>(S.get());
2822 | if (AS) |
2823 | FSVI = AS->getAliaseeVI(); |
2824 | for (auto &CallEdge : FS->calls()) { |
2825 | if (!CallEdge.second.hasTailCall()) |
2826 | continue; |
2827 | if (CallEdge.first == ProfiledCallee) { |
2828 | if (FoundSingleCalleeChain) { |
2829 | FoundMultipleCalleeChains = true; |
2830 | return false; |
2831 | } |
2832 | FoundSingleCalleeChain = true; |
2833 | FoundProfiledCalleeCount++; |
2834 | FoundProfiledCalleeDepth += Depth; |
2835 | if (Depth > FoundProfiledCalleeMaxDepth) |
2836 | FoundProfiledCalleeMaxDepth = Depth; |
2837 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2838 | // Add FS to FSToVIMap in case it isn't already there. |
2839 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2840 | FSToVIMap[FS] = FSVI; |
2841 | } else if (findProfiledCalleeThroughTailCalls( |
2842 | ProfiledCallee, CallEdge.first, Depth + 1,
2843 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2844 | // findProfiledCalleeThroughTailCalls should not have returned |
2845 | // true if FoundMultipleCalleeChains. |
2846 | assert(!FoundMultipleCalleeChains); |
2847 | if (FoundSingleCalleeChain) { |
2848 | FoundMultipleCalleeChains = true; |
2849 | return false; |
2850 | } |
2851 | FoundSingleCalleeChain = true; |
2852 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2853 | // Add FS to FSToVIMap in case it isn't already there. |
2854 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2855 | FSToVIMap[FS] = FSVI; |
2856 | } else if (FoundMultipleCalleeChains) |
2857 | return false; |
2858 | } |
2859 | } |
2860 | |
2861 | return FoundSingleCalleeChain; |
2862 | } |
2863 | |
2864 | const FunctionSummary * |
2865 | IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) { |
2866 | ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2867 | if (Callee.getSummaryList().empty()) |
2868 | return nullptr; |
2869 | return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2870 | } |
2871 | |
2872 | bool IndexCallsiteContextGraph::calleeMatchesFunc( |
2873 | IndexCall &Call, const FunctionSummary *Func, |
2874 | const FunctionSummary *CallerFunc, |
2875 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { |
2876 | ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2877 | // If there is no summary list then this is a call to an externally defined |
2878 | // symbol. |
2879 | AliasSummary *Alias = |
2880 | Callee.getSummaryList().empty() |
2881 | ? nullptr |
2882 | : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2883 | assert(FSToVIMap.count(Func)); |
2884 | auto FuncVI = FSToVIMap[Func]; |
2885 | if (Callee == FuncVI || |
2886 | // If callee is an alias, check the aliasee, since only function |
2887 | // summary base objects will contain the stack node summaries and thus |
2888 | // get a context node. |
2889 | (Alias && Alias->getAliaseeVI() == FuncVI)) |
2890 | return true; |
2891 | |
2892 | // Recursively search for the profiled callee through tail calls starting with |
2893 | // the actual Callee. The discovered tail call chain is saved in |
2894 | // FoundCalleeChain, and we will fix up the graph to include these callsites
2895 | // after returning. |
2896 | // FIXME: We will currently redo the same recursive walk if we find the same |
2897 | // mismatched callee from another callsite. We can improve this with more |
2898 | // bookkeeping of the created chain of new nodes for each mismatch. |
2899 | unsigned Depth = 1; |
2900 | bool FoundMultipleCalleeChains = false; |
2901 | if (!findProfiledCalleeThroughTailCalls( |
2902 | FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2903 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI |
2904 | << " from " << FSToVIMap[CallerFunc] |
2905 | << " that actually called " << Callee |
2906 | << (FoundMultipleCalleeChains |
2907 | ? " (found multiple possible chains)" |
2908 | : "" ) |
2909 | << "\n" ); |
2910 | if (FoundMultipleCalleeChains) |
2911 | FoundProfiledCalleeNonUniquelyCount++; |
2912 | return false; |
2913 | } |
2914 | |
2915 | return true; |
2916 | } |
2917 | |
2918 | bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { |
2919 | ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2920 | ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2921 | return Callee1 == Callee2; |
2922 | } |
2923 | |
2924 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2925 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() |
2926 | const { |
2927 | print(dbgs());
2928 | dbgs() << "\n" ; |
2929 | } |
2930 | |
2931 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2932 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( |
2933 | raw_ostream &OS) const { |
2934 | OS << "Node " << this << "\n" ; |
2935 | OS << "\t" ; |
2936 | printCall(OS); |
2937 | if (Recursive) |
2938 | OS << " (recursive)" ; |
2939 | OS << "\n" ; |
2940 | if (!MatchingCalls.empty()) { |
2941 | OS << "\tMatchingCalls:\n" ; |
2942 | for (auto &MatchingCall : MatchingCalls) { |
2943 | OS << "\t" ; |
2944 | MatchingCall.print(OS); |
2945 | OS << "\n" ; |
2946 | } |
2947 | } |
2948 | OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n" ; |
2949 | OS << "\tContextIds:" ; |
2950 | // Make a copy of the computed context ids that we can sort for stability. |
2951 | auto ContextIds = getContextIds(); |
2952 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2953 | std::sort(SortedIds.begin(), SortedIds.end());
2954 | for (auto Id : SortedIds) |
2955 | OS << " " << Id; |
2956 | OS << "\n" ; |
2957 | OS << "\tCalleeEdges:\n" ; |
2958 | for (auto &Edge : CalleeEdges) |
2959 | OS << "\t\t" << *Edge << "\n" ; |
2960 | OS << "\tCallerEdges:\n" ; |
2961 | for (auto &Edge : CallerEdges) |
2962 | OS << "\t\t" << *Edge << "\n" ; |
2963 | if (!Clones.empty()) { |
2964 | OS << "\tClones: " << llvm::interleaved(Clones) << "\n" ; |
2965 | } else if (CloneOf) { |
2966 | OS << "\tClone of " << CloneOf << "\n" ; |
2967 | } |
2968 | } |
2969 | |
2970 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2971 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() |
2972 | const { |
2973 | print(dbgs());
2974 | dbgs() << "\n" ; |
2975 | } |
2976 | |
2977 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2978 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( |
2979 | raw_ostream &OS) const { |
2980 | OS << "Edge from Callee " << Callee << " to Caller: " << Caller |
2981 | << (IsBackedge ? " (BE)" : "" ) |
2982 | << " AllocTypes: " << getAllocTypeString(AllocTypes); |
2983 | OS << " ContextIds:" ; |
2984 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2985 | std::sort(SortedIds.begin(), SortedIds.end());
2986 | for (auto Id : SortedIds) |
2987 | OS << " " << Id; |
2988 | } |
2989 | |
2990 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2991 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { |
2992 | print(dbgs());
2993 | } |
2994 | |
2995 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2996 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( |
2997 | raw_ostream &OS) const { |
2998 | OS << "Callsite Context Graph:\n" ; |
2999 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3000 | for (const auto Node : nodes<GraphType>(this)) { |
3001 | if (Node->isRemoved()) |
3002 | continue; |
3003 | Node->print(OS); |
3004 | OS << "\n" ; |
3005 | } |
3006 | } |
3007 | |
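// Print, for each context reaching an allocation node, its total profiled size
// and the allocation type it ended up with after cloning. Each entry is
// emitted as a line of the form (illustrative, field values are examples):
//   MemProf hinting: cold full allocation context <full stack id> with total
//   size <bytes> is notcold after cloning (context id <id>)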
3008 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3009 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes( |
3010 | raw_ostream &OS) const { |
3011 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3012 | for (const auto Node : nodes<GraphType>(this)) { |
3013 | if (Node->isRemoved()) |
3014 | continue; |
3015 | if (!Node->IsAllocation) |
3016 | continue; |
3017 | DenseSet<uint32_t> ContextIds = Node->getContextIds(); |
3018 | auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3019 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
3020 | std::sort(SortedIds.begin(), SortedIds.end());
3021 | for (auto Id : SortedIds) { |
3022 | auto TypeI = ContextIdToAllocationType.find(Id);
3023 | assert(TypeI != ContextIdToAllocationType.end()); |
3024 | auto CSI = ContextIdToContextSizeInfos.find(Id);
3025 | if (CSI != ContextIdToContextSizeInfos.end()) { |
3026 | for (auto &Info : CSI->second) { |
3027 | OS << "MemProf hinting: " |
3028 | << getAllocTypeString((uint8_t)TypeI->second)
3029 | << " full allocation context " << Info.FullStackId |
3030 | << " with total size " << Info.TotalSize << " is " |
3031 | << getAllocTypeString(Node->AllocTypes) << " after cloning" ; |
3032 | if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall) |
3033 | OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3034 | << " due to cold byte percent" ; |
3035 | // Print the internal context id to aid debugging and visualization. |
3036 | OS << " (context id " << Id << ")" ; |
3037 | OS << "\n" ; |
3038 | } |
3039 | } |
3040 | } |
3041 | } |
3042 | } |
3043 | |
3044 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3045 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { |
3046 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3047 | for (const auto Node : nodes<GraphType>(this)) { |
3048 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
3049 | for (auto &Edge : Node->CallerEdges) |
3050 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
3051 | } |
3052 | } |
3053 | |
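// GraphTraits specialization so that generic graph algorithms and the DOT
// writer below can traverse the CallsiteContextGraph: the node list is the set
// of context nodes owned by NodeOwner, and a node's children are the callee
// nodes reached via its callee edges.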
3054 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3055 | struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { |
3056 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3057 | using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; |
3058 | |
3059 | using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; |
3060 | static NodeRef getNode(const NodePtrTy &P) { return P.get(); } |
3061 | |
3062 | using nodes_iterator = |
3063 | mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, |
3064 | decltype(&getNode)>; |
3065 | |
3066 | static nodes_iterator nodes_begin(GraphType G) { |
3067 | return nodes_iterator(G->NodeOwner.begin(), &getNode); |
3068 | } |
3069 | |
3070 | static nodes_iterator nodes_end(GraphType G) { |
3071 | return nodes_iterator(G->NodeOwner.end(), &getNode); |
3072 | } |
3073 | |
3074 | static NodeRef getEntryNode(GraphType G) { |
3075 | return G->NodeOwner.begin()->get(); |
3076 | } |
3077 | |
3078 | using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; |
3079 | static const ContextNode<DerivedCCG, FuncTy, CallTy> * |
3080 | GetCallee(const EdgePtrTy &P) { |
3081 | return P->Callee; |
3082 | } |
3083 | |
3084 | using ChildIteratorType = |
3085 | mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< |
3086 | DerivedCCG, FuncTy, CallTy>>>::const_iterator, |
3087 | decltype(&GetCallee)>; |
3088 | |
3089 | static ChildIteratorType child_begin(NodeRef N) { |
3090 | return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); |
3091 | } |
3092 | |
3093 | static ChildIteratorType child_end(NodeRef N) { |
3094 | return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); |
3095 | } |
3096 | }; |
3097 | |
3098 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3099 | struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> |
3100 | : public DefaultDOTGraphTraits { |
3101 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) { |
3102 | // If the user requested the full graph to be exported, but provided an |
3103 | // allocation id, or if the user gave a context id and requested more than |
3104 | // just a specific context to be exported, note that highlighting is |
3105 | // enabled. |
3106 | DoHighlight = |
3107 | (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) || |
3108 | (ContextIdForDot.getNumOccurrences() && |
3109 | DotGraphScope != DotScope::Context); |
3110 | } |
3111 | |
3112 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3113 | using GTraits = GraphTraits<GraphType>; |
3114 | using NodeRef = typename GTraits::NodeRef; |
3115 | using ChildIteratorType = typename GTraits::ChildIteratorType; |
3116 | |
3117 | static std::string getNodeLabel(NodeRef Node, GraphType G) { |
3118 | std::string LabelString = |
3119 | (Twine("OrigId: " ) + (Node->IsAllocation ? "Alloc" : "" ) + |
3120 | Twine(Node->OrigStackOrAllocId)) |
3121 | .str(); |
3122 | LabelString += "\n" ; |
3123 | if (Node->hasCall()) { |
3124 | auto Func = G->NodeToCallingFunc.find(Node); |
3125 | assert(Func != G->NodeToCallingFunc.end()); |
3126 | LabelString += |
3127 | G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); |
3128 | } else { |
3129 | LabelString += "null call" ; |
3130 | if (Node->Recursive) |
3131 | LabelString += " (recursive)" ; |
3132 | else |
3133 | LabelString += " (external)" ; |
3134 | } |
3135 | return LabelString; |
3136 | } |
3137 | |
3138 | static std::string getNodeAttributes(NodeRef Node, GraphType G) { |
3139 | auto ContextIds = Node->getContextIds(); |
3140 | // If highlighting enabled, see if this node contains any of the context ids |
3141 | // of interest. If so, it will use a different color and a larger fontsize |
3142 | // (which makes the node larger as well). |
3143 | bool Highlight = false; |
3144 | if (DoHighlight) { |
3145 | assert(ContextIdForDot.getNumOccurrences() || |
3146 | AllocIdForDot.getNumOccurrences()); |
3147 | if (ContextIdForDot.getNumOccurrences()) |
3148 | Highlight = ContextIds.contains(ContextIdForDot); |
3149 | else |
3150 | Highlight = set_intersects(ContextIds, G->DotAllocContextIds); |
3151 | } |
3152 | std::string AttributeString = (Twine("tooltip=\"" ) + getNodeId(Node) + " " + |
3153 | getContextIds(ContextIds) + "\"" ) |
3154 | .str(); |
3155 | // Default fontsize is 14 |
3156 | if (Highlight) |
3157 | AttributeString += ",fontsize=\"30\"" ; |
3158 | AttributeString += |
3159 | (Twine(",fillcolor=\"" ) + getColor(Node->AllocTypes, Highlight) + "\"" )
3160 | .str(); |
3161 | if (Node->CloneOf) { |
3162 | AttributeString += ",color=\"blue\"" ; |
3163 | AttributeString += ",style=\"filled,bold,dashed\"" ; |
3164 | } else |
3165 | AttributeString += ",style=\"filled\"" ; |
3166 | return AttributeString; |
3167 | } |
3168 | |
3169 | static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, |
3170 | GraphType G) { |
3171 | auto &Edge = *(ChildIter.getCurrent()); |
3172 | // If highlighting enabled, see if this edge contains any of the context ids |
3173 | // of interest. If so, it will use a different color and a heavier arrow |
3174 | // size and weight (the larger weight makes the highlighted path |
3175 | // straighter). |
3176 | bool Highlight = false; |
3177 | if (DoHighlight) { |
3178 | assert(ContextIdForDot.getNumOccurrences() || |
3179 | AllocIdForDot.getNumOccurrences()); |
3180 | if (ContextIdForDot.getNumOccurrences()) |
3181 | Highlight = Edge->ContextIds.contains(ContextIdForDot); |
3182 | else |
3183 | Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds); |
3184 | } |
3185 | auto Color = getColor(Edge->AllocTypes, Highlight);
3186 | std::string AttributeString = |
3187 | (Twine("tooltip=\"" ) + getContextIds(Edge->ContextIds) + "\"" +
3188 | // fillcolor is the arrow head and color is the line |
3189 | Twine(",fillcolor=\"" ) + Color + "\"" + Twine(",color=\"" ) + Color + |
3190 | "\"" ) |
3191 | .str(); |
3192 | if (Edge->IsBackedge) |
3193 | AttributeString += ",style=\"dotted\"" ; |
3194 | // Default penwidth and weight are both 1. |
3195 | if (Highlight) |
3196 | AttributeString += ",penwidth=\"2.0\",weight=\"2\"" ; |
3197 | return AttributeString; |
3198 | } |
3199 | |
3200 | // Since the NodeOwner list includes nodes that are no longer connected to
3201 | // the graph, skip them here. |
3202 | static bool isNodeHidden(NodeRef Node, GraphType G) { |
3203 | if (Node->isRemoved()) |
3204 | return true; |
3205 | // If a scope smaller than the full graph was requested, see if this node |
3206 | // contains any of the context ids of interest. |
3207 | if (DotGraphScope == DotScope::Alloc) |
3208 | return !set_intersects(Node->getContextIds(), G->DotAllocContextIds); |
3209 | if (DotGraphScope == DotScope::Context) |
3210 | return !Node->getContextIds().contains(ContextIdForDot); |
3211 | return false; |
3212 | } |
3213 | |
3214 | private: |
3215 | static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { |
3216 | std::string IdString = "ContextIds:" ; |
3217 | if (ContextIds.size() < 100) { |
3218 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
3219 | std::sort(SortedIds.begin(), SortedIds.end());
3220 | for (auto Id : SortedIds) |
3221 | IdString += (" " + Twine(Id)).str(); |
3222 | } else { |
3223 | IdString += (" (" + Twine(ContextIds.size()) + " ids)" ).str(); |
3224 | } |
3225 | return IdString; |
3226 | } |
3227 | |
3228 | static std::string getColor(uint8_t AllocTypes, bool Highlight) { |
3229 | // If DoHighlight is not enabled, we want to use the highlight colors for |
3230 | // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is |
3231 | // both compatible with the color scheme before highlighting was supported, |
3232 | // and for the NotCold+Cold color the non-highlight color is a bit more |
3233 | // readable. |
3234 | if (AllocTypes == (uint8_t)AllocationType::NotCold) |
3235 | // Color "brown1" actually looks like a lighter red. |
3236 | return !DoHighlight || Highlight ? "brown1" : "lightpink" ; |
3237 | if (AllocTypes == (uint8_t)AllocationType::Cold) |
3238 | return !DoHighlight || Highlight ? "cyan" : "lightskyblue" ; |
3239 | if (AllocTypes == |
3240 | ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
3241 | return Highlight ? "magenta" : "mediumorchid1" ; |
3242 | return "gray" ; |
3243 | } |
3244 | |
3245 | static std::string getNodeId(NodeRef Node) { |
3246 | std::stringstream SStream; |
3247 | SStream << std::hex << "N0x" << (unsigned long long)Node; |
3248 | std::string Result = SStream.str(); |
3249 | return Result; |
3250 | } |
3251 | |
3252 | // True if we should highlight a specific context or allocation's contexts in |
3253 | // the emitted graph. |
3254 | static bool DoHighlight; |
3255 | }; |
3256 | |
3257 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3258 | bool DOTGraphTraits< |
3259 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight = |
3260 | false; |
3261 | |
3262 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3263 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( |
3264 | std::string Label) const { |
3265 | WriteGraph(this, "" , false, Label, |
3266 | DotFilePathPrefix + "ccg." + Label + ".dot" ); |
3267 | } |
3268 | |
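// Create a new clone of Edge's callee node and move the given context ids
// (or all of Edge's ids, if ContextIdsToMove is empty) from Edge over to the
// new clone, which is returned.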
3269 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3270 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
3271 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( |
3272 | const std::shared_ptr<ContextEdge> &Edge, |
3273 | DenseSet<uint32_t> ContextIdsToMove) { |
3274 | ContextNode *Node = Edge->Callee; |
3275 | assert(NodeToCallingFunc.count(Node)); |
3276 | ContextNode *Clone = |
3277 | createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3278 | Node->addClone(Clone); |
3279 | Clone->MatchingCalls = Node->MatchingCalls; |
3280 | moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3281 | ContextIdsToMove); |
3282 | return Clone; |
3283 | } |
3284 | |
3285 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3286 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
3287 | moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
3288 | ContextNode *NewCallee, bool NewClone, |
3289 | DenseSet<uint32_t> ContextIdsToMove) { |
3290 | // NewCallee and Edge's current callee must be clones of the same original |
3291 | // node (Edge's current callee may be the original node too). |
3292 | assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); |
3293 | |
3294 | bool EdgeIsRecursive = Edge->Callee == Edge->Caller; |
3295 | |
3296 | ContextNode *OldCallee = Edge->Callee; |
3297 | |
3298 | // We might already have an edge to the new callee from earlier cloning for a |
3299 | // different allocation. If one exists we will reuse it. |
3300 | auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); |
3301 | |
3302 | // Callers will pass an empty ContextIdsToMove set when they want to move the |
3303 | // edge. Copy in Edge's ids for simplicity. |
3304 | if (ContextIdsToMove.empty()) |
3305 | ContextIdsToMove = Edge->getContextIds(); |
3306 | |
3307 | // If we are moving all of Edge's ids, then just move the whole Edge. |
3308 | // Otherwise only move the specified subset, to a new edge if needed. |
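// For example (illustrative): if Edge carries context ids {1,2,3} and
// ContextIdsToMove is {1,2,3}, the edge itself is redirected to NewCallee (or
// merged into an existing edge from the same caller); if ContextIdsToMove is
// just {1}, id 1 is split onto a (possibly new) edge to NewCallee and removed
// from the original Edge.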
3309 | if (Edge->getContextIds().size() == ContextIdsToMove.size()) { |
3310 | // First, update the alloc types on New Callee from Edge. |
3311 | // Do this before we potentially clear Edge's fields below! |
3312 | NewCallee->AllocTypes |= Edge->AllocTypes; |
3313 | // Moving the whole Edge. |
3314 | if (ExistingEdgeToNewCallee) { |
3315 | // Since we already have an edge to NewCallee, simply move the ids |
3316 | // onto it, and remove the existing Edge. |
3317 | ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove); |
3318 | ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; |
3319 | assert(Edge->ContextIds == ContextIdsToMove); |
3320 | removeEdgeFromGraph(Edge.get());
3321 | } else { |
3322 | // Otherwise just reconnect Edge to NewCallee. |
3323 | Edge->Callee = NewCallee; |
3324 | NewCallee->CallerEdges.push_back(Edge); |
3325 | // Remove it from callee where it was previously connected. |
3326 | OldCallee->eraseCallerEdge(Edge.get()); |
3327 | // Don't need to update Edge's context ids since we are simply |
3328 | // reconnecting it. |
3329 | } |
3330 | } else { |
3331 | // Only moving a subset of Edge's ids. |
3332 | // Compute the alloc type of the subset of ids being moved. |
3333 | auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3334 | if (ExistingEdgeToNewCallee) { |
3335 | // Since we already have an edge to NewCallee, simply move the ids |
3336 | // onto it. |
3337 | ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove); |
3338 | ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; |
3339 | } else { |
3340 | // Otherwise, create a new edge to NewCallee for the ids being moved. |
3341 | auto NewEdge = std::make_shared<ContextEdge>( |
3342 | NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); |
3343 | Edge->Caller->CalleeEdges.push_back(NewEdge); |
3344 | NewCallee->CallerEdges.push_back(NewEdge); |
3345 | } |
3346 | // In either case, need to update the alloc types on NewCallee, and remove |
3347 | // those ids and update the alloc type on the original Edge. |
3348 | NewCallee->AllocTypes |= CallerEdgeAllocType; |
3349 | set_subtract(Edge->ContextIds, ContextIdsToMove); |
3350 | Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3351 | } |
3352 | // Now walk the old callee node's callee edges and move Edge's context ids |
3353 | // over to the corresponding edge into the clone (which is created here if |
3354 | // this is a newly created clone). |
3355 | for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { |
3356 | ContextNode *CalleeToUse = OldCalleeEdge->Callee; |
3357 | // If this is a direct recursion edge, use NewCallee (the clone) as the |
3358 | // callee as well, so that any edge updated/created here is also direct |
3359 | // recursive. |
3360 | if (CalleeToUse == OldCallee) { |
3361 | // If this is a recursive edge, see if we already moved a recursive edge |
3362 | // (which would have to have been this one) - if we were only moving a |
3363 | // subset of context ids it would still be on OldCallee. |
3364 | if (EdgeIsRecursive) { |
3365 | assert(OldCalleeEdge == Edge); |
3366 | continue; |
3367 | } |
3368 | CalleeToUse = NewCallee; |
3369 | } |
3370 | // The context ids moving to the new callee are the subset of this edge's |
3371 | // context ids and the context ids on the caller edge being moved. |
3372 | DenseSet<uint32_t> EdgeContextIdsToMove = |
3373 | set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); |
3374 | set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); |
3375 | OldCalleeEdge->AllocTypes = |
3376 | computeAllocType(OldCalleeEdge->getContextIds());
3377 | if (!NewClone) { |
3378 | // Update context ids / alloc type on corresponding edge to NewCallee. |
3379 | // There is a chance this may not exist if we are reusing an existing |
3380 | // clone, specifically during function assignment, where we would have |
3381 | // removed none type edges after creating the clone. If we can't find |
3382 | // a corresponding edge there, fall through to the cloning below. |
3383 | if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) { |
3384 | NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove); |
3385 | NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3386 | continue; |
3387 | } |
3388 | } |
3389 | auto NewEdge = std::make_shared<ContextEdge>( |
3390 | CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3391 | EdgeContextIdsToMove); |
3392 | NewCallee->CalleeEdges.push_back(NewEdge); |
3393 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
3394 | } |
3395 | // Recompute the node alloc type now that its callee edges have been |
3396 | // updated (since we will compute from those edges). |
3397 | OldCallee->AllocTypes = OldCallee->computeAllocType(); |
3398 | // OldCallee alloc type should be None iff its context id set is now empty. |
3399 | assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == |
3400 | OldCallee->emptyContextIds()); |
3401 | if (VerifyCCG) { |
3402 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); |
3403 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); |
3404 | for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) |
3405 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee, |
3406 | /*CheckEdges=*/false); |
3407 | for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) |
3408 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee, |
3409 | /*CheckEdges=*/false); |
3410 | } |
3411 | } |
3412 | |
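// Move the given callee edge so that NewCaller becomes its caller, copying the
// relevant context ids from the old caller's caller edges onto NewCaller. If
// the edge was direct recursive, it is made direct recursive on NewCaller.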
3413 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3414 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
3415 | moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge, |
3416 | ContextNode *NewCaller) { |
3417 | auto *OldCallee = Edge->Callee; |
3418 | auto *NewCallee = OldCallee; |
3419 | // If this edge was direct recursive, make any new/updated edge also direct |
3420 | // recursive to NewCaller. |
3421 | bool Recursive = Edge->Caller == Edge->Callee; |
3422 | if (Recursive) |
3423 | NewCallee = NewCaller; |
3424 | |
3425 | ContextNode *OldCaller = Edge->Caller; |
3426 | OldCaller->eraseCalleeEdge(Edge.get()); |
3427 | |
3428 | // We might already have an edge to the new caller. If one exists we will |
3429 | // reuse it. |
3430 | auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee); |
3431 | |
3432 | if (ExistingEdgeToNewCaller) { |
3433 | // Since we already have an edge to NewCaller, simply move the ids |
3434 | // onto it, and remove the existing Edge. |
3435 | ExistingEdgeToNewCaller->getContextIds().insert_range( |
3436 | Edge->getContextIds()); |
3437 | ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes; |
3438 | Edge->ContextIds.clear(); |
3439 | Edge->AllocTypes = (uint8_t)AllocationType::None; |
3440 | OldCallee->eraseCallerEdge(Edge.get()); |
3441 | } else { |
3442 | // Otherwise just reconnect Edge to NewCaller. |
3443 | Edge->Caller = NewCaller; |
3444 | NewCaller->CalleeEdges.push_back(Edge); |
3445 | if (Recursive) { |
3446 | assert(NewCallee == NewCaller); |
3447 | // In the case of (direct) recursive edges, we update the callee as well |
3448 | // so that it becomes recursive on the new caller. |
3449 | Edge->Callee = NewCallee; |
3450 | NewCallee->CallerEdges.push_back(Edge); |
3451 | OldCallee->eraseCallerEdge(Edge.get()); |
3452 | } |
3453 | // Don't need to update Edge's context ids since we are simply |
3454 | // reconnecting it. |
3455 | } |
3456 | // In either case, need to update the alloc types on New Caller. |
3457 | NewCaller->AllocTypes |= Edge->AllocTypes; |
3458 | |
3459 | // Now walk the old caller node's caller edges and move Edge's context ids |
3460 | // over to the corresponding edge into the node (which is created here if |
3461 | // this is a newly created node). We can tell whether this is a newly created |
3462 | // node by seeing if it has any caller edges yet. |
3463 | #ifndef NDEBUG |
3464 | bool IsNewNode = NewCaller->CallerEdges.empty(); |
3465 | #endif |
3466 | // If we just moved a direct recursive edge, presumably its context ids should |
3467 | // also flow out of OldCaller via some other non-recursive callee edge. We |
3468 | // don't want to remove the recursive context ids from other caller edges yet, |
3469 | // otherwise the context ids get into an inconsistent state on OldCaller. |
3470 | // We will update these context ids on the non-recursive caller edge when and |
3471 | // if they are updated on the non-recursive callee. |
3472 | if (!Recursive) { |
3473 | for (auto &OldCallerEdge : OldCaller->CallerEdges) { |
3474 | auto OldCallerCaller = OldCallerEdge->Caller; |
3475 | // The context ids moving to the new caller are the subset of this edge's |
3476 | // context ids and the context ids on the callee edge being moved. |
3477 | DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection( |
3478 | OldCallerEdge->getContextIds(), Edge->getContextIds()); |
3479 | if (OldCaller == OldCallerCaller) { |
3480 | OldCallerCaller = NewCaller; |
3481 | // Don't actually move this one. The caller will move it directly via a |
3482 | // call to this function with this as the Edge if it is appropriate to |
3483 | // move to a different node that has a matching callee (itself).
3484 | continue; |
3485 | } |
3486 | set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove); |
3487 | OldCallerEdge->AllocTypes = |
3488 | computeAllocType(OldCallerEdge->getContextIds());
3489 | // In this function we expect that any pre-existing node already has edges |
3490 | // from the same callers as the old node. That should be true in the |
3491 | // current use case, where we will remove None-type edges after copying |
3492 | // over all caller edges from the callee. |
3493 | auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller); |
3494 | // Since we would have skipped caller edges when moving a direct recursive |
3495 | // edge, this may not hold true when recursive handling is enabled.
3496 | assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites); |
3497 | if (ExistingCallerEdge) { |
3498 | ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove); |
3499 | ExistingCallerEdge->AllocTypes |= |
3500 | computeAllocType(EdgeContextIdsToMove);
3501 | continue; |
3502 | } |
3503 | auto NewEdge = std::make_shared<ContextEdge>( |
3504 | NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3505 | EdgeContextIdsToMove); |
3506 | NewCaller->CallerEdges.push_back(NewEdge); |
3507 | NewEdge->Caller->CalleeEdges.push_back(NewEdge); |
3508 | } |
3509 | } |
3510 | // Recompute the node alloc type now that its caller edges have been |
3511 | // updated (since we will compute from those edges). |
3512 | OldCaller->AllocTypes = OldCaller->computeAllocType(); |
3513 | // OldCaller alloc type should be None iff its context id set is now empty. |
3514 | assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) == |
3515 | OldCaller->emptyContextIds()); |
3516 | if (VerifyCCG) { |
3517 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false); |
3518 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false); |
3519 | for (const auto &OldCallerEdge : OldCaller->CallerEdges) |
3520 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller, |
3521 | /*CheckEdges=*/false); |
3522 | for (const auto &NewCallerEdge : NewCaller->CallerEdges) |
3523 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller, |
3524 | /*CheckEdges=*/false); |
3525 | } |
3526 | } |
3527 | |
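// Remove callee edges with a None alloc type (i.e. empty context ids) from
// Node, and recursively from its clones and from the caller nodes reached via
// its caller edges.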
3528 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3529 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
3530 | recursivelyRemoveNoneTypeCalleeEdges( |
3531 | ContextNode *Node, DenseSet<const ContextNode *> &Visited) { |
3532 | auto Inserted = Visited.insert(Node); |
3533 | if (!Inserted.second) |
3534 | return; |
3535 | |
3536 | removeNoneTypeCalleeEdges(Node); |
3537 | |
3538 | for (auto *Clone : Node->Clones) |
3539 | recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3540 | |
3541 | // The recursive call may remove some of this Node's caller edges. |
3542 | // Iterate over a copy and skip any that were removed. |
3543 | auto CallerEdges = Node->CallerEdges; |
3544 | for (auto &Edge : CallerEdges) { |
3545 | // Skip any that have been removed by an earlier recursive call. |
3546 | if (Edge->isRemoved()) { |
3547 | assert(!is_contained(Node->CallerEdges, Edge)); |
3548 | continue; |
3549 | } |
3550 | recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3551 | } |
3552 | } |
3553 | |
3554 | // This is the standard DFS based backedge discovery algorithm. |
3555 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3556 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() { |
3557 | // If we are cloning recursive contexts, find and mark backedges from all root |
3558 | // callers, using the typical DFS based backedge analysis. |
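// For example (illustrative): for mutually recursive contexts A -> B -> A, the
// DFS from the root will visit A and then B, and the callee edge from B back
// to A, which is still on the DFS stack at that point, is marked as a
// backedge.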
3559 | if (!CloneRecursiveContexts) |
3560 | return; |
3561 | DenseSet<const ContextNode *> Visited; |
3562 | DenseSet<const ContextNode *> CurrentStack; |
3563 | for (auto &Entry : NonAllocationCallToContextNodeMap) { |
3564 | auto *Node = Entry.second; |
3565 | if (Node->isRemoved()) |
3566 | continue; |
3567 | // It is a root if it doesn't have callers. |
3568 | if (!Node->CallerEdges.empty()) |
3569 | continue; |
3570 | markBackedges(Node, Visited, CurrentStack); |
3571 | assert(CurrentStack.empty()); |
3572 | } |
3573 | } |
3574 | |
3575 | // Recursive helper for above markBackedges method. |
3576 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3577 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges( |
3578 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
3579 | DenseSet<const ContextNode *> &CurrentStack) { |
3580 | auto I = Visited.insert(Node); |
3581 | // We should only call this for unvisited nodes. |
3582 | assert(I.second); |
3583 | (void)I; |
3584 | for (auto &CalleeEdge : Node->CalleeEdges) { |
3585 | auto *Callee = CalleeEdge->Callee; |
3586 | if (Visited.count(Callee)) { |
3587 | // Since this was already visited we need to check if it is currently on |
3588 | // the recursive stack in which case it is a backedge. |
3589 | if (CurrentStack.count(Callee)) |
3590 | CalleeEdge->IsBackedge = true; |
3591 | continue; |
3592 | } |
3593 | CurrentStack.insert(Callee); |
3594 | markBackedges(Callee, Visited, CurrentStack); |
3595 | CurrentStack.erase(Callee); |
3596 | } |
3597 | } |
3598 | |
3599 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3600 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { |
3601 | DenseSet<const ContextNode *> Visited; |
3602 | for (auto &Entry : AllocationCallToContextNodeMap) { |
3603 | Visited.clear(); |
3604 | identifyClones(Entry.second, Visited, Entry.second->getContextIds()); |
3605 | } |
3606 | Visited.clear(); |
3607 | for (auto &Entry : AllocationCallToContextNodeMap) |
    recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3609 | if (VerifyCCG) |
3610 | check(); |
3611 | } |
3612 | |
// Helper function to check whether an AllocType is cold, notcold, or both.
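// The AllocTypes fields used throughout hold a bitwise OR of AllocationType
// values, so the third alternative below corresponds to a node or edge
// reached by both cold and not-cold contexts.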
3614 | bool checkColdOrNotCold(uint8_t AllocType) { |
3615 | return (AllocType == (uint8_t)AllocationType::Cold) || |
3616 | (AllocType == (uint8_t)AllocationType::NotCold) || |
3617 | (AllocType == |
3618 | ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold)); |
3619 | } |
3620 | |
3621 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3622 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( |
3623 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
3624 | const DenseSet<uint32_t> &AllocContextIds) { |
3625 | if (VerifyNodes) |
3626 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
3627 | assert(!Node->CloneOf); |
3628 | |
  // If Node has a null call, then either it wasn't found in the module (regular
3630 | // LTO) or summary index (ThinLTO), or there were other conditions blocking |
3631 | // cloning (e.g. recursion, calls multiple targets, etc). |
3632 | // Do this here so that we don't try to recursively clone callers below, which |
3633 | // isn't useful at least for this node. |
3634 | if (!Node->hasCall()) |
3635 | return; |
3636 | |
3637 | // No need to look at any callers if allocation type already unambiguous. |
3638 | if (hasSingleAllocType(Node->AllocTypes)) |
3639 | return; |
3640 | |
3641 | #ifndef NDEBUG |
3642 | auto Insert = |
3643 | #endif |
3644 | Visited.insert(Node); |
3645 | // We should not have visited this node yet. |
3646 | assert(Insert.second); |
3647 | // The recursive call to identifyClones may delete the current edge from the |
3648 | // CallerEdges vector. Make a copy and iterate on that, simpler than passing |
3649 | // in an iterator and having recursive call erase from it. Other edges may |
3650 | // also get removed during the recursion, which will have null Callee and |
3651 | // Caller pointers (and are deleted later), so we skip those below. |
3652 | { |
3653 | auto CallerEdges = Node->CallerEdges; |
3654 | for (auto &Edge : CallerEdges) { |
3655 | // Skip any that have been removed by an earlier recursive call. |
3656 | if (Edge->isRemoved()) { |
3657 | assert(!is_contained(Node->CallerEdges, Edge)); |
3658 | continue; |
3659 | } |
3660 | // Defer backedges. See comments further below where these edges are |
3661 | // handled during the cloning of this Node. |
3662 | if (Edge->IsBackedge) { |
3663 | // We should only mark these if cloning recursive contexts, where we |
3664 | // need to do this deferral. |
3665 | assert(CloneRecursiveContexts); |
3666 | continue; |
3667 | } |
3668 | // Ignore any caller we previously visited via another edge. |
3669 | if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { |
3670 | identifyClones(Edge->Caller, Visited, AllocContextIds); |
3671 | } |
3672 | } |
3673 | } |
3674 | |
  // Check if we reached an unambiguous call or have only a single caller.
3676 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
3677 | return; |
3678 | |
3679 | // We need to clone. |
3680 | |
3681 | // Try to keep the original version as alloc type NotCold. This will make |
3682 | // cases with indirect calls or any other situation with an unknown call to |
3683 | // the original function get the default behavior. We do this by sorting the |
3684 | // CallerEdges of the Node we will clone by alloc type. |
3685 | // |
3686 | // Give NotCold edge the lowest sort priority so those edges are at the end of |
3687 | // the caller edges vector, and stay on the original version (since the below |
3688 | // code clones greedily until it finds all remaining edges have the same type |
3689 | // and leaves the remaining ones on the original Node). |
3690 | // |
3691 | // We shouldn't actually have any None type edges, so the sorting priority for |
3692 | // that is arbitrary, and we assert in that case below. |
3693 | const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, |
3694 | /*Cold*/ 1, |
3695 | /*NotColdCold*/ 2}; |
3696 | llvm::stable_sort(Node->CallerEdges, |
3697 | [&](const std::shared_ptr<ContextEdge> &A, |
3698 | const std::shared_ptr<ContextEdge> &B) { |
3699 | // Nodes with non-empty context ids should be sorted |
3700 | // before those with empty context ids. |
3701 | if (A->ContextIds.empty()) |
3702 | // Either B ContextIds are non-empty (in which case we |
3703 | // should return false because B < A), or B ContextIds |
3704 | // are empty, in which case they are equal, and we |
3705 | // should maintain the original relative ordering. |
3706 | return false; |
3707 | if (B->ContextIds.empty()) |
3708 | return true; |
3709 | |
3710 | if (A->AllocTypes == B->AllocTypes) |
3711 | // Use the first context id for each edge as a |
3712 | // tie-breaker. |
3713 | return *A->ContextIds.begin() < *B->ContextIds.begin(); |
3714 | return AllocTypeCloningPriority[A->AllocTypes] < |
3715 | AllocTypeCloningPriority[B->AllocTypes]; |
3716 | }); |
3717 | |
3718 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
3719 | |
3720 | DenseSet<uint32_t> RecursiveContextIds; |
3721 | assert(AllowRecursiveContexts || !CloneRecursiveContexts); |
3722 | // If we are allowing recursive callsites, but have also disabled recursive |
3723 | // contexts, look for context ids that show up in multiple caller edges. |
3724 | if (AllowRecursiveCallsites && !AllowRecursiveContexts) { |
3725 | DenseSet<uint32_t> AllCallerContextIds; |
3726 | for (auto &CE : Node->CallerEdges) { |
      // Reserve enough space for the largest set of caller context ids, since
      // we know the final set will be at least that large.
      AllCallerContextIds.reserve(CE->getContextIds().size());
3730 | for (auto Id : CE->getContextIds()) |
3731 | if (!AllCallerContextIds.insert(Id).second) |
3732 | RecursiveContextIds.insert(Id); |
3733 | } |
3734 | } |
3735 | |
3736 | // Iterate until we find no more opportunities for disambiguating the alloc |
3737 | // types via cloning. In most cases this loop will terminate once the Node |
3738 | // has a single allocation type, in which case no more cloning is needed. |
3739 | // Iterate over a copy of Node's caller edges, since we may need to remove |
3740 | // edges in the moveEdgeTo* methods, and this simplifies the handling and |
3741 | // makes it less error-prone. |
3742 | auto CallerEdges = Node->CallerEdges; |
3743 | for (auto &CallerEdge : CallerEdges) { |
3744 | // Skip any that have been removed by an earlier recursive call. |
3745 | if (CallerEdge->isRemoved()) { |
3746 | assert(!is_contained(Node->CallerEdges, CallerEdge)); |
3747 | continue; |
3748 | } |
3749 | assert(CallerEdge->Callee == Node); |
3750 | |
3751 | // See if cloning the prior caller edge left this node with a single alloc |
3752 | // type or a single caller. In that case no more cloning of Node is needed. |
3753 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
3754 | break; |
3755 | |
3756 | // If the caller was not successfully matched to a call in the IR/summary, |
3757 | // there is no point in trying to clone for it as we can't update that call. |
3758 | if (!CallerEdge->Caller->hasCall()) |
3759 | continue; |
3760 | |
3761 | // Only need to process the ids along this edge pertaining to the given |
3762 | // allocation. |
3763 | auto CallerEdgeContextsForAlloc = |
3764 | set_intersection(CallerEdge->getContextIds(), AllocContextIds); |
3765 | if (!RecursiveContextIds.empty()) |
3766 | CallerEdgeContextsForAlloc = |
3767 | set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); |
3768 | if (CallerEdgeContextsForAlloc.empty()) |
3769 | continue; |
3770 | |
    auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3772 | |
3773 | // Compute the node callee edge alloc types corresponding to the context ids |
3774 | // for this caller edge. |
3775 | std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; |
    CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
    for (auto &CalleeEdge : Node->CalleeEdges)
      CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
          CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3780 | |
3781 | // Don't clone if doing so will not disambiguate any alloc types amongst |
3782 | // caller edges (including the callee edges that would be cloned). |
3783 | // Otherwise we will simply move all edges to the clone. |
3784 | // |
3785 | // First check if by cloning we will disambiguate the caller allocation |
3786 | // type from node's allocation type. Query allocTypeToUse so that we don't |
3787 | // bother cloning to distinguish NotCold+Cold from NotCold. Note that |
3788 | // neither of these should be None type. |
3789 | // |
3790 | // Then check if by cloning node at least one of the callee edges will be |
3791 | // disambiguated by splitting out different context ids. |
3792 | // |
3793 | // However, always do the cloning if this is a backedge, in which case we |
3794 | // have not yet cloned along this caller edge. |
3795 | assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); |
3796 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
3797 | if (!CallerEdge->IsBackedge && |
3798 | allocTypeToUse(CallerAllocTypeForAlloc) == |
3799 | allocTypeToUse(Node->AllocTypes) && |
3800 | allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
3801 | CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { |
3802 | continue; |
3803 | } |
3804 | |
3805 | if (CallerEdge->IsBackedge) { |
3806 | // We should only mark these if cloning recursive contexts, where we |
3807 | // need to do this deferral. |
3808 | assert(CloneRecursiveContexts); |
3809 | DeferredBackedges++; |
3810 | } |
3811 | |
3812 | // If this is a backedge, we now do recursive cloning starting from its |
3813 | // caller since we may have moved unambiguous caller contexts to a clone |
3814 | // of this Node in a previous iteration of the current loop, giving more |
3815 | // opportunity for cloning through the backedge. Because we sorted the |
3816 | // caller edges earlier so that cold caller edges are first, we would have |
    // visited and cloned this node for any unambiguously cold non-recursive
3818 | // callers before any ambiguous backedge callers. Note that we don't do this |
3819 | // if the caller is already cloned or visited during cloning (e.g. via a |
3820 | // different context path from the allocation). |
3821 | // TODO: Can we do better in the case where the caller was already visited? |
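    // For example, if Node had both an unambiguously cold non-recursive
    // caller and this ambiguous backedge caller, the cold caller has already
    // been peeled off onto a clone of Node, so recursing into the backedge's
    // caller now may create caller clones whose contexts can likewise be
    // disambiguated when we clone Node below.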
3822 | if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf && |
3823 | !Visited.count(CallerEdge->Caller)) { |
3824 | const auto OrigIdCount = CallerEdge->getContextIds().size(); |
3825 | // Now do the recursive cloning of this backedge's caller, which was |
3826 | // deferred earlier. |
3827 | identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc); |
      removeNoneTypeCalleeEdges(CallerEdge->Caller);
3829 | // See if the recursive call to identifyClones moved the context ids to a |
3830 | // new edge from this node to a clone of caller, and switch to looking at |
3831 | // that new edge so that we clone Node for the new caller clone. |
3832 | bool UpdatedEdge = false; |
3833 | if (OrigIdCount > CallerEdge->getContextIds().size()) { |
3834 | for (auto E : Node->CallerEdges) { |
          // Only interested in clones of the current edge's caller.
3836 | if (E->Caller->CloneOf != CallerEdge->Caller) |
3837 | continue; |
3838 | // See if this edge contains any of the context ids originally on the |
3839 | // current caller edge. |
3840 | auto CallerEdgeContextsForAllocNew = |
3841 | set_intersection(CallerEdgeContextsForAlloc, E->getContextIds()); |
3842 | if (CallerEdgeContextsForAllocNew.empty()) |
3843 | continue; |
3844 | // Make sure we don't pick a previously existing caller edge of this |
3845 | // Node, which would be processed on a different iteration of the |
3846 | // outer loop over the saved CallerEdges. |
3847 | if (llvm::is_contained(CallerEdges, E)) |
3848 | continue; |
3849 | // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge |
3850 | // are updated further below for all cases where we just invoked |
3851 | // identifyClones recursively. |
3852 | CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew); |
3853 | CallerEdge = E; |
3854 | UpdatedEdge = true; |
3855 | break; |
3856 | } |
3857 | } |
3858 | // If cloning removed this edge (and we didn't update it to a new edge |
3859 | // above), we're done with this edge. It's possible we moved all of the |
3860 | // context ids to an existing clone, in which case there's no need to do |
3861 | // further processing for them. |
3862 | if (CallerEdge->isRemoved()) |
3863 | continue; |
3864 | |
3865 | // Now we need to update the information used for the cloning decisions |
3866 | // further below, as we may have modified edges and their context ids. |
3867 | |
3868 | // Note if we changed the CallerEdge above we would have already updated |
3869 | // the context ids. |
3870 | if (!UpdatedEdge) { |
3871 | CallerEdgeContextsForAlloc = set_intersection( |
3872 | CallerEdgeContextsForAlloc, CallerEdge->getContextIds()); |
3873 | if (CallerEdgeContextsForAlloc.empty()) |
3874 | continue; |
3875 | } |
3876 | // Update the other information that depends on the edges and on the now |
3877 | // updated CallerEdgeContextsForAlloc. |
      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3879 | CalleeEdgeAllocTypesForCallerEdge.clear(); |
3880 | for (auto &CalleeEdge : Node->CalleeEdges) { |
        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3883 | } |
3884 | } |
3885 | |
3886 | // First see if we can use an existing clone. Check each clone and its |
3887 | // callee edges for matching alloc types. |
3888 | ContextNode *Clone = nullptr; |
3889 | for (auto *CurClone : Node->Clones) { |
3890 | if (allocTypeToUse(CurClone->AllocTypes) != |
3891 | allocTypeToUse(CallerAllocTypeForAlloc)) |
3892 | continue; |
3893 | |
3894 | bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) && |
3895 | hasSingleAllocType(CallerAllocTypeForAlloc); |
3896 | // The above check should mean that if both have single alloc types that |
3897 | // they should be equal. |
3898 | assert(!BothSingleAlloc || |
3899 | CurClone->AllocTypes == CallerAllocTypeForAlloc); |
3900 | |
3901 | // If either both have a single alloc type (which are the same), or if the |
3902 | // clone's callee edges have the same alloc types as those for the current |
3903 | // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge), |
3904 | // then we can reuse this clone. |
3905 | if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>( |
3906 | CalleeEdgeAllocTypesForCallerEdge, CurClone)) { |
3907 | Clone = CurClone; |
3908 | break; |
3909 | } |
3910 | } |
3911 | |
3912 | // The edge iterator is adjusted when we move the CallerEdge to the clone. |
3913 | if (Clone) |
      moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
                                    CallerEdgeContextsForAlloc);
    else
      Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3918 | |
3919 | // Sanity check that no alloc types on clone or its edges are None. |
3920 | assert(Clone->AllocTypes != (uint8_t)AllocationType::None); |
3921 | } |
3922 | |
3923 | // We should still have some context ids on the original Node. |
3924 | assert(!Node->emptyContextIds()); |
3925 | |
3926 | // Sanity check that no alloc types on node or edges are None. |
3927 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
3928 | |
3929 | if (VerifyNodes) |
3930 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
3931 | } |
3932 | |
3933 | void ModuleCallsiteContextGraph::updateAllocationCall( |
3934 | CallInfo &Call, AllocationType AllocType) { |
  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                "memprof", AllocTypeString);
  cast<CallBase>(Call.call())->addFnAttr(A);
  OREGetter(Call.call()->getFunction())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
            << ore::NV("AllocationCall", Call.call()) << " in clone "
            << ore::NV("Caller", Call.call()->getFunction())
            << " marked with memprof allocation attribute "
            << ore::NV("Attribute", AllocTypeString));
3945 | } |
3946 | |
3947 | void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, |
3948 | AllocationType AllocType) { |
  auto *AI = cast<AllocInfo *>(Call.call());
3950 | assert(AI); |
3951 | assert(AI->Versions.size() > Call.cloneNo()); |
3952 | AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; |
3953 | } |
3954 | |
3955 | AllocationType |
3956 | ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { |
  const auto *CB = cast<CallBase>(Call.call());
  if (!CB->getAttributes().hasFnAttr("memprof"))
    return AllocationType::None;
  return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3961 | ? AllocationType::Cold |
3962 | : AllocationType::NotCold; |
3963 | } |
3964 | |
3965 | AllocationType |
3966 | IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { |
  const auto *AI = cast<AllocInfo *>(Call.call());
3968 | assert(AI->Versions.size() > Call.cloneNo()); |
3969 | return (AllocationType)AI->Versions[Call.cloneNo()]; |
3970 | } |
3971 | |
3972 | void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
3973 | FuncInfo CalleeFunc) { |
3974 | if (CalleeFunc.cloneNo() > 0) |
    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
  OREGetter(CallerCall.call()->getFunction())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
            << ore::NV("Call", CallerCall.call()) << " in clone "
            << ore::NV("Caller", CallerCall.call()->getFunction())
            << " assigned to call function clone "
            << ore::NV("Callee", CalleeFunc.func()));
3982 | } |
3983 | |
3984 | void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
3985 | FuncInfo CalleeFunc) { |
  auto *CI = cast<CallsiteInfo *>(CallerCall.call());
  assert(CI &&
         "Caller cannot be an allocation which should not have profiled calls");
3989 | assert(CI->Clones.size() > CallerCall.cloneNo()); |
3990 | CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); |
3991 | } |
3992 | |
3993 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
3994 | Instruction *>::FuncInfo |
3995 | ModuleCallsiteContextGraph::cloneFunctionForCallsite( |
3996 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
3997 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
3998 | // Use existing LLVM facilities for cloning and obtaining Call in clone |
3999 | ValueToValueMapTy VMap; |
  auto *NewFunc = CloneFunction(Func.func(), VMap);
  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4002 | assert(!Func.func()->getParent()->getFunction(Name)); |
4003 | NewFunc->setName(Name); |
4004 | if (auto *SP = NewFunc->getSubprogram()) |
    SP->replaceLinkageName(
        MDString::get(NewFunc->getParent()->getContext(), Name));
4007 | for (auto &Inst : CallsWithMetadataInFunc) { |
4008 | // This map always has the initial version in it. |
4009 | assert(Inst.cloneNo() == 0); |
    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4011 | } |
4012 | OREGetter(Func.func()) |
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
            << "created clone " << ore::NV("NewFunction", NewFunc));
4015 | return {NewFunc, CloneNo}; |
4016 | } |
4017 | |
4018 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
4019 | IndexCall>::FuncInfo |
4020 | IndexCallsiteContextGraph::cloneFunctionForCallsite( |
4021 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
4022 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
4023 | // Check how many clones we have of Call (and therefore function). |
4024 | // The next clone number is the current size of versions array. |
4025 | // Confirm this matches the CloneNo provided by the caller, which is based on |
4026 | // the number of function clones we have. |
4027 | assert(CloneNo == (isa<AllocInfo *>(Call.call()) |
4028 | ? cast<AllocInfo *>(Call.call())->Versions.size() |
4029 | : cast<CallsiteInfo *>(Call.call())->Clones.size())); |
4030 | // Walk all the instructions in this function. Create a new version for |
4031 | // each (by adding an entry to the Versions/Clones summary array), and copy |
4032 | // over the version being called for the function clone being cloned here. |
4033 | // Additionally, add an entry to the CallMap for the new function clone, |
4034 | // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) |
4035 | // to the new call clone. |
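  // For example, when creating function clone N for this summary, an
  // allocation whose Versions array currently has N entries gets one more
  // appended (a placeholder 0, overwritten later in updateAllocationCall),
  // and each callsite's Clones array similarly grows by one placeholder entry
  // that updateCall fills in with the callee function clone number.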
4036 | for (auto &Inst : CallsWithMetadataInFunc) { |
4037 | // This map always has the initial version in it. |
4038 | assert(Inst.cloneNo() == 0); |
    if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4040 | assert(AI->Versions.size() == CloneNo); |
4041 | // We assign the allocation type later (in updateAllocationCall), just add |
4042 | // an entry for it here. |
      AI->Versions.push_back(0);
4044 | } else { |
      auto *CI = cast<CallsiteInfo *>(Inst.call());
4046 | assert(CI && CI->Clones.size() == CloneNo); |
4047 | // We assign the clone number later (in updateCall), just add an entry for |
4048 | // it here. |
      CI->Clones.push_back(0);
4050 | } |
4051 | CallMap[Inst] = {Inst.call(), CloneNo}; |
4052 | } |
4053 | return {Func.func(), CloneNo}; |
4054 | } |
4055 | |
4056 | // We perform cloning for each allocation node separately. However, this |
4057 | // sometimes results in a situation where the same node calls multiple |
4058 | // clones of the same callee, created for different allocations. This |
4059 | // causes issues when assigning functions to these clones, as each node can |
4060 | // in reality only call a single callee clone. |
4061 | // |
4062 | // To address this, before assigning functions, merge callee clone nodes as |
4063 | // needed using a post order traversal from the allocations. We attempt to |
4064 | // use existing clones as the merge node when legal, and to share them |
4065 | // among callers with the same properties (callers calling the same set of |
4066 | // callee clone nodes for the same allocations). |
4067 | // |
4068 | // Without this fix, in some cases incorrect function assignment will lead |
4069 | // to calling the wrong allocation clone. |
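//
// For example, suppose callsite node C ends up with callee edges to B' and
// B'', two clones of the same callsite B created while processing two
// different allocations. Function assignment operates on whole function
// clones, so the call recorded for C can only target one copy of B's
// function; merging B' and B'' (for the contexts reaching them from C)
// restores that invariant before assignFunctions runs.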
4070 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4071 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() { |
4072 | if (!MergeClones) |
4073 | return; |
4074 | |
4075 | // Generate a map from context id to the associated allocation node for use |
4076 | // when merging clones. |
4077 | DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode; |
4078 | for (auto &Entry : AllocationCallToContextNodeMap) { |
4079 | auto *Node = Entry.second; |
4080 | for (auto Id : Node->getContextIds()) |
4081 | ContextIdToAllocationNode[Id] = Node->getOrigNode(); |
4082 | for (auto *Clone : Node->Clones) { |
4083 | for (auto Id : Clone->getContextIds()) |
4084 | ContextIdToAllocationNode[Id] = Clone->getOrigNode(); |
4085 | } |
4086 | } |
4087 | |
4088 | // Post order traversal starting from allocations to ensure each callsite |
4089 | // calls a single clone of its callee. Callee nodes that are clones of each |
4090 | // other are merged (via new merge nodes if needed) to achieve this. |
4091 | DenseSet<const ContextNode *> Visited; |
4092 | for (auto &Entry : AllocationCallToContextNodeMap) { |
4093 | auto *Node = Entry.second; |
4094 | |
4095 | mergeClones(Node, Visited, ContextIdToAllocationNode); |
4096 | |
4097 | // Make a copy so the recursive post order traversal that may create new |
4098 | // clones doesn't mess up iteration. Note that the recursive traversal |
4099 | // itself does not call mergeClones on any of these nodes, which are all |
4100 | // (clones of) allocations. |
4101 | auto Clones = Node->Clones; |
4102 | for (auto *Clone : Clones) |
4103 | mergeClones(Clone, Visited, ContextIdToAllocationNode); |
4104 | } |
4105 | |
4106 | if (DumpCCG) { |
    dbgs() << "CCG after merging:\n";
4108 | dbgs() << *this; |
4109 | } |
4110 | if (ExportToDot) |
    exportToDot("aftermerge");
4112 | |
4113 | if (VerifyCCG) { |
4114 | check(); |
4115 | } |
4116 | } |
4117 | |
4118 | // Recursive helper for above mergeClones method. |
4119 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4120 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones( |
4121 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
4122 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) { |
4123 | auto Inserted = Visited.insert(Node); |
4124 | if (!Inserted.second) |
4125 | return; |
4126 | |
4127 | // Make a copy since the recursive call may move a caller edge to a new |
4128 | // callee, messing up the iterator. |
4129 | auto CallerEdges = Node->CallerEdges; |
4130 | for (auto CallerEdge : CallerEdges) { |
4131 | // Skip any caller edge moved onto a different callee during recursion. |
4132 | if (CallerEdge->Callee != Node) |
4133 | continue; |
4134 | mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode); |
4135 | } |
4136 | |
4137 | // Merge for this node after we handle its callers. |
4138 | mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode); |
4139 | } |
4140 | |
4141 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4142 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones( |
4143 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
4144 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) { |
4145 | // Ignore Node if we moved all of its contexts to clones. |
4146 | if (Node->emptyContextIds()) |
4147 | return; |
4148 | |
4149 | // First identify groups of clones among Node's callee edges, by building |
4150 | // a map from each callee base node to the associated callee edges from Node. |
4151 | MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>> |
4152 | OrigNodeToCloneEdges; |
4153 | for (const auto &E : Node->CalleeEdges) { |
4154 | auto *Callee = E->Callee; |
4155 | if (!Callee->CloneOf && Callee->Clones.empty()) |
4156 | continue; |
4157 | ContextNode *Base = Callee->getOrigNode(); |
4158 | OrigNodeToCloneEdges[Base].push_back(E); |
4159 | } |
4160 | |
4161 | // Helper for callee edge sorting below. Return true if A's callee has fewer |
4162 | // caller edges than B, or if A is a clone and B is not, or if A's first |
4163 | // context id is smaller than B's. |
4164 | auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A, |
4165 | const std::shared_ptr<ContextEdge> &B) { |
4166 | if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size()) |
4167 | return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size(); |
4168 | if (A->Callee->CloneOf && !B->Callee->CloneOf) |
4169 | return true; |
4170 | else if (!A->Callee->CloneOf && B->Callee->CloneOf) |
4171 | return false; |
4172 | // Use the first context id for each edge as a |
4173 | // tie-breaker. |
4174 | return *A->ContextIds.begin() < *B->ContextIds.begin(); |
4175 | }; |
4176 | |
4177 | // Process each set of callee clones called by Node, performing the needed |
4178 | // merging. |
4179 | for (auto Entry : OrigNodeToCloneEdges) { |
4180 | // CalleeEdges is the set of edges from Node reaching callees that are |
4181 | // mutual clones of each other. |
4182 | auto &CalleeEdges = Entry.second; |
4183 | auto NumCalleeClones = CalleeEdges.size(); |
4184 | // A single edge means there is no merging needed. |
4185 | if (NumCalleeClones == 1) |
4186 | continue; |
4187 | // Sort the CalleeEdges calling this group of clones in ascending order of |
4188 | // their caller edge counts, putting the original non-clone node first in |
4189 | // cases of a tie. This simplifies finding an existing node to use as the |
4190 | // merge node. |
4191 | llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan); |
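    // For example, a callee clone in this group that is only reached from
    // Node sorts first and can simply be reused as the merge node itself,
    // avoiding creation of a new node (the NonNewMergedNodes case below).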
4192 | |
4193 | /// Find other callers of the given set of callee edges that can |
4194 | /// share the same callee merge node. See the comments at this method |
4195 | /// definition for details. |
4196 | DenseSet<ContextNode *> OtherCallersToShareMerge; |
4197 | findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode, |
4198 | OtherCallersToShareMerge); |
4199 | |
4200 | // Now do the actual merging. Identify existing or create a new MergeNode |
4201 | // during the first iteration. Move each callee over, along with edges from |
4202 | // other callers we've determined above can share the same merge node. |
4203 | ContextNode *MergeNode = nullptr; |
4204 | DenseMap<ContextNode *, unsigned> CallerToMoveCount; |
4205 | for (auto CalleeEdge : CalleeEdges) { |
4206 | auto *OrigCallee = CalleeEdge->Callee; |
4207 | // If we don't have a MergeNode yet (only happens on the first iteration, |
4208 | // as a new one will be created when we go to move the first callee edge |
4209 | // over as needed), see if we can use this callee. |
4210 | if (!MergeNode) { |
4211 | // If there are no other callers, simply use this callee. |
4212 | if (CalleeEdge->Callee->CallerEdges.size() == 1) { |
4213 | MergeNode = OrigCallee; |
4214 | NonNewMergedNodes++; |
4215 | continue; |
4216 | } |
4217 | // Otherwise, if we have identified other caller nodes that can share |
4218 | // the merge node with Node, see if all of OrigCallee's callers are |
4219 | // going to share the same merge node. In that case we can use callee |
4220 | // (since all of its callers would move to the new merge node). |
4221 | if (!OtherCallersToShareMerge.empty()) { |
4222 | bool MoveAllCallerEdges = true; |
4223 | for (auto CalleeCallerE : OrigCallee->CallerEdges) { |
4224 | if (CalleeCallerE == CalleeEdge) |
4225 | continue; |
4226 | if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) { |
4227 | MoveAllCallerEdges = false; |
4228 | break; |
4229 | } |
4230 | } |
4231 | // If we are going to move all callers over, we can use this callee as |
4232 | // the MergeNode. |
4233 | if (MoveAllCallerEdges) { |
4234 | MergeNode = OrigCallee; |
4235 | NonNewMergedNodes++; |
4236 | continue; |
4237 | } |
4238 | } |
4239 | } |
4240 | // Move this callee edge, creating a new merge node if necessary. |
4241 | if (MergeNode) { |
4242 | assert(MergeNode != OrigCallee); |
        moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
                                      /*NewClone*/ false);
4245 | } else { |
        MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4247 | NewMergedNodes++; |
4248 | } |
4249 | // Now move all identified edges from other callers over to the merge node |
4250 | // as well. |
4251 | if (!OtherCallersToShareMerge.empty()) { |
4252 | // Make and iterate over a copy of OrigCallee's caller edges because |
4253 | // some of these will be moved off of the OrigCallee and that would mess |
4254 | // up the iteration from OrigCallee. |
4255 | auto OrigCalleeCallerEdges = OrigCallee->CallerEdges; |
4256 | for (auto &CalleeCallerE : OrigCalleeCallerEdges) { |
4257 | if (CalleeCallerE == CalleeEdge) |
4258 | continue; |
4259 | if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) |
4260 | continue; |
4261 | CallerToMoveCount[CalleeCallerE->Caller]++; |
          moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
                                        /*NewClone*/ false);
4264 | } |
4265 | } |
      removeNoneTypeCalleeEdges(OrigCallee);
      removeNoneTypeCalleeEdges(MergeNode);
4268 | } |
4269 | } |
4270 | } |
4271 | |
4272 | // Look for other nodes that have edges to the same set of callee |
4273 | // clones as the current Node. Those can share the eventual merge node |
4274 | // (reducing cloning and binary size overhead) iff: |
4275 | // - they have edges to the same set of callee clones |
4276 | // - each callee edge reaches a subset of the same allocations as Node's |
4277 | // corresponding edge to the same callee clone. |
4278 | // The second requirement is to ensure that we don't undo any of the |
4279 | // necessary cloning to distinguish contexts with different allocation |
4280 | // behavior. |
4281 | // FIXME: This is somewhat conservative, as we really just need to ensure |
4282 | // that they don't reach the same allocations as contexts on edges from Node |
4283 | // going to any of the *other* callee clones being merged. However, that |
4284 | // requires more tracking and checking to get right. |
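//
// For example, if another caller node N has edges to the same pair of callee
// clones B' and B'' as Node, and the contexts on each of N's edges reach only
// allocations also reached by Node's corresponding edge to the same callee
// clone, then N's edges can be moved to the merge node created for Node
// rather than requiring a separate one.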
4285 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4286 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
4287 | findOtherCallersToShareMerge( |
4288 | ContextNode *Node, |
4289 | std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges, |
4290 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode, |
4291 | DenseSet<ContextNode *> &OtherCallersToShareMerge) { |
4292 | auto NumCalleeClones = CalleeEdges.size(); |
4293 | // This map counts how many edges to the same callee clone exist for other |
4294 | // caller nodes of each callee clone. |
4295 | DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount; |
4296 | // Counts the number of other caller nodes that have edges to all callee |
4297 | // clones that don't violate the allocation context checking. |
4298 | unsigned PossibleOtherCallerNodes = 0; |
4299 | |
4300 | // We only need to look at other Caller nodes if the first callee edge has |
4301 | // multiple callers (recall they are sorted in ascending order above). |
4302 | if (CalleeEdges[0]->Callee->CallerEdges.size() < 2) |
4303 | return; |
4304 | |
4305 | // For each callee edge: |
4306 | // - Collect the count of other caller nodes calling the same callees. |
4307 | // - Collect the alloc nodes reached by contexts on each callee edge. |
4308 | DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes; |
4309 | for (auto CalleeEdge : CalleeEdges) { |
4310 | assert(CalleeEdge->Callee->CallerEdges.size() > 1); |
4311 | // For each other caller of the same callee, increment the count of |
4312 | // edges reaching the same callee clone. |
4313 | for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) { |
4314 | if (CalleeCallerEdges->Caller == Node) { |
4315 | assert(CalleeCallerEdges == CalleeEdge); |
4316 | continue; |
4317 | } |
4318 | OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++; |
4319 | // If this caller edge now reaches all of the same callee clones, |
4320 | // increment the count of candidate other caller nodes. |
4321 | if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] == |
4322 | NumCalleeClones) |
4323 | PossibleOtherCallerNodes++; |
4324 | } |
4325 | // Collect the alloc nodes reached by contexts on each callee edge, for |
4326 | // later analysis. |
4327 | for (auto Id : CalleeEdge->getContextIds()) { |
4328 | auto *Alloc = ContextIdToAllocationNode.lookup(Id); |
4329 | if (!Alloc) { |
4330 | // FIXME: unclear why this happens occasionally, presumably |
4331 | // imperfect graph updates possibly with recursion. |
4332 | MissingAllocForContextId++; |
4333 | continue; |
4334 | } |
4335 | CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc); |
4336 | } |
4337 | } |
4338 | |
4339 | // Now walk the callee edges again, and make sure that for each candidate |
4340 | // caller node all of its edges to the callees reach the same allocs (or |
4341 | // a subset) as those along the corresponding callee edge from Node. |
4342 | for (auto CalleeEdge : CalleeEdges) { |
4343 | assert(CalleeEdge->Callee->CallerEdges.size() > 1); |
4344 | // Stop if we do not have any (more) candidate other caller nodes. |
4345 | if (!PossibleOtherCallerNodes) |
4346 | break; |
4347 | auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()]; |
4348 | // Check each other caller of this callee clone. |
4349 | for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) { |
4350 | // Not interested in the callee edge from Node itself. |
4351 | if (CalleeCallerE == CalleeEdge) |
4352 | continue; |
4353 | // Skip any callers that didn't have callee edges to all the same |
4354 | // callee clones. |
4355 | if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] != |
4356 | NumCalleeClones) |
4357 | continue; |
4358 | // Make sure that each context along edge from candidate caller node |
4359 | // reaches an allocation also reached by this callee edge from Node. |
4360 | for (auto Id : CalleeCallerE->getContextIds()) { |
4361 | auto *Alloc = ContextIdToAllocationNode.lookup(Id); |
4362 | if (!Alloc) |
4363 | continue; |
4364 | // If not, simply reset the map entry to 0 so caller is ignored, and |
4365 | // reduce the count of candidate other caller nodes. |
4366 | if (!CurCalleeAllocNodes.contains(Alloc)) { |
4367 | OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0; |
4368 | PossibleOtherCallerNodes--; |
4369 | break; |
4370 | } |
4371 | } |
4372 | } |
4373 | } |
4374 | |
4375 | if (!PossibleOtherCallerNodes) |
4376 | return; |
4377 | |
4378 | // Build the set of other caller nodes that can use the same callee merge |
4379 | // node. |
4380 | for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) { |
4381 | if (Count != NumCalleeClones) |
4382 | continue; |
4383 | OtherCallersToShareMerge.insert(OtherCaller); |
4384 | } |
4385 | } |
4386 | |
4387 | // This method assigns cloned callsites to functions, cloning the functions as |
4388 | // needed. The assignment is greedy and proceeds roughly as follows: |
4389 | // |
4390 | // For each function Func: |
4391 | // For each call with graph Node having clones: |
4392 | // Initialize ClonesWorklist to Node and its clones |
4393 | // Initialize NodeCloneCount to 0 |
4394 | // While ClonesWorklist is not empty: |
4395 | // Clone = pop front ClonesWorklist |
4396 | // NodeCloneCount++ |
4397 | // If Func has been cloned less than NodeCloneCount times: |
4398 | // If NodeCloneCount is 1: |
4399 | // Assign Clone to original Func |
4400 | // Continue |
4401 | // Create a new function clone |
4402 | // If other callers not assigned to call a function clone yet: |
4403 | // Assign them to call new function clone |
4404 | // Continue |
4405 | // Assign any other caller calling the cloned version to new clone |
4406 | // |
4407 | // For each caller of Clone: |
4408 | // If caller is assigned to call a specific function clone: |
4409 | // If we cannot assign Clone to that function clone: |
4410 | // Create new callsite Clone NewClone |
4411 | // Add NewClone to ClonesWorklist |
4412 | // Continue |
4413 | // Assign Clone to existing caller's called function clone |
4414 | // Else: |
4415 | // If Clone not already assigned to a function clone: |
4416 | // Assign to first function clone without assignment |
4417 | // Assign caller to selected function clone |
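//
// For example, the first callsite clone processed in a function is assigned
// to the original copy of that function, the second typically triggers
// creation of function clone 1, and so on; extra callsite (and function)
// clones are created below only when callers have conflicting assignments
// that cannot be satisfied by the existing clones.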
4418 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4419 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { |
4420 | bool Changed = false; |
4421 | |
4422 | mergeClones(); |
4423 | |
4424 | // Keep track of the assignment of nodes (callsites) to function clones they |
4425 | // call. |
4426 | DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; |
4427 | |
4428 | // Update caller node to call function version CalleeFunc, by recording the |
4429 | // assignment in CallsiteToCalleeFuncCloneMap. |
4430 | auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, |
4431 | const FuncInfo &CalleeFunc) { |
4432 | assert(Caller->hasCall()); |
4433 | CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; |
4434 | }; |
4435 | |
4436 | // Walk all functions for which we saw calls with memprof metadata, and handle |
4437 | // cloning for each of its calls. |
4438 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
4439 | FuncInfo OrigFunc(Func); |
4440 | // Map from each clone of OrigFunc to a map of remappings of each call of |
4441 | // interest (from original uncloned call to the corresponding cloned call in |
4442 | // that function clone). |
4443 | std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; |
4444 | for (auto &Call : CallsWithMetadata) { |
      ContextNode *Node = getNodeForInst(Call);
4446 | // Skip call if we do not have a node for it (all uses of its stack ids |
4447 | // were either on inlined chains or pruned from the MIBs), or if we did |
4448 | // not create any clones for it. |
4449 | if (!Node || Node->Clones.empty()) |
4450 | continue; |
      assert(Node->hasCall() &&
             "Not having a call should have prevented cloning");
4453 | |
4454 | // Track the assignment of function clones to clones of the current |
4455 | // callsite Node being handled. |
4456 | std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; |
4457 | |
4458 | // Assign callsite version CallsiteClone to function version FuncClone, |
4459 | // and also assign (possibly cloned) Call to CallsiteClone. |
4460 | auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, |
4461 | CallInfo &Call, |
4462 | ContextNode *CallsiteClone, |
4463 | bool IsAlloc) { |
4464 | // Record the clone of callsite node assigned to this function clone. |
4465 | FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; |
4466 | |
4467 | assert(FuncClonesToCallMap.count(FuncClone)); |
4468 | std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; |
4469 | CallInfo CallClone(Call); |
4470 | if (auto It = CallMap.find(Call); It != CallMap.end()) |
4471 | CallClone = It->second; |
4472 | CallsiteClone->setCall(CallClone); |
4473 | // Need to do the same for all matching calls. |
4474 | for (auto &MatchingCall : Node->MatchingCalls) { |
4475 | CallInfo CallClone(MatchingCall); |
4476 | if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) |
4477 | CallClone = It->second; |
4478 | // Updates the call in the list. |
4479 | MatchingCall = CallClone; |
4480 | } |
4481 | }; |
4482 | |
4483 | // Keep track of the clones of callsite Node that need to be assigned to |
4484 | // function clones. This list may be expanded in the loop body below if we |
4485 | // find additional cloning is required. |
4486 | std::deque<ContextNode *> ClonesWorklist; |
4487 | // Ignore original Node if we moved all of its contexts to clones. |
4488 | if (!Node->emptyContextIds()) |
4489 | ClonesWorklist.push_back(Node); |
4490 | llvm::append_range(ClonesWorklist, Node->Clones); |
4491 | |
4492 | // Now walk through all of the clones of this callsite Node that we need, |
4493 | // and determine the assignment to a corresponding clone of the current |
4494 | // function (creating new function clones as needed). |
4495 | unsigned NodeCloneCount = 0; |
4496 | while (!ClonesWorklist.empty()) { |
4497 | ContextNode *Clone = ClonesWorklist.front(); |
4498 | ClonesWorklist.pop_front(); |
4499 | NodeCloneCount++; |
4500 | if (VerifyNodes) |
4501 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
4502 | |
4503 | // Need to create a new function clone if we have more callsite clones |
4504 | // than existing function clones, which would have been assigned to an |
4505 | // earlier clone in the list (we assign callsite clones to function |
4506 | // clones greedily). |
4507 | if (FuncClonesToCallMap.size() < NodeCloneCount) { |
4508 | // If this is the first callsite copy, assign to original function. |
4509 | if (NodeCloneCount == 1) { |
4510 | // Since FuncClonesToCallMap is empty in this case, no clones have |
4511 | // been created for this function yet, and no callers should have |
4512 | // been assigned a function clone for this callee node yet. |
4513 | assert(llvm::none_of( |
4514 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
4515 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
4516 | })); |
4517 | // Initialize with empty call map, assign Clone to original function |
4518 | // and its callers, and skip to the next clone. |
4519 | FuncClonesToCallMap[OrigFunc] = {}; |
4520 | AssignCallsiteCloneToFuncClone( |
4521 | OrigFunc, Call, Clone, |
4522 | AllocationCallToContextNodeMap.count(Call)); |
4523 | for (auto &CE : Clone->CallerEdges) { |
4524 | // Ignore any caller that does not have a recorded callsite Call. |
4525 | if (!CE->Caller->hasCall()) |
4526 | continue; |
4527 | RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); |
4528 | } |
4529 | continue; |
4530 | } |
4531 | |
4532 | // First locate which copy of OrigFunc to clone again. If a caller |
4533 | // of this callsite clone was already assigned to call a particular |
4534 | // function clone, we need to redirect all of those callers to the |
4535 | // new function clone, and update their other callees within this |
4536 | // function. |
4537 | FuncInfo PreviousAssignedFuncClone; |
4538 | auto EI = llvm::find_if( |
4539 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
4540 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
4541 | }); |
4542 | bool CallerAssignedToCloneOfFunc = false; |
4543 | if (EI != Clone->CallerEdges.end()) { |
4544 | const std::shared_ptr<ContextEdge> &Edge = *EI; |
4545 | PreviousAssignedFuncClone = |
4546 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
4547 | CallerAssignedToCloneOfFunc = true; |
4548 | } |
4549 | |
4550 | // Clone function and save it along with the CallInfo map created |
4551 | // during cloning in the FuncClonesToCallMap. |
4552 | std::map<CallInfo, CallInfo> NewCallMap; |
4553 | unsigned CloneNo = FuncClonesToCallMap.size(); |
          assert(CloneNo > 0 && "Clone 0 is the original function, which "
                                "should already exist in the map");
4556 | FuncInfo NewFuncClone = cloneFunctionForCallsite( |
              OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4558 | FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); |
4559 | FunctionClonesAnalysis++; |
4560 | Changed = true; |
4561 | |
4562 | // If no caller callsites were already assigned to a clone of this |
4563 | // function, we can simply assign this clone to the new func clone |
4564 | // and update all callers to it, then skip to the next clone. |
4565 | if (!CallerAssignedToCloneOfFunc) { |
4566 | AssignCallsiteCloneToFuncClone( |
4567 | NewFuncClone, Call, Clone, |
4568 | AllocationCallToContextNodeMap.count(Call)); |
4569 | for (auto &CE : Clone->CallerEdges) { |
4570 | // Ignore any caller that does not have a recorded callsite Call. |
4571 | if (!CE->Caller->hasCall()) |
4572 | continue; |
4573 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
4574 | } |
4575 | continue; |
4576 | } |
4577 | |
4578 | // We may need to do additional node cloning in this case. |
4579 | // Reset the CallsiteToCalleeFuncCloneMap entry for any callers |
4580 | // that were previously assigned to call PreviousAssignedFuncClone, |
4581 | // to record that they now call NewFuncClone. |
4582 | // The none type edge removal may remove some of this Clone's caller |
4583 | // edges, if it is reached via another of its caller's callees. |
4584 | // Iterate over a copy and skip any that were removed. |
4585 | auto CallerEdges = Clone->CallerEdges; |
4586 | for (auto CE : CallerEdges) { |
4587 | // Skip any that have been removed on an earlier iteration. |
4588 | if (CE->isRemoved()) { |
4589 | assert(!is_contained(Clone->CallerEdges, CE)); |
4590 | continue; |
4591 | } |
4592 | assert(CE); |
4593 | // Ignore any caller that does not have a recorded callsite Call. |
4594 | if (!CE->Caller->hasCall()) |
4595 | continue; |
4596 | |
4597 | if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || |
4598 | // We subsequently fall through to later handling that |
4599 | // will perform any additional cloning required for |
4600 | // callers that were calling other function clones. |
4601 | CallsiteToCalleeFuncCloneMap[CE->Caller] != |
4602 | PreviousAssignedFuncClone) |
4603 | continue; |
4604 | |
4605 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
4606 | |
4607 | // If we are cloning a function that was already assigned to some |
4608 | // callers, then essentially we are creating new callsite clones |
4609 | // of the other callsites in that function that are reached by those |
4610 | // callers. Clone the other callees of the current callsite's caller |
4611 | // that were already assigned to PreviousAssignedFuncClone |
4612 | // accordingly. This is important since we subsequently update the |
4613 | // calls from the nodes in the graph and their assignments to callee |
4614 | // functions recorded in CallsiteToCalleeFuncCloneMap. |
4615 | // The none type edge removal may remove some of this caller's |
4616 | // callee edges, if it is reached via another of its callees. |
4617 | // Iterate over a copy and skip any that were removed. |
4618 | auto CalleeEdges = CE->Caller->CalleeEdges; |
4619 | for (auto CalleeEdge : CalleeEdges) { |
4620 | // Skip any that have been removed on an earlier iteration when |
4621 | // cleaning up newly None type callee edges. |
4622 | if (CalleeEdge->isRemoved()) { |
4623 | assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge)); |
4624 | continue; |
4625 | } |
4626 | assert(CalleeEdge); |
4627 | ContextNode *Callee = CalleeEdge->Callee; |
4628 | // Skip the current callsite, we are looking for other |
4629 | // callsites Caller calls, as well as any that does not have a |
4630 | // recorded callsite Call. |
4631 | if (Callee == Clone || !Callee->hasCall()) |
4632 | continue; |
4633 | // Skip direct recursive calls. We don't need/want to clone the |
4634 | // caller node again, and this loop will not behave as expected if |
4635 | // we tried. |
4636 | if (Callee == CalleeEdge->Caller) |
4637 | continue; |
              ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
              removeNoneTypeCalleeEdges(NewClone);
              // Moving the edge may have resulted in some none type
              // callee edges on the original Callee.
              removeNoneTypeCalleeEdges(Callee);
4643 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
4644 | // If the Callee node was already assigned to call a specific |
4645 | // function version, make sure its new clone is assigned to call |
4646 | // that same function clone. |
4647 | if (CallsiteToCalleeFuncCloneMap.count(Callee)) |
4648 | RecordCalleeFuncOfCallsite( |
4649 | NewClone, CallsiteToCalleeFuncCloneMap[Callee]); |
4650 | // Update NewClone with the new Call clone of this callsite's Call |
4651 | // created for the new function clone created earlier. |
4652 | // Recall that we have already ensured when building the graph |
4653 | // that each caller can only call callsites within the same |
4654 | // function, so we are guaranteed that Callee Call is in the |
4655 | // current OrigFunc. |
4656 | // CallMap is set up as indexed by original Call at clone 0. |
4657 | CallInfo OrigCall(Callee->getOrigNode()->Call); |
4658 | OrigCall.setCloneNo(0); |
4659 | std::map<CallInfo, CallInfo> &CallMap = |
4660 | FuncClonesToCallMap[NewFuncClone]; |
4661 | assert(CallMap.count(OrigCall)); |
4662 | CallInfo NewCall(CallMap[OrigCall]); |
4663 | assert(NewCall); |
4664 | NewClone->setCall(NewCall); |
4665 | // Need to do the same for all matching calls. |
4666 | for (auto &MatchingCall : NewClone->MatchingCalls) { |
4667 | CallInfo OrigMatchingCall(MatchingCall); |
4668 | OrigMatchingCall.setCloneNo(0); |
4669 | assert(CallMap.count(OrigMatchingCall)); |
4670 | CallInfo NewCall(CallMap[OrigMatchingCall]); |
4671 | assert(NewCall); |
4672 | // Updates the call in the list. |
4673 | MatchingCall = NewCall; |
4674 | } |
4675 | } |
4676 | } |
4677 | // Fall through to handling below to perform the recording of the |
4678 | // function for this callsite clone. This enables handling of cases |
4679 | // where the callers were assigned to different clones of a function. |
4680 | } |
4681 | |
4682 | // See if we can use existing function clone. Walk through |
4683 | // all caller edges to see if any have already been assigned to |
4684 | // a clone of this callsite's function. If we can use it, do so. If not, |
4685 | // because that function clone is already assigned to a different clone |
4686 | // of this callsite, then we need to clone again. |
4687 | // Basically, this checking is needed to handle the case where different |
4688 | // caller functions/callsites may need versions of this function |
4689 | // containing different mixes of callsite clones across the different |
4690 | // callsites within the function. If that happens, we need to create |
4691 | // additional function clones to handle the various combinations. |
4692 | // |
4693 | // Keep track of any new clones of this callsite created by the |
4694 | // following loop, as well as any existing clone that we decided to |
4695 | // assign this clone to. |
4696 | std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; |
4697 | FuncInfo FuncCloneAssignedToCurCallsiteClone; |
4698 | // Iterate over a copy of Clone's caller edges, since we may need to |
4699 | // remove edges in the moveEdgeTo* methods, and this simplifies the |
4700 | // handling and makes it less error-prone. |
4701 | auto CloneCallerEdges = Clone->CallerEdges; |
4702 | for (auto &Edge : CloneCallerEdges) { |
4703 | // Skip removed edges (due to direct recursive edges updated when |
4704 | // updating callee edges when moving an edge and subsequently |
4705 | // removed by call to removeNoneTypeCalleeEdges on the Clone). |
4706 | if (Edge->isRemoved()) |
4707 | continue; |
4708 | // Ignore any caller that does not have a recorded callsite Call. |
4709 | if (!Edge->Caller->hasCall()) |
4710 | continue; |
4711 | // If this caller already assigned to call a version of OrigFunc, need |
4712 | // to ensure we can assign this callsite clone to that function clone. |
4713 | if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { |
4714 | FuncInfo FuncCloneCalledByCaller = |
4715 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
4716 | // First we need to confirm that this function clone is available |
4717 | // for use by this callsite node clone. |
4718 | // |
4719 | // While FuncCloneToCurNodeCloneMap is built only for this Node and |
4720 | // its callsite clones, one of those callsite clones X could have |
4721 | // been assigned to the same function clone called by Edge's caller |
4722 | // - if Edge's caller calls another callsite within Node's original |
4723 | // function, and that callsite has another caller reaching clone X. |
4724 | // We need to clone Node again in this case. |
4725 | if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && |
4726 | FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != |
4727 | Clone) || |
4728 | // Detect when we have multiple callers of this callsite that |
4729 | // have already been assigned to specific, and different, clones |
4730 | // of OrigFunc (due to other unrelated callsites in Func they |
4731 | // reach via call contexts). Is this Clone of callsite Node |
4732 | // assigned to a different clone of OrigFunc? If so, clone Node |
4733 | // again. |
4734 | (FuncCloneAssignedToCurCallsiteClone && |
4735 | FuncCloneAssignedToCurCallsiteClone != |
4736 | FuncCloneCalledByCaller)) { |
4737 | // We need to use a different newly created callsite clone, in |
4738 | // order to assign it to another new function clone on a |
4739 | // subsequent iteration over the Clones array (adjusted below). |
4740 | // Note we specifically do not reset the |
4741 | // CallsiteToCalleeFuncCloneMap entry for this caller, so that |
4742 | // when this new clone is processed later we know which version of |
4743 | // the function to copy (so that other callsite clones we have |
4744 | // assigned to that function clone are properly cloned over). See |
4745 | // comments in the function cloning handling earlier. |
4746 | |
4747 | // Check if we already have cloned this callsite again while |
4748 | // walking through caller edges, for a caller calling the same |
4749 | // function clone. If so, we can move this edge to that new clone |
4750 | // rather than creating yet another new clone. |
4751 | if (FuncCloneToNewCallsiteCloneMap.count( |
4752 | FuncCloneCalledByCaller)) { |
4753 | ContextNode *NewClone = |
4754 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; |
moveEdgeToExistingCalleeClone(Edge, NewClone);
// Cleanup any none type edges cloned over.
removeNoneTypeCalleeEdges(NewClone);
} else {
// Create a new callsite clone.
ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
removeNoneTypeCalleeEdges(NewClone);
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
NewClone;
// Add to list of clones and process later.
ClonesWorklist.push_back(NewClone);
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
}
// Moving the caller edge may have resulted in some none type
// callee edges.
removeNoneTypeCalleeEdges(Clone);
4771 | // We will handle the newly created callsite clone in a subsequent |
4772 | // iteration over this Node's Clones. |
4773 | continue; |
4774 | } |
4775 | |
4776 | // Otherwise, we can use the function clone already assigned to this |
4777 | // caller. |
4778 | if (!FuncCloneAssignedToCurCallsiteClone) { |
4779 | FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; |
4780 | // Assign Clone to FuncCloneCalledByCaller |
4781 | AssignCallsiteCloneToFuncClone( |
4782 | FuncCloneCalledByCaller, Call, Clone, |
4783 | AllocationCallToContextNodeMap.count(Call)); |
4784 | } else |
4785 | // Don't need to do anything - callsite is already calling this |
4786 | // function clone. |
4787 | assert(FuncCloneAssignedToCurCallsiteClone == |
4788 | FuncCloneCalledByCaller); |
4789 | |
4790 | } else { |
4791 | // We have not already assigned this caller to a version of |
4792 | // OrigFunc. Do the assignment now. |
4793 | |
4794 | // First check if we have already assigned this callsite clone to a |
4795 | // clone of OrigFunc for another caller during this iteration over |
4796 | // its caller edges. |
4797 | if (!FuncCloneAssignedToCurCallsiteClone) { |
4798 | // Find first function in FuncClonesToCallMap without an assigned |
4799 | // clone of this callsite Node. We should always have one |
4800 | // available at this point due to the earlier cloning when the |
4801 | // FuncClonesToCallMap size was smaller than the clone number. |
4802 | for (auto &CF : FuncClonesToCallMap) { |
4803 | if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { |
4804 | FuncCloneAssignedToCurCallsiteClone = CF.first; |
4805 | break; |
4806 | } |
4807 | } |
4808 | assert(FuncCloneAssignedToCurCallsiteClone); |
4809 | // Assign Clone to FuncCloneAssignedToCurCallsiteClone |
4810 | AssignCallsiteCloneToFuncClone( |
4811 | FuncCloneAssignedToCurCallsiteClone, Call, Clone, |
4812 | AllocationCallToContextNodeMap.count(Call)); |
4813 | } else |
4814 | assert(FuncCloneToCurNodeCloneMap |
4815 | [FuncCloneAssignedToCurCallsiteClone] == Clone); |
4816 | // Update callers to record function version called. |
4817 | RecordCalleeFuncOfCallsite(Edge->Caller, |
4818 | FuncCloneAssignedToCurCallsiteClone); |
4819 | } |
4820 | } |
4821 | } |
4822 | if (VerifyCCG) { |
4823 | checkNode<DerivedCCG, FuncTy, CallTy>(Node); |
4824 | for (const auto &PE : Node->CalleeEdges) |
4825 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
4826 | for (const auto &CE : Node->CallerEdges) |
4827 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
4828 | for (auto *Clone : Node->Clones) { |
4829 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
4830 | for (const auto &PE : Clone->CalleeEdges) |
4831 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
4832 | for (const auto &CE : Clone->CallerEdges) |
4833 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
4834 | } |
4835 | } |
4836 | } |
4837 | } |
4838 | |
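// Bit mask for an allocation reached by both cold and not-cold contexts; used
// below to detect ambiguous allocations that may still be hinted cold via the
// MinClonedColdBytePercent threshold.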
4839 | uint8_t BothTypes = |
4840 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
4841 | |
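// Recursive worker that visits a node's clones and caller nodes before
// updating the node's own call, so callers are rewritten before the callees
// they reach.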
4842 | auto UpdateCalls = [&](ContextNode *Node, |
4843 | DenseSet<const ContextNode *> &Visited, |
4844 | auto &&UpdateCalls) { |
4845 | auto Inserted = Visited.insert(Node); |
4846 | if (!Inserted.second) |
4847 | return; |
4848 | |
4849 | for (auto *Clone : Node->Clones) |
4850 | UpdateCalls(Clone, Visited, UpdateCalls); |
4851 | |
4852 | for (auto &Edge : Node->CallerEdges) |
4853 | UpdateCalls(Edge->Caller, Visited, UpdateCalls); |
4854 | |
4855 | // Skip if either no call to update, or if we ended up with no context ids |
4856 | // (we moved all edges onto other clones). |
4857 | if (!Node->hasCall() || Node->emptyContextIds()) |
4858 | return; |
4859 | |
4860 | if (Node->IsAllocation) { |
4861 | auto AT = allocTypeToUse(Node->AllocTypes); |
4862 | // If the allocation type is ambiguous, and more aggressive hinting |
4863 | // has been enabled via the MinClonedColdBytePercent flag, see if this |
4864 | // allocation should be hinted cold anyway because its fraction cold bytes |
4865 | // allocated is at least the given threshold. |
4866 | if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 && |
4867 | !ContextIdToContextSizeInfos.empty()) { |
4868 | uint64_t TotalCold = 0; |
4869 | uint64_t Total = 0; |
4870 | for (auto Id : Node->getContextIds()) { |
4871 | auto TypeI = ContextIdToAllocationType.find(Id); |
4872 | assert(TypeI != ContextIdToAllocationType.end()); |
4873 | auto CSI = ContextIdToContextSizeInfos.find(Id); |
4874 | if (CSI != ContextIdToContextSizeInfos.end()) { |
4875 | for (auto &Info : CSI->second) { |
4876 | Total += Info.TotalSize; |
4877 | if (TypeI->second == AllocationType::Cold) |
4878 | TotalCold += Info.TotalSize; |
4879 | } |
4880 | } |
4881 | } |
4882 | if (TotalCold * 100 >= Total * MinClonedColdBytePercent) |
4883 | AT = AllocationType::Cold; |
4884 | } |
updateAllocationCall(Node->Call, AT);
4886 | assert(Node->MatchingCalls.empty()); |
4887 | return; |
4888 | } |
4889 | |
4890 | if (!CallsiteToCalleeFuncCloneMap.count(Node)) |
4891 | return; |
4892 | |
4893 | auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; |
updateCall(Node->Call, CalleeFunc);
// Update all the matching calls as well.
for (auto &Call : Node->MatchingCalls)
updateCall(Call, CalleeFunc);
4898 | }; |
4899 | |
4900 | // Performs DFS traversal starting from allocation nodes to update calls to |
4901 | // reflect cloning decisions recorded earlier. For regular LTO this will |
4902 | // update the actual calls in the IR to call the appropriate function clone |
4903 | // (and add attributes to allocation calls), whereas for ThinLTO the decisions |
4904 | // are recorded in the summary entries. |
4905 | DenseSet<const ContextNode *> Visited; |
4906 | for (auto &Entry : AllocationCallToContextNodeMap) |
4907 | UpdateCalls(Entry.second, Visited, UpdateCalls); |
4908 | |
4909 | return Changed; |
4910 | } |
4911 | |
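// Helper used by applyImport (via CloneFuncIfNeeded below): clones function F
// NumClones-1 times (the original counts as clone 0) and returns the value
// maps for the new copies so the caller can locate the cloned instructions.
// Clone names come from getMemProfFuncName, which appends a clone-number
// suffix to the base name (e.g. clone 1 of "foo" would typically be named
// "foo.memprof.1"). Any aliases to F are cloned alongside it.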
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
&FuncToAliasMap) {
// The first "clone" is the original copy; we should only call this if we
// needed to create new clones.
assert(NumClones > 1);
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
VMaps.reserve(NumClones - 1);
FunctionsClonedThinBackend++;
for (unsigned I = 1; I < NumClones; I++) {
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
auto *NewF = CloneFunction(&F, *VMaps.back());
FunctionClonesThinBackend++;
// Strip memprof and callsite metadata from clone as they are no longer
// needed.
for (auto &BB : *NewF) {
for (auto &Inst : BB) {
Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
}
}
std::string Name = getMemProfFuncName(F.getName(), I);
auto *PrevF = M.getFunction(Name);
if (PrevF) {
// We might have created this when adjusting callsite in another
// function. It should be a declaration.
assert(PrevF->isDeclaration());
NewF->takeName(PrevF);
PrevF->replaceAllUsesWith(NewF);
PrevF->eraseFromParent();
} else
NewF->setName(Name);
if (auto *SP = NewF->getSubprogram())
SP->replaceLinkageName(
MDString::get(NewF->getParent()->getContext(), Name));
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone" , &F)
<< "created clone " << ore::NV("NewFunction" , NewF));

// Now handle aliases to this function, and clone those as well.
if (!FuncToAliasMap.count(&F))
continue;
for (auto *A : FuncToAliasMap[&F]) {
std::string Name = getMemProfFuncName(A->getName(), I);
auto *PrevA = M.getNamedAlias(Name);
auto *NewA = GlobalAlias::create(A->getValueType(),
A->getType()->getPointerAddressSpace(),
A->getLinkage(), Name, NewF);
NewA->copyAttributesFrom(A);
if (PrevA) {
// We might have created this when adjusting callsite in another
// function. It should be a declaration.
assert(PrevA->isDeclaration());
NewA->takeName(PrevA);
PrevA->replaceAllUsesWith(NewA);
PrevA->eraseFromParent();
}
}
}
return VMaps;
}
4973 | |
4974 | // Locate the summary for F. This is complicated by the fact that it might |
4975 | // have been internalized or promoted. |
4976 | static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, |
4977 | const ModuleSummaryIndex *ImportSummary, |
4978 | const Function *CallingFunc = nullptr) { |
4979 | // FIXME: Ideally we would retain the original GUID in some fashion on the |
4980 | // function (e.g. as metadata), but for now do our best to locate the |
4981 | // summary without that information. |
ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
if (!TheFnVI)
// See if theFn was internalized, by checking index directly with
// original name (this avoids the name adjustment done by getGUID() for
// internal symbols).
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
if (TheFnVI)
return TheFnVI;
// Now query with the original name before any promotion was performed.
StringRef OrigName =
ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
// When this pass is enabled, we always add thinlto_src_file provenance
// metadata to imported function definitions, which allows us to recreate the
// original internal symbol's GUID.
auto SrcFileMD = F.getMetadata("thinlto_src_file" );
// If this is a call to an imported/promoted local for which we didn't import
// the definition, the metadata will not exist on the declaration. However,
// since we are doing this early, before any inlining in the LTO backend, we
// can simply look at the metadata on the calling function which must have
// been from the same module if F was an internal symbol originally.
if (!SrcFileMD && F.isDeclaration()) {
// We would only call this for a declaration for a direct callsite, in which
// case the caller would have provided the calling function pointer.
assert(CallingFunc);
SrcFileMD = CallingFunc->getMetadata("thinlto_src_file" );
// If this is a promoted local (OrigName != F.getName()), since this is a
// declaration, it must be imported from a different module and therefore we
// should always find the metadata on its calling function. Any call to a
// promoted local that came from this module should still be a definition.
assert(SrcFileMD || OrigName == F.getName());
}
StringRef SrcFile = M.getSourceFileName();
if (SrcFileMD)
SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage, SrcFile);
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
// Internal func in original module may have gotten a numbered suffix if we
// imported an external function with the same name. This happens
// automatically during IR linking for naming conflicts. It would have to
// still be internal in that case (otherwise it would have been renamed on
// promotion in which case we wouldn't have a naming conflict).
if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
F.getName().contains('.')) {
OrigName = F.getName().rsplit('.').first;
OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage, SrcFile);
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5033 | } |
5034 | // The only way we may not have a VI is if this is a declaration created for |
5035 | // an imported reference. For distributed ThinLTO we may not have a VI for |
5036 | // such declarations in the distributed summary. |
5037 | assert(TheFnVI || F.isDeclaration()); |
5038 | return TheFnVI; |
5039 | } |
5040 | |
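// Prepare the indirect call promotion machinery used by applyImport: the
// ICallPromotionAnalysis for reading value profile metadata, and an
// InstrProfSymtab for mapping profiled target hashes back to Functions.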
5041 | bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo( |
5042 | Module &M) { |
5043 | ICallAnalysis = std::make_unique<ICallPromotionAnalysis>(); |
5044 | Symtab = std::make_unique<InstrProfSymtab>(); |
// Don't add canonical names, to avoid adding multiple functions to the
// symtab when they both have the same root name with "." suffixes stripped.
5047 | // If we pick the wrong one then this could lead to incorrect ICP and calling |
5048 | // a memprof clone that we don't actually create (resulting in linker unsats). |
5049 | // What this means is that the GUID of the function (or its PGOFuncName |
5050 | // metadata) *must* match that in the VP metadata to allow promotion. |
5051 | // In practice this should not be a limitation, since local functions should |
5052 | // have PGOFuncName metadata and global function names shouldn't need any |
5053 | // special handling (they should not get the ".llvm.*" suffix that the |
5054 | // canonicalization handling is attempting to strip). |
5055 | if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) { |
std::string SymtabFailure = toString(std::move(E));
M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5058 | return false; |
5059 | } |
5060 | return true; |
5061 | } |
5062 | |
5063 | #ifndef NDEBUG |
5064 | // Sanity check that the MIB stack ids match between the summary and |
5065 | // instruction metadata. |
5066 | static void checkAllocContextIds( |
5067 | const AllocInfo &AllocNode, const MDNode *MemProfMD, |
5068 | const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext, |
5069 | const ModuleSummaryIndex *ImportSummary) { |
5070 | auto MIBIter = AllocNode.MIBs.begin(); |
5071 | for (auto &MDOp : MemProfMD->operands()) { |
5072 | assert(MIBIter != AllocNode.MIBs.end()); |
5073 | auto StackIdIndexIter = MIBIter->StackIdIndices.begin(); |
5074 | auto *MIBMD = cast<const MDNode>(MDOp); |
5075 | MDNode *StackMDNode = getMIBStackNode(MIBMD); |
5076 | assert(StackMDNode); |
5077 | CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); |
5078 | auto ContextIterBegin = |
5079 | StackContext.beginAfterSharedPrefix(CallsiteContext); |
5080 | // Skip the checking on the first iteration. |
5081 | uint64_t LastStackContextId = |
5082 | (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1 |
5083 | : 0; |
5084 | for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end(); |
5085 | ++ContextIter) { |
5086 | // If this is a direct recursion, simply skip the duplicate |
5087 | // entries, to be consistent with how the summary ids were |
5088 | // generated during ModuleSummaryAnalysis. |
5089 | if (LastStackContextId == *ContextIter) |
5090 | continue; |
5091 | LastStackContextId = *ContextIter; |
5092 | assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); |
5093 | assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
5094 | *ContextIter); |
5095 | StackIdIndexIter++; |
5096 | } |
5097 | MIBIter++; |
5098 | } |
5099 | } |
5100 | #endif |
5101 | |
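// Apply the cloning decisions recorded in the import summary to this module:
// create the requested function clones, point calls at the assigned callee
// clone versions, attach "memprof" allocation attributes, and record any
// indirect call promotion needed to reach cloned targets.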
5102 | bool MemProfContextDisambiguation::applyImport(Module &M) { |
5103 | assert(ImportSummary); |
5104 | bool Changed = false; |
5105 | |
5106 | // We also need to clone any aliases that reference cloned functions, because |
5107 | // the modified callsites may invoke via the alias. Keep track of the aliases |
5108 | // for each function. |
5109 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
5110 | FuncToAliasMap; |
5111 | for (auto &A : M.aliases()) { |
5112 | auto *Aliasee = A.getAliaseeObject(); |
if (auto *F = dyn_cast<Function>(Aliasee))
FuncToAliasMap[F].insert(&A);
5115 | } |
5116 | |
5117 | if (!initializeIndirectCallPromotionInfo(M)) |
5118 | return false; |
5119 | |
5120 | for (auto &F : M) { |
5121 | if (F.isDeclaration() || isMemProfClone(F)) |
5122 | continue; |
5123 | |
5124 | OptimizationRemarkEmitter ORE(&F); |
5125 | |
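// Value maps for clones 1..NumClones-1 of F (clone 0 is the original), plus
// bookkeeping so we only clone F once even if several allocations or
// callsites in it request cloning.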
5126 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
5127 | bool ClonesCreated = false; |
5128 | unsigned NumClonesCreated = 0; |
5129 | auto CloneFuncIfNeeded = [&](unsigned NumClones) { |
5130 | // We should at least have version 0 which is the original copy. |
5131 | assert(NumClones > 0); |
// If only one copy is needed, use the original.
5133 | if (NumClones == 1) |
5134 | return; |
5135 | // If we already performed cloning of this function, confirm that the |
5136 | // requested number of clones matches (the thin link should ensure the |
5137 | // number of clones for each constituent callsite is consistent within |
5138 | // each function), before returning. |
5139 | if (ClonesCreated) { |
5140 | assert(NumClonesCreated == NumClones); |
5141 | return; |
5142 | } |
5143 | VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); |
5144 | // The first "clone" is the original copy, which doesn't have a VMap. |
5145 | assert(VMaps.size() == NumClones - 1); |
5146 | Changed = true; |
5147 | ClonesCreated = true; |
5148 | NumClonesCreated = NumClones; |
5149 | }; |
5150 | |
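// Update a direct call (and its copies in any function clones created above)
// to call the callee clone version recorded in the summary StackNode,
// creating the function clones first if needed.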
5151 | auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, |
5152 | Function *CalledFunction) { |
5153 | // Perform cloning if not yet done. |
5154 | CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); |
5155 | |
5156 | assert(!isMemProfClone(*CalledFunction)); |
5157 | |
5158 | // Update the calls per the summary info. |
5159 | // Save orig name since it gets updated in the first iteration |
5160 | // below. |
5161 | auto CalleeOrigName = CalledFunction->getName(); |
5162 | for (unsigned J = 0; J < StackNode.Clones.size(); J++) { |
5163 | // Do nothing if this version calls the original version of its |
5164 | // callee. |
5165 | if (!StackNode.Clones[J]) |
5166 | continue; |
auto NewF = M.getOrInsertFunction(
getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
CalledFunction->getFunctionType());
CallBase *CBClone;
// Copy 0 is the original function.
if (!J)
CBClone = CB;
else
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
CBClone->setCalledFunction(NewF);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CBClone)
<< ore::NV("Call" , CBClone) << " in clone "
<< ore::NV("Caller" , CBClone->getFunction())
<< " assigned to call function clone "
<< ore::NV("Callee" , NewF.getCallee()));
5182 | } |
5183 | }; |
5184 | |
5185 | // Locate the summary for F. |
5186 | ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); |
5187 | // If not found, this could be an imported local (see comment in |
5188 | // findValueInfoForFunc). Skip for now as it will be cloned in its original |
5189 | // module (where it would have been promoted to global scope so should |
5190 | // satisfy any reference in this module). |
5191 | if (!TheFnVI) |
5192 | continue; |
5193 | |
5194 | auto *GVSummary = |
ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
if (!GVSummary) {
// Must have been imported, use the summary which matches the definition
// (might be multiple if this was a linkonce_odr).
auto SrcModuleMD = F.getMetadata("thinlto_src_module" );
assert(SrcModuleMD &&
"enable-import-metadata is needed to emit thinlto_src_module" );
StringRef SrcModule =
dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
for (auto &GVS : TheFnVI.getSummaryList()) {
if (GVS->modulePath() == SrcModule) {
GVSummary = GVS.get();
break;
}
}
assert(GVSummary && GVSummary->modulePath() == SrcModule);
}

// If this was an imported alias skip it as we won't have the function
// summary, and it should be cloned in the original module.
if (isa<AliasSummary>(GVSummary))
continue;

auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5219 | |
5220 | if (FS->allocs().empty() && FS->callsites().empty()) |
5221 | continue; |
5222 | |
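// Iterators over this function's summary callsite and allocation records;
// these are advanced in lockstep with the IR instructions visited below.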
5223 | auto SI = FS->callsites().begin(); |
5224 | auto AI = FS->allocs().begin(); |
5225 | |
5226 | // To handle callsite infos synthesized for tail calls which have missing |
5227 | // frames in the profiled context, map callee VI to the synthesized callsite |
5228 | // info. |
5229 | DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; |
5230 | // Iterate the callsites for this function in reverse, since we place all |
5231 | // those synthesized for tail calls at the end. |
5232 | for (auto CallsiteIt = FS->callsites().rbegin(); |
5233 | CallsiteIt != FS->callsites().rend(); CallsiteIt++) { |
5234 | auto &Callsite = *CallsiteIt; |
5235 | // Stop as soon as we see a non-synthesized callsite info (see comment |
5236 | // above loop). All the entries added for discovered tail calls have empty |
5237 | // stack ids. |
5238 | if (!Callsite.StackIdIndices.empty()) |
5239 | break; |
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5241 | } |
5242 | |
5243 | // Keeps track of needed ICP for the function. |
5244 | SmallVector<ICallAnalysisData> ICallAnalysisInfo; |
5245 | |
5246 | // Assume for now that the instructions are in the exact same order |
5247 | // as when the summary was created, but confirm this is correct by |
5248 | // matching the stack ids. |
5249 | for (auto &BB : F) { |
5250 | for (auto &I : BB) { |
auto *CB = dyn_cast<CallBase>(&I);
// Same handling as when creating module summary.
if (!mayHaveMemprofSummary(CB))
continue;

auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
CalledValue = CalledValue->stripPointerCasts();
// Stripping pointer casts can reveal a called function.
CalledFunction = dyn_cast<Function>(CalledValue);
}
// Check if this is an alias to a function. If so, get the
// called aliasee for the checks below.
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias" );
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
}

CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
I.getMetadata(LLVMContext::MD_callsite));
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);

// Include allocs that were already assigned a memprof function
// attribute in the statistics.
if (CB->getAttributes().hasFnAttr("memprof" )) {
assert(!MemProfMD);
CB->getAttributes().getFnAttr("memprof" ).getValueAsString() == "cold"
5280 | ? AllocTypeColdThinBackend++ |
5281 | : AllocTypeNotColdThinBackend++; |
5282 | OrigAllocsThinBackend++; |
5283 | AllocVersionsThinBackend++; |
5284 | if (!MaxAllocVersionsThinBackend) |
5285 | MaxAllocVersionsThinBackend = 1; |
5286 | continue; |
5287 | } |
5288 | |
5289 | if (MemProfMD) { |
5290 | // Consult the next alloc node. |
5291 | assert(AI != FS->allocs().end()); |
5292 | auto &AllocNode = *(AI++); |
5293 | |
5294 | #ifndef NDEBUG |
5295 | checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext, |
5296 | ImportSummary); |
5297 | #endif |
5298 | |
5299 | // Perform cloning if not yet done. |
5300 | CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); |
5301 | |
5302 | OrigAllocsThinBackend++; |
5303 | AllocVersionsThinBackend += AllocNode.Versions.size(); |
5304 | if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) |
5305 | MaxAllocVersionsThinBackend = AllocNode.Versions.size(); |
5306 | |
// If there is only one version, that means we didn't end up considering
// this function for cloning, and in that case the alloc will still be
// None type or should have gotten the default NotCold. Skip it, but only
// after calling the clone helper above, since that performs some sanity
// checks confirming we haven't yet decided that cloning is needed. We
// might still have a single version that is cold due to the
// MinClonedColdBytePercent heuristic; make sure we don't skip in that
// case.
5315 | if (AllocNode.Versions.size() == 1 && |
5316 | (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) { |
5317 | assert((AllocationType)AllocNode.Versions[0] == |
5318 | AllocationType::NotCold || |
5319 | (AllocationType)AllocNode.Versions[0] == |
5320 | AllocationType::None); |
5321 | UnclonableAllocsThinBackend++; |
5322 | continue; |
5323 | } |
5324 | |
5325 | // All versions should have a singular allocation type. |
5326 | assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { |
5327 | return Type == ((uint8_t)AllocationType::NotCold | |
5328 | (uint8_t)AllocationType::Cold); |
5329 | })); |
5330 | |
5331 | // Update the allocation types per the summary info. |
5332 | for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { |
5333 | // Ignore any that didn't get an assigned allocation type. |
5334 | if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) |
5335 | continue; |
5336 | AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; |
5337 | AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ |
5338 | : AllocTypeNotColdThinBackend++; |
std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
auto A = llvm::Attribute::get(F.getContext(), "memprof" ,
AllocTypeString);
CallBase *CBClone;
// Copy 0 is the original function.
if (!J)
CBClone = CB;
else
// Since VMaps are only created for new clones, we index with
// clone J-1 (J==0 is the original clone and does not have a VMaps
// entry).
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
CBClone->addFnAttr(A);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute" , CBClone)
<< ore::NV("AllocationCall" , CBClone) << " in clone "
<< ore::NV("Caller" , CBClone->getFunction())
<< " marked with memprof allocation attribute "
<< ore::NV("Attribute" , AllocTypeString));
5357 | } |
5358 | } else if (!CallsiteContext.empty()) { |
5359 | if (!CalledFunction) { |
5360 | #ifndef NDEBUG |
5361 | // We should have skipped inline assembly calls. |
5362 | auto *CI = dyn_cast<CallInst>(CB); |
5363 | assert(!CI || !CI->isInlineAsm()); |
5364 | #endif |
5365 | // We should have skipped direct calls via a Constant. |
5366 | assert(CalledValue && !isa<Constant>(CalledValue)); |
5367 | |
5368 | // This is an indirect call, see if we have profile information and |
5369 | // whether any clones were recorded for the profiled targets (that |
5370 | // we synthesized CallsiteInfo summary records for when building the |
5371 | // index). |
5372 | auto NumClones = |
recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5374 | |
5375 | // Perform cloning if not yet done. This is done here in case |
5376 | // we don't need to do ICP, but might need to clone this |
5377 | // function as it is the target of other cloned calls. |
5378 | if (NumClones) |
5379 | CloneFuncIfNeeded(NumClones); |
5380 | } |
5381 | |
5382 | else { |
5383 | // Consult the next callsite node. |
5384 | assert(SI != FS->callsites().end()); |
5385 | auto &StackNode = *(SI++); |
5386 | |
5387 | #ifndef NDEBUG |
5388 | // Sanity check that the stack ids match between the summary and |
5389 | // instruction metadata. |
5390 | auto StackIdIndexIter = StackNode.StackIdIndices.begin(); |
5391 | for (auto StackId : CallsiteContext) { |
5392 | assert(StackIdIndexIter != StackNode.StackIdIndices.end()); |
5393 | assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
5394 | StackId); |
5395 | StackIdIndexIter++; |
5396 | } |
5397 | #endif |
5398 | |
5399 | CloneCallsite(StackNode, CB, CalledFunction); |
5400 | } |
5401 | } else if (CB->isTailCall() && CalledFunction) { |
5402 | // Locate the synthesized callsite info for the callee VI, if any was |
5403 | // created, and use that for cloning. |
5404 | ValueInfo CalleeVI = |
findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5408 | assert(Callsite != MapTailCallCalleeVIToCallsite.end()); |
5409 | CloneCallsite(Callsite->second, CB, CalledFunction); |
5410 | } |
5411 | } |
5412 | } |
5413 | } |
5414 | |
5415 | // Now do any promotion required for cloning. |
performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5417 | } |
5418 | |
5419 | // We skip some of the functions and instructions above, so remove all the |
5420 | // metadata in a single sweep here. |
5421 | for (auto &F : M) { |
5422 | // We can skip memprof clones because createFunctionClones already strips |
5423 | // the metadata from the newly created clones. |
5424 | if (F.isDeclaration() || isMemProfClone(F)) |
5425 | continue; |
5426 | for (auto &BB : F) { |
5427 | for (auto &I : BB) { |
if (!isa<CallBase>(I))
continue;
I.setMetadata(LLVMContext::MD_memprof, nullptr);
I.setMetadata(LLVMContext::MD_callsite, nullptr);
5432 | } |
5433 | } |
5434 | } |
5435 | |
5436 | return Changed; |
5437 | } |
5438 | |
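// For an indirect call with value profile data, walk the CallsiteInfo summary
// records synthesized for its profiled targets and decide whether any clone of
// this callsite must call a cloned target (in which case promotion is needed).
// Returns the number of clones of the enclosing function and, when promotion
// is needed, records the data required to perform it later in performICP.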
5439 | unsigned MemProfContextDisambiguation::recordICPInfo( |
5440 | CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites, |
5441 | ArrayRef<CallsiteInfo>::iterator &SI, |
5442 | SmallVector<ICallAnalysisData> &ICallAnalysisInfo) { |
5443 | // First see if we have profile information for this indirect call. |
5444 | uint32_t NumCandidates; |
5445 | uint64_t TotalCount; |
5446 | auto CandidateProfileData = |
ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5448 | NumCandidates); |
5449 | if (CandidateProfileData.empty()) |
5450 | return 0; |
5451 | |
5452 | // Iterate through all of the candidate profiled targets along with the |
5453 | // CallsiteInfo summary records synthesized for them when building the index, |
5454 | // and see if any are cloned and/or refer to clones. |
5455 | bool ICPNeeded = false; |
5456 | unsigned NumClones = 0; |
size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5458 | for (const auto &Candidate : CandidateProfileData) { |
5459 | #ifndef NDEBUG |
5460 | auto CalleeValueInfo = |
5461 | #endif |
ImportSummary->getValueInfo(Candidate.Value);
5463 | // We might not have a ValueInfo if this is a distributed |
5464 | // ThinLTO backend and decided not to import that function. |
5465 | assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo); |
5466 | assert(SI != AllCallsites.end()); |
5467 | auto &StackNode = *(SI++); |
5468 | // See if any of the clones of the indirect callsite for this |
5469 | // profiled target should call a cloned version of the profiled |
5470 | // target. We only need to do the ICP here if so. |
ICPNeeded |= llvm::any_of(StackNode.Clones,
[](unsigned CloneNo) { return CloneNo != 0; });
5473 | // Every callsite in the same function should have been cloned the same |
5474 | // number of times. |
5475 | assert(!NumClones || NumClones == StackNode.Clones.size()); |
5476 | NumClones = StackNode.Clones.size(); |
5477 | } |
5478 | if (!ICPNeeded) |
5479 | return NumClones; |
5480 | // Save information for ICP, which is performed later to avoid messing up the |
5481 | // current function traversal. |
ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
TotalCount, CallsiteInfoStartIndex});
5484 | return NumClones; |
5485 | } |
5486 | |
5487 | void MemProfContextDisambiguation::performICP( |
5488 | Module &M, ArrayRef<CallsiteInfo> AllCallsites, |
5489 | ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, |
5490 | ArrayRef<ICallAnalysisData> ICallAnalysisInfo, |
5491 | OptimizationRemarkEmitter &ORE) { |
5492 | // Now do any promotion required for cloning. Specifically, for each |
5493 | // recorded ICP candidate (which was only recorded because one clone of that |
5494 | // candidate should call a cloned target), we perform ICP (speculative |
5495 | // devirtualization) for each clone of the callsite, and update its callee |
5496 | // to the appropriate clone. Note that the ICP compares against the original |
5497 | // version of the target, which is what is in the vtable. |
5498 | for (auto &Info : ICallAnalysisInfo) { |
5499 | auto *CB = Info.CB; |
5500 | auto CallsiteIndex = Info.CallsiteInfoStartIndex; |
5501 | auto TotalCount = Info.TotalCount; |
5502 | unsigned NumPromoted = 0; |
5503 | unsigned NumClones = 0; |
5504 | |
5505 | for (auto &Candidate : Info.CandidateProfileData) { |
5506 | auto &StackNode = AllCallsites[CallsiteIndex++]; |
5507 | |
5508 | // All calls in the same function must have the same number of clones. |
5509 | assert(!NumClones || NumClones == StackNode.Clones.size()); |
5510 | NumClones = StackNode.Clones.size(); |
5511 | |
5512 | // See if the target is in the module. If it wasn't imported, it is |
5513 | // possible that this profile could have been collected on a different |
5514 | // target (or version of the code), and we need to be conservative |
5515 | // (similar to what is done in the ICP pass). |
Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5517 | if (TargetFunction == nullptr || |
5518 | // Any ThinLTO global dead symbol removal should have already |
5519 | // occurred, so it should be safe to promote when the target is a |
5520 | // declaration. |
5521 | // TODO: Remove internal option once more fully tested. |
5522 | (MemProfRequireDefinitionForPromotion && |
5523 | TargetFunction->isDeclaration())) { |
ORE.emit([&]() {
5525 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget" , CB) |
5526 | << "Memprof cannot promote indirect call: target with md5sum " |
5527 | << ore::NV("target md5sum" , Candidate.Value) << " not found" ; |
5528 | }); |
5529 | // FIXME: See if we can use the new declaration importing support to |
5530 | // at least get the declarations imported for this case. Hot indirect |
5531 | // targets should have been imported normally, however. |
5532 | continue; |
5533 | } |
5534 | |
5535 | // Check if legal to promote |
5536 | const char *Reason = nullptr; |
if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
ORE.emit([&]() {
5539 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote" , CB) |
5540 | << "Memprof cannot promote indirect call to " |
5541 | << ore::NV("TargetFunction" , TargetFunction) |
5542 | << " with count of " << ore::NV("TotalCount" , TotalCount) |
5543 | << ": " << Reason; |
5544 | }); |
5545 | continue; |
5546 | } |
5547 | |
5548 | assert(!isMemProfClone(*TargetFunction)); |
5549 | |
5550 | // Handle each call clone, applying ICP so that each clone directly |
5551 | // calls the specified callee clone, guarded by the appropriate ICP |
5552 | // check. |
5553 | CallBase *CBClone = CB; |
5554 | for (unsigned J = 0; J < NumClones; J++) { |
5555 | // Copy 0 is the original function. |
5556 | if (J > 0) |
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5558 | // We do the promotion using the original name, so that the comparison |
5559 | // is against the name in the vtable. Then just below, change the new |
5560 | // direct call to call the cloned function. |
5561 | auto &DirectCall = |
pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
TotalCount, isSamplePGO, &ORE);
5564 | auto *TargetToUse = TargetFunction; |
5565 | // Call original if this version calls the original version of its |
5566 | // callee. |
5567 | if (StackNode.Clones[J]) { |
5568 | TargetToUse = |
cast<Function>(M.getOrInsertFunction(
getMemProfFuncName(TargetFunction->getName(),
StackNode.Clones[J]),
TargetFunction->getFunctionType())
5573 | .getCallee()); |
5574 | } |
5575 | DirectCall.setCalledFunction(TargetToUse); |
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CBClone)
5577 | << ore::NV("Call" , CBClone) << " in clone " |
5578 | << ore::NV("Caller" , CBClone->getFunction()) |
5579 | << " promoted and assigned to call function clone " |
5580 | << ore::NV("Callee" , TargetToUse)); |
5581 | } |
5582 | |
5583 | // Update TotalCount (all clones should get same count above) |
5584 | TotalCount -= Candidate.Count; |
5585 | NumPromoted++; |
5586 | } |
5587 | // Adjust the MD.prof metadata for all clones, now that we have the new |
5588 | // TotalCount and the number promoted. |
5589 | CallBase *CBClone = CB; |
5590 | for (unsigned J = 0; J < NumClones; J++) { |
5591 | // Copy 0 is the original function. |
5592 | if (J > 0) |
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
// First delete the old one.
CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
// If all promoted, we don't need the MD.prof metadata.
// Otherwise we need to update it with the un-promoted records.
if (TotalCount != 0)
annotateValueSite(
M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
5602 | } |
5603 | } |
5604 | } |
5605 | |
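// Top-level driver for whole-program context disambiguation on the graph:
// identify the needed callsite/allocation clones, then assign them to function
// clones, with optional dumping, dot export, and verification between phases.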
5606 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
5607 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { |
5608 | if (DumpCCG) { |
5609 | dbgs() << "CCG before cloning:\n" ; |
5610 | dbgs() << *this; |
5611 | } |
5612 | if (ExportToDot) |
exportToDot("postbuild" );
5614 | |
5615 | if (VerifyCCG) { |
5616 | check(); |
5617 | } |
5618 | |
5619 | identifyClones(); |
5620 | |
5621 | if (VerifyCCG) { |
5622 | check(); |
5623 | } |
5624 | |
5625 | if (DumpCCG) { |
5626 | dbgs() << "CCG after cloning:\n" ; |
5627 | dbgs() << *this; |
5628 | } |
5629 | if (ExportToDot) |
exportToDot("cloned" );
5631 | |
5632 | bool Changed = assignFunctions(); |
5633 | |
5634 | if (DumpCCG) { |
5635 | dbgs() << "CCG after assigning function clones:\n" ; |
5636 | dbgs() << *this; |
5637 | } |
5638 | if (ExportToDot) |
exportToDot("clonefuncassign" );
5640 | |
5641 | if (MemProfReportHintedSizes) |
printTotalSizes(errs());
5643 | |
5644 | return Changed; |
5645 | } |
5646 | |
bool MemProfContextDisambiguation::processModule(
Module &M,
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5650 | |
5651 | // If we have an import summary, then the cloning decisions were made during |
5652 | // the thin link on the index. Apply them and return. |
5653 | if (ImportSummary) |
5654 | return applyImport(M); |
5655 | |
5656 | // TODO: If/when other types of memprof cloning are enabled beyond just for |
5657 | // hot and cold, we will need to change this to individually control the |
5658 | // AllocationType passed to addStackNodesForMIB during CCG construction. |
5659 | // Note that we specifically check this after applying imports above, so that |
5660 | // the option isn't needed to be passed to distributed ThinLTO backend |
5661 | // clang processes, which won't necessarily have visibility into the linker |
5662 | // dependences. Instead the information is communicated from the LTO link to |
5663 | // the backends via the combined summary index. |
5664 | if (!SupportsHotColdNew) |
5665 | return false; |
5666 | |
5667 | ModuleCallsiteContextGraph CCG(M, OREGetter); |
5668 | return CCG.process(); |
5669 | } |
5670 | |
5671 | MemProfContextDisambiguation::MemProfContextDisambiguation( |
5672 | const ModuleSummaryIndex *Summary, bool isSamplePGO) |
5673 | : ImportSummary(Summary), isSamplePGO(isSamplePGO) { |
5674 | // Check the dot graph printing options once here, to make sure we have valid |
5675 | // and expected combinations. |
5676 | if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences()) |
llvm::report_fatal_error(
"-memprof-dot-scope=alloc requires -memprof-dot-alloc-id" );
if (DotGraphScope == DotScope::Context &&
!ContextIdForDot.getNumOccurrences())
llvm::report_fatal_error(
"-memprof-dot-scope=context requires -memprof-dot-context-id" );
if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
ContextIdForDot.getNumOccurrences())
llvm::report_fatal_error(
"-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
"-memprof-dot-context-id" );
5688 | if (ImportSummary) { |
5689 | // The MemProfImportSummary should only be used for testing ThinLTO |
5690 | // distributed backend handling via opt, in which case we don't have a |
5691 | // summary from the pass pipeline. |
5692 | assert(MemProfImportSummary.empty()); |
5693 | return; |
5694 | } |
5695 | if (MemProfImportSummary.empty()) |
5696 | return; |
5697 | |
5698 | auto ReadSummaryFile = |
errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
if (!ReadSummaryFile) {
logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
"Error loading file '" + MemProfImportSummary +
"': " );
return;
}
auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
if (!ImportSummaryForTestingOrErr) {
logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
"Error parsing file '" + MemProfImportSummary +
"': " );
5711 | return; |
5712 | } |
5713 | ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); |
5714 | ImportSummary = ImportSummaryForTesting.get(); |
5715 | } |
5716 | |
5717 | PreservedAnalyses MemProfContextDisambiguation::run(Module &M, |
5718 | ModuleAnalysisManager &AM) { |
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
5722 | }; |
5723 | if (!processModule(M, OREGetter)) |
5724 | return PreservedAnalyses::all(); |
5725 | return PreservedAnalyses::none(); |
5726 | } |
5727 | |
5728 | void MemProfContextDisambiguation::run( |
5729 | ModuleSummaryIndex &Index, |
5730 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
5731 | isPrevailing) { |
5732 | // TODO: If/when other types of memprof cloning are enabled beyond just for |
5733 | // hot and cold, we will need to change this to individually control the |
5734 | // AllocationType passed to addStackNodesForMIB during CCG construction. |
5735 | // The index was set from the option, so these should be in sync. |
5736 | assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); |
5737 | if (!SupportsHotColdNew) |
5738 | return; |
5739 | |
5740 | IndexCallsiteContextGraph CCG(Index, isPrevailing); |
5741 | CCG.process(); |
5742 | } |
5743 | |