//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringExtras.h"
33#include "llvm/Analysis/MemoryProfileInfo.h"
34#include "llvm/Analysis/ModuleSummaryAnalysis.h"
35#include "llvm/Analysis/OptimizationRemarkEmitter.h"
36#include "llvm/Bitcode/BitcodeReader.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/Module.h"
39#include "llvm/IR/ModuleSummaryIndex.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/GraphWriter.h"
43#include "llvm/Support/InterleavedRange.h"
44#include "llvm/Support/SHA1.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Transforms/IPO.h"
47#include "llvm/Transforms/Utils/CallPromotionUtils.h"
48#include "llvm/Transforms/Utils/Cloning.h"
49#include "llvm/Transforms/Utils/Instrumentation.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114STATISTIC(AliaseesPrevailingInDiffModuleFromAlias,
115 "Number of aliasees prevailing in a different module than its alias");
116
117static cl::opt<std::string> DotFilePathPrefix(
118 "memprof-dot-file-path-prefix", cl::init(Val: ""), cl::Hidden,
119 cl::value_desc("filename"),
120 cl::desc("Specify the path prefix of the MemProf dot files."));
121
122static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(Val: false),
123 cl::Hidden,
124 cl::desc("Export graph to dot files."));
125
126// TODO: Remove this option once new handling is validated more widely.
127static cl::opt<bool> DoMergeIteration(
128 "memprof-merge-iteration", cl::init(Val: true), cl::Hidden,
129 cl::desc("Iteratively apply merging on a node to catch new callers"));
130
131// How much of the graph to export to dot.
132enum DotScope {
133 All, // The full CCG graph.
134 Alloc, // Only contexts for the specified allocation.
135 Context, // Only the specified context.
136};
137
138static cl::opt<DotScope> DotGraphScope(
139 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
140 cl::Hidden, cl::init(Val: DotScope::All),
141 cl::values(
142 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
143 clEnumValN(DotScope::Alloc, "alloc",
144 "Export only nodes with contexts feeding given "
145 "-memprof-dot-alloc-id"),
146 clEnumValN(DotScope::Context, "context",
147 "Export only nodes with given -memprof-dot-context-id")));
148
149static cl::opt<unsigned>
150 AllocIdForDot("memprof-dot-alloc-id", cl::init(Val: 0), cl::Hidden,
151 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
152 "or to highlight if -memprof-dot-scope=all"));
153
154static cl::opt<unsigned> ContextIdForDot(
155 "memprof-dot-context-id", cl::init(Val: 0), cl::Hidden,
156 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
157 "highlight otherwise"));
158
159static cl::opt<bool>
160 DumpCCG("memprof-dump-ccg", cl::init(Val: false), cl::Hidden,
161 cl::desc("Dump CallingContextGraph to stdout after each stage."));
162
163static cl::opt<bool>
164 VerifyCCG("memprof-verify-ccg", cl::init(Val: false), cl::Hidden,
165 cl::desc("Perform verification checks on CallingContextGraph."));
166
167static cl::opt<bool>
168 VerifyNodes("memprof-verify-nodes", cl::init(Val: false), cl::Hidden,
169 cl::desc("Perform frequent verification checks on nodes."));
170
171static cl::opt<std::string> MemProfImportSummary(
172 "memprof-import-summary",
173 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
174 cl::Hidden);
175
176static cl::opt<unsigned>
177 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(Val: 5),
178 cl::Hidden,
179 cl::desc("Max depth to recursively search for missing "
180 "frames through tail calls."));
181
182// Optionally enable cloning of callsites involved with recursive cycles
183static cl::opt<bool> AllowRecursiveCallsites(
184 "memprof-allow-recursive-callsites", cl::init(Val: true), cl::Hidden,
185 cl::desc("Allow cloning of callsites involved in recursive cycles"));
186
187static cl::opt<bool> CloneRecursiveContexts(
188 "memprof-clone-recursive-contexts", cl::init(Val: true), cl::Hidden,
189 cl::desc("Allow cloning of contexts through recursive cycles"));
190
191// Generally this is needed for correct assignment of allocation clones to
192// function clones, however, allow it to be disabled for debugging while the
193// functionality is new and being tested more widely.
194static cl::opt<bool>
195 MergeClones("memprof-merge-clones", cl::init(Val: true), cl::Hidden,
196 cl::desc("Merge clones before assigning functions"));
197
198// When disabled, try to detect and prevent cloning of recursive contexts.
199// This is only necessary until we support cloning through recursive cycles.
200// Leave on by default for now, as disabling requires a little bit of compile
201// time overhead and doesn't affect correctness, it will just inflate the cold
202// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
203static cl::opt<bool> AllowRecursiveContexts(
204 "memprof-allow-recursive-contexts", cl::init(Val: true), cl::Hidden,
205 cl::desc("Allow cloning of contexts having recursive cycles"));
206
207// Set the minimum absolute count threshold for allowing inlining of indirect
208// calls promoted during cloning.
209static cl::opt<unsigned> MemProfICPNoInlineThreshold(
210 "memprof-icp-noinline-threshold", cl::init(Val: 2), cl::Hidden,
211 cl::desc("Minimum absolute count for promoted target to be inlinable"));
212
213namespace llvm {
214cl::opt<bool> EnableMemProfContextDisambiguation(
215 "enable-memprof-context-disambiguation", cl::Hidden,
216 cl::desc("Enable MemProf context disambiguation"));
217
218// Indicate we are linking with an allocator that supports hot/cold operator
219// new interfaces.
220cl::opt<bool> SupportsHotColdNew(
221 "supports-hot-cold-new", cl::init(Val: false), cl::Hidden,
222 cl::desc("Linking with hot/cold operator new interfaces"));
223
224static cl::opt<bool> MemProfRequireDefinitionForPromotion(
225 "memprof-require-definition-for-promotion", cl::init(Val: false), cl::Hidden,
226 cl::desc(
227 "Require target function definition when promoting indirect calls"));
228
229extern cl::opt<bool> MemProfReportHintedSizes;
230extern cl::opt<unsigned> MinClonedColdBytePercent;
231
232cl::opt<unsigned> MemProfTopNImportant(
233 "memprof-top-n-important", cl::init(Val: 10), cl::Hidden,
234 cl::desc("Number of largest cold contexts to consider important"));
235
236cl::opt<bool> MemProfFixupImportant(
237 "memprof-fixup-important", cl::init(Val: true), cl::Hidden,
238 cl::desc("Enables edge fixup for important contexts"));
239
240extern cl::opt<unsigned> MaxSummaryIndirectEdges;
241
242} // namespace llvm
243
244namespace {
245
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
260template <typename DerivedCCG, typename FuncTy, typename CallTy>
261class CallsiteContextGraph {
262public:
263 CallsiteContextGraph() = default;
264 CallsiteContextGraph(const CallsiteContextGraph &) = default;
265 CallsiteContextGraph(CallsiteContextGraph &&) = default;
266
267 /// Main entry point to perform analysis and transformations on graph.
268 bool process(function_ref<void(StringRef, StringRef, const Twine &)>
269 EmitRemark = nullptr,
270 bool AllowExtraAnalysis = false);
271
272 /// Perform cloning on the graph necessary to uniquely identify the allocation
273 /// behavior of an allocation based on its context.
274 void identifyClones();
275
276 /// Assign callsite clones to functions, cloning functions as needed to
277 /// accommodate the combinations of their callsite clones reached by callers.
278 /// For regular LTO this clones functions and callsites in the IR, but for
279 /// ThinLTO the cloning decisions are noted in the summaries and later applied
280 /// in applyImport.
281 bool assignFunctions();
282
283 void dump() const;
284 void print(raw_ostream &OS) const;
285 void printTotalSizes(raw_ostream &OS,
286 function_ref<void(StringRef, StringRef, const Twine &)>
287 EmitRemark = nullptr) const;
288
289 friend raw_ostream &operator<<(raw_ostream &OS,
290 const CallsiteContextGraph &CCG) {
291 CCG.print(OS);
292 return OS;
293 }
294
295 friend struct GraphTraits<
296 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
297 friend struct DOTGraphTraits<
298 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
299
300 void exportToDot(std::string Label) const;
301
302 /// Represents a function clone via FuncTy pointer and clone number pair.
303 struct FuncInfo final
304 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
305 using Base = std::pair<FuncTy *, unsigned>;
306 FuncInfo(const Base &B) : Base(B) {}
307 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
308 explicit operator bool() const { return this->first != nullptr; }
309 FuncTy *func() const { return this->first; }
310 unsigned cloneNo() const { return this->second; }
311 };
312
313 /// Represents a callsite clone via CallTy and clone number pair.
314 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
315 using Base = std::pair<CallTy, unsigned>;
316 CallInfo(const Base &B) : Base(B) {}
317 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
318 : Base(Call, CloneNo) {}
319 explicit operator bool() const { return (bool)this->first; }
320 CallTy call() const { return this->first; }
321 unsigned cloneNo() const { return this->second; }
322 void setCloneNo(unsigned N) { this->second = N; }
323 void print(raw_ostream &OS) const {
324 if (!operator bool()) {
325 assert(!cloneNo());
326 OS << "null Call";
327 return;
328 }
329 call()->print(OS);
330 OS << "\t(clone " << cloneNo() << ")";
331 }
332 void dump() const {
333 print(OS&: dbgs());
334 dbgs() << "\n";
335 }
336 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
337 Call.print(OS);
338 return OS;
339 }
340 };
341
342 struct ContextEdge;
343
344 /// Node in the Callsite Context Graph
345 struct ContextNode {
346 // Assigned to nodes as they are created, useful for debugging.
347 unsigned NodeId = 0;
348
349 // Keep this for now since in the IR case where we have an Instruction* it
350 // is not as immediately discoverable. Used for printing richer information
351 // when dumping graph.
352 bool IsAllocation;
353
354 // Keeps track of when the Call was reset to null because there was
355 // recursion.
356 bool Recursive = false;
357
358 // This will be formed by ORing together the AllocationType enum values
359 // for contexts including this node.
360 uint8_t AllocTypes = 0;
361
362 // The corresponding allocation or interior call. This is the primary call
363 // for which we have created this node.
364 CallInfo Call;
365
366 // List of other calls that can be treated the same as the primary call
367 // through cloning. I.e. located in the same function and have the same
368 // (possibly pruned) stack ids. They will be updated the same way as the
369 // primary call when assigning to function clones.
370 SmallVector<CallInfo, 0> MatchingCalls;
371
372 // For alloc nodes this is a unique id assigned when constructed, and for
373 // callsite stack nodes it is the original stack id when the node is
374 // constructed from the memprof MIB metadata on the alloc nodes. Note that
375 // this is only used when matching callsite metadata onto the stack nodes
376 // created when processing the allocation memprof MIBs, and for labeling
377 // nodes in the dot graph. Therefore we don't bother to assign a value for
378 // clones.
379 uint64_t OrigStackOrAllocId = 0;
380
381 // Edges to all callees in the profiled call stacks.
382 // TODO: Should this be a map (from Callee node) for more efficient lookup?
383 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
384
385 // Edges to all callers in the profiled call stacks.
386 // TODO: Should this be a map (from Caller node) for more efficient lookup?
387 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
388
389 // Returns true if we need to look at the callee edges for determining the
390 // node context ids and allocation type.
391 bool useCallerEdgesForContextInfo() const {
392 // Typically if the callee edges are empty either the caller edges are
393 // also empty, or this is an allocation (leaf node). However, if we are
394 // allowing recursive callsites and contexts this will be violated for
395 // incompletely cloned recursive cycles.
396 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
397 (AllowRecursiveCallsites && AllowRecursiveContexts));
398 // When cloning for a recursive context, during cloning we might be in the
399 // midst of cloning for a recurrence and have moved context ids off of a
400 // caller edge onto the clone but not yet off of the incoming caller
401 // (back) edge. If we don't look at those we miss the fact that this node
402 // still has context ids of interest.
403 return IsAllocation || CloneRecursiveContexts;
404 }
405
406 // Compute the context ids for this node from the union of its edge context
407 // ids.
408 DenseSet<uint32_t> getContextIds() const {
409 unsigned Count = 0;
410 // Compute the number of ids for reserve below. In general we only need to
411 // look at one set of edges, typically the callee edges, since other than
412 // allocations and in some cases during recursion cloning, all the context
413 // ids on the callers should also flow out via callee edges.
414 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
415 Count += Edge->getContextIds().size();
416 DenseSet<uint32_t> ContextIds;
417 ContextIds.reserve(Size: Count);
418 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
419 CalleeEdges, useCallerEdgesForContextInfo()
420 ? CallerEdges
421 : std::vector<std::shared_ptr<ContextEdge>>());
422 for (const auto &Edge : Edges)
423 ContextIds.insert_range(Edge->getContextIds());
424 return ContextIds;
425 }
426
427 // Compute the allocation type for this node from the OR of its edge
428 // allocation types.
429 uint8_t computeAllocType() const {
430 uint8_t BothTypes =
431 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
432 uint8_t AllocType = (uint8_t)AllocationType::None;
433 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
434 CalleeEdges, useCallerEdgesForContextInfo()
435 ? CallerEdges
436 : std::vector<std::shared_ptr<ContextEdge>>());
437 for (const auto &Edge : Edges) {
438 AllocType |= Edge->AllocTypes;
439 // Bail early if alloc type reached both, no further refinement.
440 if (AllocType == BothTypes)
441 return AllocType;
442 }
443 return AllocType;
444 }
445
446 // The context ids set for this node is empty if its edge context ids are
447 // also all empty.
448 bool emptyContextIds() const {
449 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
450 CalleeEdges, useCallerEdgesForContextInfo()
451 ? CallerEdges
452 : std::vector<std::shared_ptr<ContextEdge>>());
453 for (const auto &Edge : Edges) {
454 if (!Edge->getContextIds().empty())
455 return false;
456 }
457 return true;
458 }
459
460 // List of clones of this ContextNode, initially empty.
461 std::vector<ContextNode *> Clones;
462
463 // If a clone, points to the original uncloned node.
464 ContextNode *CloneOf = nullptr;
465
466 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
467
468 ContextNode(bool IsAllocation, CallInfo C)
469 : IsAllocation(IsAllocation), Call(C) {}
470
471 void addClone(ContextNode *Clone) {
472 if (CloneOf) {
473 CloneOf->Clones.push_back(Clone);
474 Clone->CloneOf = CloneOf;
475 } else {
476 Clones.push_back(Clone);
477 assert(!Clone->CloneOf);
478 Clone->CloneOf = this;
479 }
480 }
481
482 ContextNode *getOrigNode() {
483 if (!CloneOf)
484 return this;
485 return CloneOf;
486 }
487
488 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
489 unsigned int ContextId);
490
491 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
492 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
493 void eraseCalleeEdge(const ContextEdge *Edge);
494 void eraseCallerEdge(const ContextEdge *Edge);
495
496 void setCall(CallInfo C) { Call = std::move(C); }
497
498 bool hasCall() const { return (bool)Call.call(); }
499
500 void printCall(raw_ostream &OS) const { Call.print(OS); }
501
502 // True if this node was effectively removed from the graph, in which case
503 // it should have an allocation type of None and empty context ids.
504 bool isRemoved() const {
505 // Typically if the callee edges are empty either the caller edges are
506 // also empty, or this is an allocation (leaf node). However, if we are
507 // allowing recursive callsites and contexts this will be violated for
508 // incompletely cloned recursive cycles.
509 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
510 (AllocTypes == (uint8_t)AllocationType::None) ==
511 emptyContextIds());
512 return AllocTypes == (uint8_t)AllocationType::None;
513 }
514
515 void dump() const;
516 void print(raw_ostream &OS) const;
517
518 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
519 Node.print(OS);
520 return OS;
521 }
522 };
523
524 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
525 /// callee.
526 struct ContextEdge {
527 ContextNode *Callee;
528 ContextNode *Caller;
529
530 // This will be formed by ORing together the AllocationType enum values
531 // for contexts including this edge.
532 uint8_t AllocTypes = 0;
533
534 // Set just before initiating cloning when cloning of recursive contexts is
535 // enabled. Used to defer cloning of backedges until we have done cloning of
536 // the callee node for non-backedge caller edges. This exposes cloning
537 // opportunities through the backedge of the cycle.
538 // TODO: Note that this is not updated during cloning, and it is unclear
539 // whether that would be needed.
540 bool IsBackedge = false;
541
542 // The set of IDs for contexts including this edge.
543 DenseSet<uint32_t> ContextIds;
544
545 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
546 DenseSet<uint32_t> ContextIds)
547 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
548 ContextIds(std::move(ContextIds)) {}
549
550 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
551
552 // Helper to clear the fields of this edge when we are removing it from the
553 // graph.
554 inline void clear() {
555 ContextIds.clear();
556 AllocTypes = (uint8_t)AllocationType::None;
557 Caller = nullptr;
558 Callee = nullptr;
559 }
560
561 // Check if edge was removed from the graph. This is useful while iterating
562 // over a copy of edge lists when performing operations that mutate the
563 // graph in ways that might remove one of the edges.
564 inline bool isRemoved() const {
565 if (Callee || Caller)
566 return false;
567 // Any edges that have been removed from the graph but are still in a
568 // shared_ptr somewhere should have all fields null'ed out by clear()
569 // above.
570 assert(AllocTypes == (uint8_t)AllocationType::None);
571 assert(ContextIds.empty());
572 return true;
573 }
574
575 void dump() const;
576 void print(raw_ostream &OS) const;
577
578 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
579 Edge.print(OS);
580 return OS;
581 }
582 };
583
584 /// Helpers to remove edges that have allocation type None (due to not
585 /// carrying any context ids) after transformations.
586 void removeNoneTypeCalleeEdges(ContextNode *Node);
587 void removeNoneTypeCallerEdges(ContextNode *Node);
588 void
589 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
590 DenseSet<const ContextNode *> &Visited);
591
592protected:
593 /// Get a list of nodes corresponding to the stack ids in the given callsite
594 /// context.
595 template <class NodeT, class IteratorT>
596 std::vector<uint64_t>
597 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
598
599 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
600 /// metadata (or summary).
601 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
602
603 /// Adds nodes for the given MIB stack ids.
604 template <class NodeT, class IteratorT>
605 void addStackNodesForMIB(
606 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
607 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
608 ArrayRef<ContextTotalSize> ContextSizeInfo,
609 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
610
611 /// Matches all callsite metadata (or summary) to the nodes created for
612 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
613 /// inlining performed on those callsite instructions.
614 void updateStackNodes();
615
616 /// Optionally fixup edges for the N largest cold contexts to better enable
617 /// cloning. This is particularly helpful if the context includes recursion
618 /// as well as inlining, resulting in a single stack node for multiple stack
619 /// ids in the context. With recursion it is particularly difficult to get the
620 /// edge updates correct as in the general case we have lost the original
621 /// stack id ordering for the context. Do more expensive fixup for the largest
622 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
623 void fixupImportantContexts();
624
625 /// Update graph to conservatively handle any callsite stack nodes that target
626 /// multiple different callee target functions.
627 void handleCallsitesWithMultipleTargets();
628
629 /// Mark backedges via the standard DFS based backedge algorithm.
630 void markBackedges();
631
632 /// Merge clones generated during cloning for different allocations but that
633 /// are called by the same caller node, to ensure proper function assignment.
634 void mergeClones();
635
636 // Try to partition calls on the given node (already placed into the AllCalls
637 // array) by callee function, creating new copies of Node as needed to hold
638 // calls with different callees, and moving the callee edges appropriately.
639 // Returns true if partitioning was successful.
640 bool partitionCallsByCallee(
641 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
642 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
643
644 /// Save lists of calls with MemProf metadata in each function, for faster
645 /// iteration.
646 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
647
648 /// Map from callsite node to the enclosing caller function.
649 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
650
651 // When exporting to dot, and an allocation id is specified, contains the
652 // context ids on that allocation.
653 DenseSet<uint32_t> DotAllocContextIds;
654
655private:
656 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
657
658 // Structure to keep track of information for each call as we are matching
659 // non-allocation callsites onto context nodes created from the allocation
660 // call metadata / summary contexts.
661 struct CallContextInfo {
662 // The callsite we're trying to match.
663 CallTy Call;
664 // The callsites stack ids that have a context node in the graph.
665 std::vector<uint64_t> StackIds;
666 // The function containing this callsite.
667 const FuncTy *Func;
668 // Initially empty, if needed this will be updated to contain the context
669 // ids for use in a new context node created for this callsite.
670 DenseSet<uint32_t> ContextIds;
671 };
672
673 /// Helper to remove edge from graph, updating edge iterator if it is provided
674 /// (in which case CalleeIter indicates which edge list is being iterated).
675 /// This will also perform the necessary clearing of the ContextEdge members
676 /// to enable later checking if the edge has been removed (since we may have
677 /// other copies of the shared_ptr in existence, and in fact rely on this to
678 /// enable removal while iterating over a copy of a node's edge list).
679 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
680 bool CalleeIter = true);
681
682 /// Assigns the given Node to calls at or inlined into the location with
683 /// the Node's stack id, after post order traversing and processing its
684 /// caller nodes. Uses the call information recorded in the given
685 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
686 /// as needed. Called by updateStackNodes which sets up the given
687 /// StackIdToMatchingCalls map.
688 void assignStackNodesPostOrder(
689 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
690 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
691 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
692 const DenseSet<uint32_t> &ImportantContextIds);
693
694 /// Duplicates the given set of context ids, updating the provided
695 /// map from each original id with the newly generated context ids,
696 /// and returning the new duplicated id set.
697 DenseSet<uint32_t> duplicateContextIds(
698 const DenseSet<uint32_t> &StackSequenceContextIds,
699 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
700
701 /// Propagates all duplicated context ids across the graph.
702 void propagateDuplicateContextIds(
703 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
704
705 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
706 /// else to its callers. Also updates OrigNode's edges to remove any context
707 /// ids moved to the newly created edge.
708 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
709 bool TowardsCallee,
710 DenseSet<uint32_t> RemainingContextIds);
711
712 /// Get the stack id corresponding to the given Id or Index (for IR this will
713 /// return itself, for a summary index this will return the id recorded in the
714 /// index for that stack id index value).
715 uint64_t getStackId(uint64_t IdOrIndex) const {
716 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
717 }
718
719 /// Returns true if the given call targets the callee of the given edge, or if
720 /// we were able to identify the call chain through intermediate tail calls.
721 /// In the latter case new context nodes are added to the graph for the
722 /// identified tail calls, and their synthesized nodes are added to
723 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
724 /// the updated edges and to prepare it for an increment in the caller.
725 bool
726 calleesMatch(CallTy Call, EdgeIter &EI,
727 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
728
729 // Return the callee function of the given call, or nullptr if it can't be
730 // determined
731 const FuncTy *getCalleeFunc(CallTy Call) {
732 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
733 }
734
735 /// Returns true if the given call targets the given function, or if we were
736 /// able to identify the call chain through intermediate tail calls (in which
737 /// case FoundCalleeChain will be populated).
738 bool calleeMatchesFunc(
739 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
740 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
741 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
742 Call, Func, CallerFunc, FoundCalleeChain);
743 }
744
745 /// Returns true if both call instructions have the same callee.
746 bool sameCallee(CallTy Call1, CallTy Call2) {
747 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
748 }
749
750 /// Get a list of nodes corresponding to the stack ids in the given
751 /// callsite's context.
752 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
753 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
754 Call);
755 }
756
757 /// Get the last stack id in the context for callsite.
758 uint64_t getLastStackId(CallTy Call) {
759 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
760 }
761
762 /// Update the allocation call to record type of allocated memory.
763 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
764 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
765 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
766 }
767
768 /// Get the AllocationType assigned to the given allocation instruction clone.
769 AllocationType getAllocationCallType(const CallInfo &Call) const {
770 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
771 }
772
773 /// Update non-allocation call to invoke (possibly cloned) function
774 /// CalleeFunc.
775 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
776 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
777 }
778
779 /// Clone the given function for the given callsite, recording mapping of all
780 /// of the functions tracked calls to their new versions in the CallMap.
781 /// Assigns new clones to clone number CloneNo.
782 FuncInfo cloneFunctionForCallsite(
783 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
784 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
785 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
786 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
787 }
788
789 /// Gets a label to use in the dot graph for the given call clone in the given
790 /// function.
791 std::string getLabel(const FuncTy *Func, const CallTy Call,
792 unsigned CloneNo) const {
793 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
794 }
795
796 // Create and return a new ContextNode.
797 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
798 CallInfo C = CallInfo()) {
799 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
800 auto *NewNode = NodeOwner.back().get();
801 if (F)
802 NodeToCallingFunc[NewNode] = F;
803 NewNode->NodeId = NodeOwner.size();
804 return NewNode;
805 }
806
807 /// Helpers to find the node corresponding to the given call or stackid.
808 ContextNode *getNodeForInst(const CallInfo &C);
809 ContextNode *getNodeForAlloc(const CallInfo &C);
810 ContextNode *getNodeForStackId(uint64_t StackId);
811
812 /// Computes the alloc type corresponding to the given context ids, by
813 /// unioning their recorded alloc types.
814 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
815
816 /// Returns the allocation type of the intersection of the contexts of two
817 /// nodes (based on their provided context id sets), optimized for the case
818 /// when Node1Ids is smaller than Node2Ids.
819 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
820 const DenseSet<uint32_t> &Node2Ids) const;
821
822 /// Returns the allocation type of the intersection of the contexts of two
823 /// nodes (based on their provided context id sets).
824 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
825 const DenseSet<uint32_t> &Node2Ids) const;
826
827 /// Create a clone of Edge's callee and move Edge to that new callee node,
828 /// performing the necessary context id and allocation type updates.
829 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
830 /// moved to an edge to the new callee.
831 ContextNode *
832 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
833 DenseSet<uint32_t> ContextIdsToMove = {});
834
835 /// Change the callee of Edge to existing callee clone NewCallee, performing
836 /// the necessary context id and allocation type updates.
837 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
838 /// moved to an edge to the new callee.
839 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
840 ContextNode *NewCallee,
841 bool NewClone = false,
842 DenseSet<uint32_t> ContextIdsToMove = {});
843
844 /// Change the caller of the edge at the given callee edge iterator to be
845 /// NewCaller, performing the necessary context id and allocation type
846 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
847 /// a simplified version of it as we always move the given edge and all of its
848 /// context ids.
849 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
850 ContextNode *NewCaller);
851
852 /// Recursive helper for marking backedges via DFS.
853 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
854 DenseSet<const ContextNode *> &CurrentStack);
855
856 /// Recursive helper for merging clones.
857 void
858 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
859 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
860 /// Main worker for merging callee clones for a given node.
861 void mergeNodeCalleeClones(
862 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
863 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
864 /// Helper to find other callers of the given set of callee edges that can
865 /// share the same callee merge node.
866 void findOtherCallersToShareMerge(
867 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
868 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
869 DenseSet<ContextNode *> &OtherCallersToShareMerge);
870
871 /// Recursively perform cloning on the graph for the given Node and its
872 /// callers, in order to uniquely identify the allocation behavior of an
873 /// allocation given its context. The context ids of the allocation being
874 /// processed are given in AllocContextIds.
875 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
876 const DenseSet<uint32_t> &AllocContextIds);
877
878 /// Map from each context ID to the AllocationType assigned to that context.
879 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
880
881 /// Map from each contextID to the profiled full contexts and their total
882 /// sizes (there may be more than one due to context trimming),
883 /// optionally populated when requested (via MemProfReportHintedSizes or
884 /// MinClonedColdBytePercent).
885 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
886
887 /// Identifies the context node created for a stack id when adding the MIB
888 /// contexts to the graph. This is used to locate the context nodes when
889 /// trying to assign the corresponding callsites with those stack ids to these
890 /// nodes.
891 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
892
893 /// Saves information for the contexts identified as important (the largest
894 /// cold contexts up to MemProfTopNImportant).
  /// Saves information for the contexts identified as important (the largest
  /// cold contexts up to MemProfTopNImportant).
  struct ImportantContextInfo {
    // The original list of leaf first stack ids corresponding to this context.
    std::vector<uint64_t> StackIds;
    // Max length of stack ids corresponding to a single stack ContextNode for
    // this context (i.e. the max length of a key in StackIdsToNode below).
    // Kept up to date by recordStackNode so later analysis can size its
    // sliding windows without rescanning the map.
    unsigned MaxLength = 0;
    // Mapping of slices of the stack ids to the corresponding ContextNode
    // (there can be multiple stack ids due to inlining). Populated when
    // updating stack nodes while matching them to the IR or summary.
    // std::map (ordered) so iteration over slices is deterministic.
    std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
  };
906
907 // Map of important full context ids to information about each.
908 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
909
910 // For each important context id found in Node (if any), records the list of
911 // stack ids that corresponded to the given callsite Node. There can be more
912 // than one in the case of inlining.
913 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
914 // We pass in the Node's context ids to avoid the
915 // overhead of computing them as the caller already has
916 // them in some cases.
917 const DenseSet<uint32_t> &NodeContextIds,
918 const DenseSet<uint32_t> &ImportantContextIds) {
919 if (!MemProfTopNImportant) {
920 assert(ImportantContextIds.empty());
921 return;
922 }
923 DenseSet<uint32_t> Ids =
924 set_intersection(S1: NodeContextIds, S2: ImportantContextIds);
925 if (Ids.empty())
926 return;
927 auto Size = StackIds.size();
928 for (auto Id : Ids) {
929 auto &Entry = ImportantContextIdInfo[Id];
930 Entry.StackIdsToNode[StackIds] = Node;
931 // Keep track of the max to simplify later analysis.
932 if (Size > Entry.MaxLength)
933 Entry.MaxLength = Size;
934 }
935 }
936
937 /// Maps to track the calls to their corresponding nodes in the graph.
938 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
939 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
940
941 /// Owner of all ContextNode unique_ptrs.
942 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
943
944 /// Perform sanity checks on graph when requested.
945 void check() const;
946
947 /// Keeps track of the last unique context id assigned.
948 unsigned int LastContextId = 0;
949};
950
/// Convenience alias templates so free functions below can name the nested
/// graph types without spelling out the full CallsiteContextGraph
/// instantiation each time.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
963
964/// CRTP derived class for graphs built from IR (regular LTO).
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  // The CRTP base invokes the private implementations below.
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  // Implementations of the CRTP interface declared in the base class; see
  // the corresponding base-class member documentation for contracts.
  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           DenseMap<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  // The module this graph was built from (not owned).
  const Module &Mod;
  // Callback used to obtain an ORE for emitting remarks on a given function.
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
1004
1005/// Represents a call in the summary index graph, which can either be an
1006/// allocation or an interior callsite node in an allocation's context.
1007/// Holds a pointer to the corresponding data structure in the index.
1008struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1009 IndexCall() : PointerUnion() {}
1010 IndexCall(std::nullptr_t) : IndexCall() {}
1011 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1012 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1013 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1014
1015 IndexCall *operator->() { return this; }
1016
1017 void print(raw_ostream &OS) const {
1018 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1019 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Val&: Base)) {
1020 OS << *AI;
1021 } else {
1022 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Val&: Base);
1023 assert(CI);
1024 OS << *CI;
1025 }
1026 }
1027};
1028} // namespace
1029
namespace llvm {
// Teach the llvm::cast/dyn_cast machinery to see through IndexCall to its
// underlying PointerUnion representation, so casts on IndexCall values work
// as if they were performed on the PointerUnion directly.
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm
1040
1041namespace {
1042/// CRTP derived class for graphs built from summary index (ThinLTO).
1043class IndexCallsiteContextGraph
1044 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1045 IndexCall> {
1046public:
1047 IndexCallsiteContextGraph(
1048 ModuleSummaryIndex &Index,
1049 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1050 isPrevailing);
1051
1052 ~IndexCallsiteContextGraph() {
1053 // Now that we are done with the graph it is safe to add the new
1054 // CallsiteInfo structs to the function summary vectors. The graph nodes
1055 // point into locations within these vectors, so we don't want to add them
1056 // any earlier.
1057 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1058 auto *FS = I.first;
1059 for (auto &Callsite : I.second)
1060 FS->addCallsite(Callsite: std::move(*Callsite.second));
1061 }
1062 }
1063
1064private:
1065 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1066 IndexCall>;
1067
1068 uint64_t getStackId(uint64_t IdOrIndex) const;
1069 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1070 bool calleeMatchesFunc(
1071 IndexCall &Call, const FunctionSummary *Func,
1072 const FunctionSummary *CallerFunc,
1073 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1074 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1075 bool findProfiledCalleeThroughTailCalls(
1076 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1077 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1078 bool &FoundMultipleCalleeChains);
1079 uint64_t getLastStackId(IndexCall &Call);
1080 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1081 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1082 AllocationType getAllocationCallType(const CallInfo &Call) const;
1083 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1084 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1085 IndexCall>::FuncInfo
1086 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1087 DenseMap<CallInfo, CallInfo> &CallMap,
1088 std::vector<CallInfo> &CallsWithMetadataInFunc,
1089 unsigned CloneNo);
1090 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1091 unsigned CloneNo) const;
1092 DenseSet<GlobalValue::GUID> findAliaseeGUIDsPrevailingInDifferentModule();
1093
1094 // Saves mapping from function summaries containing memprof records back to
1095 // its VI, for use in checking and debugging.
1096 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1097
1098 const ModuleSummaryIndex &Index;
1099 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1100 isPrevailing;
1101
1102 // Saves/owns the callsite info structures synthesized for missing tail call
1103 // frames that we discover while building the graph.
1104 // It maps from the summary of the function making the tail call, to a map
1105 // of callee ValueInfo to corresponding synthesized callsite info.
1106 std::unordered_map<FunctionSummary *,
1107 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1108 FunctionCalleesToSynthesizedCallsiteInfos;
1109};
1110} // namespace
1111
// DenseMapInfo specializations so CallInfo (represented as a pair of a call
// and a clone number) and IndexCall (a PointerUnion wrapper) can be used as
// DenseMap/DenseSet keys; each simply delegates to the DenseMapInfo of its
// underlying representation.
template <>
struct llvm::DenseMapInfo<CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct llvm::DenseMapInfo<CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct llvm::DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1123
1124namespace {
1125
1126// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1127// type we should actually use on the corresponding allocation.
1128// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1129// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1130// from NotCold.
1131AllocationType allocTypeToUse(uint8_t AllocTypes) {
1132 assert(AllocTypes != (uint8_t)AllocationType::None);
1133 if (AllocTypes ==
1134 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1135 return AllocationType::NotCold;
1136 else
1137 return (AllocationType)AllocTypes;
1138}
1139
1140// Helper to check if the alloc types for all edges recorded in the
1141// InAllocTypes vector match the alloc types for all edges in the Edges
1142// vector.
1143template <typename DerivedCCG, typename FuncTy, typename CallTy>
1144bool allocTypesMatch(
1145 const std::vector<uint8_t> &InAllocTypes,
1146 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1147 &Edges) {
1148 // This should be called only when the InAllocTypes vector was computed for
1149 // this set of Edges. Make sure the sizes are the same.
1150 assert(InAllocTypes.size() == Edges.size());
1151 return std::equal(
1152 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1153 [](const uint8_t &l,
1154 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1155 // Can share if one of the edges is None type - don't
1156 // care about the type along that edge as it doesn't
1157 // exist for those context ids.
1158 if (l == (uint8_t)AllocationType::None ||
1159 r->AllocTypes == (uint8_t)AllocationType::None)
1160 return true;
1161 return allocTypeToUse(AllocTypes: l) == allocTypeToUse(r->AllocTypes);
1162 });
1163}
1164
1165// Helper to check if the alloc types for all edges recorded in the
1166// InAllocTypes vector match the alloc types for callee edges in the given
1167// clone. Because the InAllocTypes were computed from the original node's callee
1168// edges, and other cloning could have happened after this clone was created, we
1169// need to find the matching clone callee edge, which may or may not exist.
1170template <typename DerivedCCG, typename FuncTy, typename CallTy>
1171bool allocTypesMatchClone(
1172 const std::vector<uint8_t> &InAllocTypes,
1173 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1174 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1175 assert(Node);
1176 // InAllocTypes should have been computed for the original node's callee
1177 // edges.
1178 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1179 // First create a map of the clone callee edge callees to the edge alloc type.
1180 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1181 EdgeCalleeMap;
1182 for (const auto &E : Clone->CalleeEdges) {
1183 assert(!EdgeCalleeMap.contains(E->Callee));
1184 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1185 }
1186 // Next, walk the original node's callees, and look for the corresponding
1187 // clone edge to that callee.
1188 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1189 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1190 // Not found is ok, we will simply add an edge if we use this clone.
1191 if (Iter == EdgeCalleeMap.end())
1192 continue;
1193 // Can share if one of the edges is None type - don't
1194 // care about the type along that edge as it doesn't
1195 // exist for those context ids.
1196 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1197 Iter->second == (uint8_t)AllocationType::None)
1198 continue;
1199 if (allocTypeToUse(Iter->second) != allocTypeToUse(AllocTypes: InAllocTypes[I]))
1200 return false;
1201 }
1202 return true;
1203}
1204
1205} // end anonymous namespace
1206
1207template <typename DerivedCCG, typename FuncTy, typename CallTy>
1208typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1209CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1210 const CallInfo &C) {
1211 ContextNode *Node = getNodeForAlloc(C);
1212 if (Node)
1213 return Node;
1214
1215 return NonAllocationCallToContextNodeMap.lookup(C);
1216}
1217
1218template <typename DerivedCCG, typename FuncTy, typename CallTy>
1219typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1220CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1221 const CallInfo &C) {
1222 return AllocationCallToContextNodeMap.lookup(C);
1223}
1224
1225template <typename DerivedCCG, typename FuncTy, typename CallTy>
1226typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1227CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1228 uint64_t StackId) {
1229 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1230 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1231 return StackEntryNode->second;
1232 return nullptr;
1233}
1234
1235template <typename DerivedCCG, typename FuncTy, typename CallTy>
1236void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1237 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1238 unsigned int ContextId) {
1239 for (auto &Edge : CallerEdges) {
1240 if (Edge->Caller == Caller) {
1241 Edge->AllocTypes |= (uint8_t)AllocType;
1242 Edge->getContextIds().insert(ContextId);
1243 return;
1244 }
1245 }
1246 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1247 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1248 CallerEdges.push_back(Edge);
1249 Caller->CalleeEdges.push_back(Edge);
1250}
1251
/// Remove Edge from the graph, unhooking it from both its Caller and Callee.
/// If EI is provided, the erase from that side's edge list goes through the
/// iterator (so the caller's loop iterator stays valid); CalleeIter selects
/// whether EI iterates the Caller's CalleeEdges (true) or the Callee's
/// CallerEdges (false).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  // Snapshot the edge-list sizes so the asserts below can verify that exactly
  // one entry was removed from each side (these are compiled away with the
  // asserts in NDEBUG builds).
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    // EI walks Caller->CalleeEdges: erase via the iterator there and advance
    // it; the other side is erased by pointer search.
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    // EI walks Callee->CallerEdges: mirror image of the case above.
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
1286
1287template <typename DerivedCCG, typename FuncTy, typename CallTy>
1288void CallsiteContextGraph<
1289 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1290 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1291 auto Edge = *EI;
1292 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1293 assert(Edge->ContextIds.empty());
1294 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);
1295 } else
1296 ++EI;
1297 }
1298}
1299
1300template <typename DerivedCCG, typename FuncTy, typename CallTy>
1301void CallsiteContextGraph<
1302 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1303 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1304 auto Edge = *EI;
1305 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1306 assert(Edge->ContextIds.empty());
1307 Edge->Caller->eraseCalleeEdge(Edge.get());
1308 EI = Node->CallerEdges.erase(EI);
1309 } else
1310 ++EI;
1311 }
1312}
1313
1314template <typename DerivedCCG, typename FuncTy, typename CallTy>
1315typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1316CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1317 findEdgeFromCallee(const ContextNode *Callee) {
1318 for (const auto &Edge : CalleeEdges)
1319 if (Edge->Callee == Callee)
1320 return Edge.get();
1321 return nullptr;
1322}
1323
1324template <typename DerivedCCG, typename FuncTy, typename CallTy>
1325typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1326CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1327 findEdgeFromCaller(const ContextNode *Caller) {
1328 for (const auto &Edge : CallerEdges)
1329 if (Edge->Caller == Caller)
1330 return Edge.get();
1331 return nullptr;
1332}
1333
1334template <typename DerivedCCG, typename FuncTy, typename CallTy>
1335void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1336 eraseCalleeEdge(const ContextEdge *Edge) {
1337 auto EI = llvm::find_if(
1338 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1339 return CalleeEdge.get() == Edge;
1340 });
1341 assert(EI != CalleeEdges.end());
1342 CalleeEdges.erase(EI);
1343}
1344
1345template <typename DerivedCCG, typename FuncTy, typename CallTy>
1346void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1347 eraseCallerEdge(const ContextEdge *Edge) {
1348 auto EI = llvm::find_if(
1349 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1350 return CallerEdge.get() == Edge;
1351 });
1352 assert(EI != CallerEdges.end());
1353 CallerEdges.erase(EI);
1354}
1355
1356template <typename DerivedCCG, typename FuncTy, typename CallTy>
1357uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1358 DenseSet<uint32_t> &ContextIds) const {
1359 uint8_t BothTypes =
1360 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1361 uint8_t AllocType = (uint8_t)AllocationType::None;
1362 for (auto Id : ContextIds) {
1363 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1364 // Bail early if alloc type reached both, no further refinement.
1365 if (AllocType == BothTypes)
1366 return AllocType;
1367 }
1368 return AllocType;
1369}
1370
1371template <typename DerivedCCG, typename FuncTy, typename CallTy>
1372uint8_t
1373CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1374 const DenseSet<uint32_t> &Node1Ids,
1375 const DenseSet<uint32_t> &Node2Ids) const {
1376 uint8_t BothTypes =
1377 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1378 uint8_t AllocType = (uint8_t)AllocationType::None;
1379 for (auto Id : Node1Ids) {
1380 if (!Node2Ids.count(V: Id))
1381 continue;
1382 AllocType |= (uint8_t)ContextIdToAllocationType.at(Val: Id);
1383 // Bail early if alloc type reached both, no further refinement.
1384 if (AllocType == BothTypes)
1385 return AllocType;
1386 }
1387 return AllocType;
1388}
1389
1390template <typename DerivedCCG, typename FuncTy, typename CallTy>
1391uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1392 const DenseSet<uint32_t> &Node1Ids,
1393 const DenseSet<uint32_t> &Node2Ids) const {
1394 if (Node1Ids.size() < Node2Ids.size())
1395 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1396 else
1397 return intersectAllocTypesImpl(Node1Ids: Node2Ids, Node2Ids: Node1Ids);
1398}
1399
1400template <typename DerivedCCG, typename FuncTy, typename CallTy>
1401typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1402CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1403 CallInfo Call, const FuncTy *F) {
1404 assert(!getNodeForAlloc(Call));
1405 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, C: Call);
1406 AllocationCallToContextNodeMap[Call] = AllocNode;
1407 // Use LastContextId as a uniq id for MIB allocation nodes.
1408 AllocNode->OrigStackOrAllocId = LastContextId;
1409 // Alloc type should be updated as we add in the MIBs. We should assert
1410 // afterwards that it is not still None.
1411 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1412
1413 return AllocNode;
1414}
1415
1416static std::string getAllocTypeString(uint8_t AllocTypes) {
1417 if (!AllocTypes)
1418 return "None";
1419 std::string Str;
1420 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1421 Str += "NotCold";
1422 if (AllocTypes & (uint8_t)AllocationType::Cold)
1423 Str += "Cold";
1424 return Str;
1425}
1426
1427template <typename DerivedCCG, typename FuncTy, typename CallTy>
1428template <class NodeT, class IteratorT>
1429void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1430 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1431 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1432 ArrayRef<ContextTotalSize> ContextSizeInfo,
1433 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1434 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1435 // is done.
1436 if (AllocType == AllocationType::Hot)
1437 AllocType = AllocationType::NotCold;
1438
1439 ContextIdToAllocationType[++LastContextId] = AllocType;
1440
1441 bool IsImportant = false;
1442 if (!ContextSizeInfo.empty()) {
1443 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1444 // If this is a cold allocation, and we are collecting non-zero largest
1445 // contexts, see if this is a candidate.
1446 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1447 uint64_t TotalCold = 0;
1448 for (auto &CSI : ContextSizeInfo)
1449 TotalCold += CSI.TotalSize;
1450 // Record this context if either we haven't found the first top-n largest
1451 // yet, or if it is larger than the smallest already recorded.
1452 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1453 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1454 // sorted in ascending size of its key which is the size.
1455 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1456 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1457 // Remove old one and its associated entries.
1458 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1459 TotalSizeToContextIdTopNCold.erase(
1460 position: TotalSizeToContextIdTopNCold.begin());
1461 assert(ImportantContextIdInfo.count(IdToRemove));
1462 ImportantContextIdInfo.erase(IdToRemove);
1463 }
1464 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1465 IsImportant = true;
1466 }
1467 }
1468 Entry.insert(position: Entry.begin(), first: ContextSizeInfo.begin(), last: ContextSizeInfo.end());
1469 }
1470
1471 // Update alloc type and context ids for this MIB.
1472 AllocNode->AllocTypes |= (uint8_t)AllocType;
1473
1474 // Now add or update nodes for each stack id in alloc's context.
1475 // Later when processing the stack ids on non-alloc callsites we will adjust
1476 // for any inlining in the context.
1477 ContextNode *PrevNode = AllocNode;
1478 // Look for recursion (direct recursion should have been collapsed by
1479 // module summary analysis, here we should just be detecting mutual
1480 // recursion). Mark these nodes so we don't try to clone.
1481 SmallSet<uint64_t, 8> StackIdSet;
1482 // Skip any on the allocation call (inlining).
1483 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1484 ContextIter != StackContext.end(); ++ContextIter) {
1485 auto StackId = getStackId(IdOrIndex: *ContextIter);
1486 if (IsImportant)
1487 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1488 ContextNode *StackNode = getNodeForStackId(StackId);
1489 if (!StackNode) {
1490 StackNode = createNewNode(/*IsAllocation=*/false);
1491 StackEntryIdToContextNodeMap[StackId] = StackNode;
1492 StackNode->OrigStackOrAllocId = StackId;
1493 }
1494 // Marking a node recursive will prevent its cloning completely, even for
1495 // non-recursive contexts flowing through it.
1496 if (!AllowRecursiveCallsites) {
1497 auto Ins = StackIdSet.insert(StackId);
1498 if (!Ins.second)
1499 StackNode->Recursive = true;
1500 }
1501 StackNode->AllocTypes |= (uint8_t)AllocType;
1502 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1503 PrevNode = StackNode;
1504 }
1505}
1506
1507template <typename DerivedCCG, typename FuncTy, typename CallTy>
1508DenseSet<uint32_t>
1509CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1510 const DenseSet<uint32_t> &StackSequenceContextIds,
1511 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1512 DenseSet<uint32_t> NewContextIds;
1513 for (auto OldId : StackSequenceContextIds) {
1514 NewContextIds.insert(V: ++LastContextId);
1515 OldToNewContextIds[OldId].insert(V: LastContextId);
1516 assert(ContextIdToAllocationType.count(OldId));
1517 // The new context has the same allocation type and size info as original.
1518 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1519 auto CSI = ContextIdToContextSizeInfos.find(Val: OldId);
1520 if (CSI != ContextIdToContextSizeInfos.end())
1521 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1522 if (DotAllocContextIds.contains(V: OldId))
1523 DotAllocContextIds.insert(V: LastContextId);
1524 }
1525 return NewContextIds;
1526}
1527
1528template <typename DerivedCCG, typename FuncTy, typename CallTy>
1529void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1530 propagateDuplicateContextIds(
1531 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1532 // Build a set of duplicated context ids corresponding to the input id set.
1533 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1534 DenseSet<uint32_t> NewIds;
1535 for (auto Id : ContextIds)
1536 if (auto NewId = OldToNewContextIds.find(Val: Id);
1537 NewId != OldToNewContextIds.end())
1538 NewIds.insert_range(R: NewId->second);
1539 return NewIds;
1540 };
1541
1542 // Recursively update context ids sets along caller edges.
1543 auto UpdateCallers = [&](ContextNode *Node,
1544 DenseSet<const ContextEdge *> &Visited,
1545 auto &&UpdateCallers) -> void {
1546 for (const auto &Edge : Node->CallerEdges) {
1547 auto Inserted = Visited.insert(Edge.get());
1548 if (!Inserted.second)
1549 continue;
1550 ContextNode *NextNode = Edge->Caller;
1551 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1552 // Only need to recursively iterate to NextNode via this caller edge if
1553 // it resulted in any added ids to NextNode.
1554 if (!NewIdsToAdd.empty()) {
1555 Edge->getContextIds().insert_range(NewIdsToAdd);
1556 UpdateCallers(NextNode, Visited, UpdateCallers);
1557 }
1558 }
1559 };
1560
1561 DenseSet<const ContextEdge *> Visited;
1562 for (auto &Entry : AllocationCallToContextNodeMap) {
1563 auto *Node = Entry.second;
1564 UpdateCallers(Node, Visited, UpdateCallers);
1565 }
1566}
1567
1568template <typename DerivedCCG, typename FuncTy, typename CallTy>
1569void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1570 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1571 // This must be passed by value to make a copy since it will be adjusted
1572 // as ids are moved.
1573 DenseSet<uint32_t> RemainingContextIds) {
1574 auto &OrigEdges =
1575 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1576 DenseSet<uint32_t> RecursiveContextIds;
1577 DenseSet<uint32_t> AllCallerContextIds;
1578 if (AllowRecursiveCallsites) {
1579 // Identify which context ids are recursive which is needed to properly
1580 // update the RemainingContextIds set. The relevant recursive context ids
1581 // are those that are in multiple edges.
1582 for (auto &CE : OrigEdges) {
1583 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
1584 for (auto Id : CE->getContextIds())
1585 if (!AllCallerContextIds.insert(Id).second)
1586 RecursiveContextIds.insert(Id);
1587 }
1588 }
1589 // Increment iterator in loop so that we can remove edges as needed.
1590 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1591 auto Edge = *EI;
1592 DenseSet<uint32_t> NewEdgeContextIds;
1593 DenseSet<uint32_t> NotFoundContextIds;
1594 // Remove any matching context ids from Edge, return set that were found and
1595 // removed, these are the new edge's context ids. Also update the remaining
1596 // (not found ids).
1597 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1598 NotFoundContextIds);
1599 // Update the remaining context ids set for the later edges. This is a
1600 // compile time optimization.
1601 if (RecursiveContextIds.empty()) {
1602 // No recursive ids, so all of the previously remaining context ids that
1603 // were not seen on this edge are the new remaining set.
1604 RemainingContextIds.swap(RHS&: NotFoundContextIds);
1605 } else {
1606 // Keep the recursive ids in the remaining set as we expect to see those
1607 // on another edge. We can remove the non-recursive remaining ids that
1608 // were seen on this edge, however. We already have the set of remaining
1609 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1610 // non-recursive and only remove those. Note that despite the higher
1611 // overhead of updating the remaining context ids set when recursion
1612 // handling is enabled, it was found to be at worst performance neutral
1613 // and in one case a clear win.
1614 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1615 set_difference(S1: NewEdgeContextIds, S2: RecursiveContextIds);
1616 set_subtract(S1&: RemainingContextIds, S2: NonRecursiveRemainingCurEdgeIds);
1617 }
1618 // If no matching context ids for this edge, skip it.
1619 if (NewEdgeContextIds.empty()) {
1620 ++EI;
1621 continue;
1622 }
1623 if (TowardsCallee) {
1624 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1625 auto NewEdge = std::make_shared<ContextEdge>(
1626 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1627 NewNode->CalleeEdges.push_back(NewEdge);
1628 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1629 } else {
1630 uint8_t NewAllocType = computeAllocType(ContextIds&: NewEdgeContextIds);
1631 auto NewEdge = std::make_shared<ContextEdge>(
1632 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1633 NewNode->CallerEdges.push_back(NewEdge);
1634 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1635 }
1636 // Remove old edge if context ids empty.
1637 if (Edge->getContextIds().empty()) {
1638 removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, CalleeIter: TowardsCallee);
1639 continue;
1640 }
1641 ++EI;
1642 }
1643}
1644
1645template <typename DerivedCCG, typename FuncTy, typename CallTy>
1646static void checkEdge(
1647 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1648 // Confirm that alloc type is not None and that we have at least one context
1649 // id.
1650 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1651 assert(!Edge->ContextIds.empty());
1652}
1653
// Verify the invariants of a single context graph node: its context ids must
// agree with the unions of its caller and callee edge context ids (modulo
// the documented recursion exceptions), and it must not have duplicate
// callee edges to the same node. If CheckEdges is set, each edge is also
// validated via checkEdge. Removed nodes are skipped. Only meaningful in
// assert-enabled builds.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    // Seed the union with the first caller edge's ids, then fold in the rest.
    DenseSet<uint32_t> CallerEdgeContextIds(
        Node->CallerEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer. If we are allowing recursive callsites and
    // contexts this will be violated for incompletely cloned recursive cycles,
    // so skip the checking in that case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, NodeContextIds));
  }
  if (Node->CalleeEdges.size()) {
    // Same union computation for callee edges; here the node's ids must match
    // the union exactly (contexts cannot start below the node).
    DenseSet<uint32_t> CalleeEdgeContextIds(
        Node->CalleeEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->getContextIds());
    }
    // If we are allowing recursive callsites and contexts this will be violated
    // for incompletely cloned recursive cycles, so skip the checking in that
    // case.
    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
           NodeContextIds == CalleeEdgeContextIds);
  }
  // FIXME: Since this checking is only invoked under an option, we should
  // change the error checking from using assert to something that will trigger
  // an error on a release build.
#ifndef NDEBUG
  // Make sure we don't end up with duplicate edges between the same caller and
  // callee.
  DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
  for (const auto &E : Node->CalleeEdges)
    NodeSet.insert(E->Callee);
  assert(NodeSet.size() == Node->CalleeEdges.size());
#endif
}
1707
1708template <typename DerivedCCG, typename FuncTy, typename CallTy>
1709void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1710 assignStackNodesPostOrder(ContextNode *Node,
1711 DenseSet<const ContextNode *> &Visited,
1712 DenseMap<uint64_t, std::vector<CallContextInfo>>
1713 &StackIdToMatchingCalls,
1714 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1715 const DenseSet<uint32_t> &ImportantContextIds) {
1716 auto Inserted = Visited.insert(Node);
1717 if (!Inserted.second)
1718 return;
1719 // Post order traversal. Iterate over a copy since we may add nodes and
1720 // therefore new callers during the recursive call, invalidating any
1721 // iterator over the original edge vector. We don't need to process these
1722 // new nodes as they were already processed on creation.
1723 auto CallerEdges = Node->CallerEdges;
1724 for (auto &Edge : CallerEdges) {
1725 // Skip any that have been removed during the recursion.
1726 if (Edge->isRemoved()) {
1727 assert(!is_contained(Node->CallerEdges, Edge));
1728 continue;
1729 }
1730 assignStackNodesPostOrder(Node: Edge->Caller, Visited, StackIdToMatchingCalls,
1731 CallToMatchingCall, ImportantContextIds);
1732 }
1733
1734 // If this node's stack id is in the map, update the graph to contain new
1735 // nodes representing any inlining at interior callsites. Note we move the
1736 // associated context ids over to the new nodes.
1737
1738 // Ignore this node if it is for an allocation or we didn't record any
1739 // stack id lists ending at it.
1740 if (Node->IsAllocation ||
1741 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1742 return;
1743
1744 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1745 // Handle the simple case first. A single call with a single stack id.
1746 // In this case there is no need to create any new context nodes, simply
1747 // assign the context node for stack id to this Call.
1748 if (Calls.size() == 1) {
1749 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1750 if (Ids.size() == 1) {
1751 assert(SavedContextIds.empty());
1752 // It should be this Node
1753 assert(Node == getNodeForStackId(Ids[0]));
1754 if (Node->Recursive)
1755 return;
1756 Node->setCall(Call);
1757 NonAllocationCallToContextNodeMap[Call] = Node;
1758 NodeToCallingFunc[Node] = Func;
1759 recordStackNode(StackIds&: Ids, Node, NodeContextIds: Node->getContextIds(), ImportantContextIds);
1760 return;
1761 }
1762 }
1763
1764#ifndef NDEBUG
1765 // Find the node for the last stack id, which should be the same
1766 // across all calls recorded for this id, and is this node's id.
1767 uint64_t LastId = Node->OrigStackOrAllocId;
1768 ContextNode *LastNode = getNodeForStackId(LastId);
1769 // We should only have kept stack ids that had nodes.
1770 assert(LastNode);
1771 assert(LastNode == Node);
1772#else
1773 ContextNode *LastNode = Node;
1774#endif
1775
1776 // Compute the last node's context ids once, as it is shared by all calls in
1777 // this entry.
1778 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1779
1780 [[maybe_unused]] bool PrevIterCreatedNode = false;
1781 bool CreatedNode = false;
1782 for (unsigned I = 0; I < Calls.size();
1783 I++, PrevIterCreatedNode = CreatedNode) {
1784 CreatedNode = false;
1785 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1786 // Skip any for which we didn't assign any ids, these don't get a node in
1787 // the graph.
1788 if (SavedContextIds.empty()) {
1789 // If this call has a matching call (located in the same function and
1790 // having the same stack ids), simply add it to the context node created
1791 // for its matching call earlier. These can be treated the same through
1792 // cloning and get updated at the same time.
1793 if (!CallToMatchingCall.contains(Call))
1794 continue;
1795 auto MatchingCall = CallToMatchingCall[Call];
1796 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1797 // This should only happen if we had a prior iteration, and it didn't
1798 // create a node because of the below recomputation of context ids
1799 // finding none remaining and continuing early.
1800 assert(I > 0 && !PrevIterCreatedNode);
1801 continue;
1802 }
1803 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1804 Call);
1805 continue;
1806 }
1807
1808 assert(LastId == Ids.back());
1809
1810 // Recompute the context ids for this stack id sequence (the
1811 // intersection of the context ids of the corresponding nodes).
1812 // Start with the ids we saved in the map for this call, which could be
1813 // duplicated context ids. We have to recompute as we might have overlap
1814 // overlap between the saved context ids for different last nodes, and
1815 // removed them already during the post order traversal.
1816 set_intersect(SavedContextIds, LastNodeContextIds);
1817 ContextNode *PrevNode = LastNode;
1818 bool Skip = false;
1819 // Iterate backwards through the stack Ids, starting after the last Id
1820 // in the list, which was handled once outside for all Calls.
1821 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1822 auto Id = *IdIter;
1823 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1824 // We should only have kept stack ids that had nodes and weren't
1825 // recursive.
1826 assert(CurNode);
1827 assert(!CurNode->Recursive);
1828
1829 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1830 if (!Edge) {
1831 Skip = true;
1832 break;
1833 }
1834 PrevNode = CurNode;
1835
1836 // Update the context ids, which is the intersection of the ids along
1837 // all edges in the sequence.
1838 set_intersect(SavedContextIds, Edge->getContextIds());
1839
1840 // If we now have no context ids for clone, skip this call.
1841 if (SavedContextIds.empty()) {
1842 Skip = true;
1843 break;
1844 }
1845 }
1846 if (Skip)
1847 continue;
1848
1849 // Create new context node.
1850 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: Call);
1851 NonAllocationCallToContextNodeMap[Call] = NewNode;
1852 CreatedNode = true;
1853 NewNode->AllocTypes = computeAllocType(ContextIds&: SavedContextIds);
1854
1855 ContextNode *FirstNode = getNodeForStackId(StackId: Ids[0]);
1856 assert(FirstNode);
1857
1858 // Connect to callees of innermost stack frame in inlined call chain.
1859 // This updates context ids for FirstNode's callee's to reflect those
1860 // moved to NewNode.
1861 connectNewNode(NewNode, OrigNode: FirstNode, /*TowardsCallee=*/true, RemainingContextIds: SavedContextIds);
1862
1863 // Connect to callers of outermost stack frame in inlined call chain.
1864 // This updates context ids for FirstNode's caller's to reflect those
1865 // moved to NewNode.
1866 connectNewNode(NewNode, OrigNode: LastNode, /*TowardsCallee=*/false, RemainingContextIds: SavedContextIds);
1867
1868 // Now we need to remove context ids from edges/nodes between First and
1869 // Last Node.
1870 PrevNode = nullptr;
1871 for (auto Id : Ids) {
1872 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1873 // We should only have kept stack ids that had nodes.
1874 assert(CurNode);
1875
1876 // Remove the context ids moved to NewNode from CurNode, and the
1877 // edge from the prior node.
1878 if (PrevNode) {
1879 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1880 // If the sequence contained recursion, we might have already removed
1881 // some edges during the connectNewNode calls above.
1882 if (!PrevEdge) {
1883 PrevNode = CurNode;
1884 continue;
1885 }
1886 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1887 if (PrevEdge->getContextIds().empty())
1888 removeEdgeFromGraph(Edge: PrevEdge);
1889 }
1890 // Since we update the edges from leaf to tail, only look at the callee
1891 // edges. This isn't an alloc node, so if there are no callee edges, the
1892 // alloc type is None.
1893 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1894 ? (uint8_t)AllocationType::None
1895 : CurNode->computeAllocType();
1896 PrevNode = CurNode;
1897 }
1898
1899 recordStackNode(StackIds&: Ids, Node: NewNode, NodeContextIds: SavedContextIds, ImportantContextIds);
1900
1901 if (VerifyNodes) {
1902 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1903 for (auto Id : Ids) {
1904 ContextNode *CurNode = getNodeForStackId(StackId: Id);
1905 // We should only have kept stack ids that had nodes.
1906 assert(CurNode);
1907 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1908 }
1909 }
1910 }
1911}
1912
1913template <typename DerivedCCG, typename FuncTy, typename CallTy>
1914void CallsiteContextGraph<DerivedCCG, FuncTy,
1915 CallTy>::fixupImportantContexts() {
1916 if (ImportantContextIdInfo.empty())
1917 return;
1918
1919 // Update statistics as we are done building this map at this point.
1920 NumImportantContextIds = ImportantContextIdInfo.size();
1921
1922 if (!MemProfFixupImportant)
1923 return;
1924
1925 if (ExportToDot)
1926 exportToDot(Label: "beforestackfixup");
1927
1928 // For each context we identified as important, walk through the saved context
1929 // stack ids in order from leaf upwards, and make sure all edges are correct.
1930 // These can be difficult to get right when updating the graph while mapping
1931 // nodes onto summary or IR, especially when there is recursion. In
1932 // particular, when we have created new nodes to reflect inlining, it is
1933 // sometimes impossible to know exactly how to update the edges in the face of
1934 // recursion, as we have lost the original ordering of the stack ids in the
1935 // contexts.
1936 // TODO: Consider only doing this if we detect the context has recursive
1937 // cycles.
1938 //
1939 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1940 // and let's say A was inlined into B, C, and D. The original graph will have
1941 // multiple recursive cycles through A. When we match the original context
1942 // nodes onto the IR or summary, we will merge {A B} into one context node,
1943 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1944 // above, we should end up with a non-cyclic set of edges like:
1945 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1946 // original ordering, we won't get the edges correct initially (it's
1947 // impossible without the original ordering). Here we do the fixup (add and
1948 // removing edges where necessary) for this context. In the
1949 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1950 // and map entries for {A B}, {A C}, {A D}, and {E}.
1951 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1952 if (Info.StackIdsToNode.empty())
1953 continue;
1954 bool Changed = false;
1955 ContextNode *PrevNode = nullptr;
1956 ContextNode *CurNode = nullptr;
1957 DenseSet<const ContextEdge *> VisitedEdges;
1958 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1959 // Try to identify what callsite ContextNode maps to which slice of the
1960 // context's ordered stack ids.
1961 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1962 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1963 // see if we recorded a context node for that sequence.
1964 auto Len = Info.MaxLength;
1965 auto LenToEnd = AllStackIds.size() - I;
1966 if (Len > LenToEnd)
1967 Len = LenToEnd;
1968 CurNode = nullptr;
1969 // Try to find a recorded context node starting with the longest length
1970 // recorded, and on down until we check for just a single stack node.
1971 for (; Len > 0; Len--) {
1972 // Get the slice of the original stack id sequence to check.
1973 auto CheckStackIds = AllStackIds.slice(I, Len);
1974 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1975 if (EntryIt == Info.StackIdsToNode.end())
1976 continue;
1977 CurNode = EntryIt->second;
1978 // Skip forward so we don't try to look for the ones we just matched.
1979 // We increment by Len - 1, because the outer for loop will increment I.
1980 I += Len - 1;
1981 break;
1982 }
1983 // Give up if we couldn't find a node. Since we need to clone from the
1984 // leaf allocation upwards, no sense in doing anymore fixup further up
1985 // the context if we couldn't match part of the original stack context
1986 // onto a callsite node.
1987 if (!CurNode)
1988 break;
1989 // No edges to fix up until we have a pair of nodes that should be
1990 // adjacent in the graph.
1991 if (!PrevNode)
1992 continue;
1993 // See if we already have a call edge from CurNode to PrevNode.
1994 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1995 if (CurEdge) {
1996 // We already have an edge. Make sure it contains this context id.
1997 if (CurEdge->getContextIds().insert(CurContextId).second) {
1998 NumFixupEdgeIdsInserted++;
1999 Changed = true;
2000 }
2001 } else {
2002 // No edge exists - add one.
2003 NumFixupEdgesAdded++;
2004 DenseSet<uint32_t> ContextIds({CurContextId});
2005 auto AllocType = computeAllocType(ContextIds);
2006 auto NewEdge = std::make_shared<ContextEdge>(
2007 PrevNode, CurNode, AllocType, std::move(ContextIds));
2008 PrevNode->CallerEdges.push_back(NewEdge);
2009 CurNode->CalleeEdges.push_back(NewEdge);
2010 // Save the new edge for the below handling.
2011 CurEdge = NewEdge.get();
2012 Changed = true;
2013 }
2014 VisitedEdges.insert(CurEdge);
2015 // Now remove this context id from any other caller edges calling
2016 // PrevNode.
2017 for (auto &Edge : PrevNode->CallerEdges) {
2018 // Skip the edge updating/created above and edges we have already
2019 // visited (due to recursion).
2020 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2021 Edge->getContextIds().erase(CurContextId);
2022 }
2023 }
2024 if (Changed)
2025 NumFixedContexts++;
2026 }
2027}
2028
2029template <typename DerivedCCG, typename FuncTy, typename CallTy>
2030void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2031 // Map of stack id to all calls with that as the last (outermost caller)
2032 // callsite id that has a context node (some might not due to pruning
2033 // performed during matching of the allocation profile contexts).
2034 // The CallContextInfo contains the Call and a list of its stack ids with
2035 // ContextNodes, the function containing Call, and the set of context ids
2036 // the analysis will eventually identify for use in any new node created
2037 // for that callsite.
2038 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2039 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2040 for (auto &Call : CallsWithMetadata) {
2041 // Ignore allocations, already handled.
2042 if (AllocationCallToContextNodeMap.count(Call))
2043 continue;
2044 auto StackIdsWithContextNodes =
2045 getStackIdsWithContextNodesForCall(Call: Call.call());
2046 // If there were no nodes created for MIBs on allocs (maybe this was in
2047 // the unambiguous part of the MIB stack that was pruned), ignore.
2048 if (StackIdsWithContextNodes.empty())
2049 continue;
2050 // Otherwise, record this Call along with the list of ids for the last
2051 // (outermost caller) stack id with a node.
2052 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2053 {Call.call(), StackIdsWithContextNodes, Func, {}});
2054 }
2055 }
2056
2057 // First make a pass through all stack ids that correspond to a call,
2058 // as identified in the above loop. Compute the context ids corresponding to
2059 // each of these calls when they correspond to multiple stack ids due to
2060 // due to inlining. Perform any duplication of context ids required when
2061 // there is more than one call with the same stack ids. Their (possibly newly
2062 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2063 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2064 // Save a map from each call to any that are found to match it. I.e. located
2065 // in the same function and have the same (possibly pruned) stack ids. We use
2066 // this to avoid creating extra graph nodes as they can be treated the same.
2067 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2068 for (auto &It : StackIdToMatchingCalls) {
2069 auto &Calls = It.getSecond();
2070 // Skip single calls with a single stack id. These don't need a new node.
2071 if (Calls.size() == 1) {
2072 auto &Ids = Calls[0].StackIds;
2073 if (Ids.size() == 1)
2074 continue;
2075 }
2076 // In order to do the best and maximal matching of inlined calls to context
2077 // node sequences we will sort the vectors of stack ids in descending order
2078 // of length, and within each length, lexicographically by stack id. The
2079 // latter is so that we can specially handle calls that have identical stack
2080 // id sequences (either due to cloning or artificially because of the MIB
2081 // context pruning). Those with the same Ids are then sorted by function to
2082 // facilitate efficiently mapping them to the same context node.
2083 // Because the functions are pointers, to ensure a stable sort first assign
2084 // each function pointer to its first index in the Calls array, and then use
2085 // that to sort by.
2086 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2087 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2088 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2089 llvm::stable_sort(
2090 Calls,
2091 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2092 return A.StackIds.size() > B.StackIds.size() ||
2093 (A.StackIds.size() == B.StackIds.size() &&
2094 (A.StackIds < B.StackIds ||
2095 (A.StackIds == B.StackIds &&
2096 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2097 });
2098
2099 // Find the node for the last stack id, which should be the same
2100 // across all calls recorded for this id, and is the id for this
2101 // entry in the StackIdToMatchingCalls map.
2102 uint64_t LastId = It.getFirst();
2103 ContextNode *LastNode = getNodeForStackId(StackId: LastId);
2104 // We should only have kept stack ids that had nodes.
2105 assert(LastNode);
2106
2107 if (LastNode->Recursive)
2108 continue;
2109
2110 // Initialize the context ids with the last node's. We will subsequently
2111 // refine the context ids by computing the intersection along all edges.
2112 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2113 assert(!LastNodeContextIds.empty());
2114
2115#ifndef NDEBUG
2116 // Save the set of functions seen for a particular set of the same stack
2117 // ids. This is used to ensure that they have been correctly sorted to be
2118 // adjacent in the Calls list, since we rely on that to efficiently place
2119 // all such matching calls onto the same context node.
2120 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2121#endif
2122
2123 for (unsigned I = 0; I < Calls.size(); I++) {
2124 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2125 assert(SavedContextIds.empty());
2126 assert(LastId == Ids.back());
2127
2128#ifndef NDEBUG
2129 // If this call has a different set of ids than the last one, clear the
2130 // set used to ensure they are sorted properly.
2131 if (I > 0 && Ids != Calls[I - 1].StackIds)
2132 MatchingIdsFuncSet.clear();
2133#endif
2134
2135 // First compute the context ids for this stack id sequence (the
2136 // intersection of the context ids of the corresponding nodes).
2137 // Start with the remaining saved ids for the last node.
2138 assert(!LastNodeContextIds.empty());
2139 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2140
2141 ContextNode *PrevNode = LastNode;
2142 ContextNode *CurNode = LastNode;
2143 bool Skip = false;
2144
2145 // Iterate backwards through the stack Ids, starting after the last Id
2146 // in the list, which was handled once outside for all Calls.
2147 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2148 auto Id = *IdIter;
2149 CurNode = getNodeForStackId(StackId: Id);
2150 // We should only have kept stack ids that had nodes.
2151 assert(CurNode);
2152
2153 if (CurNode->Recursive) {
2154 Skip = true;
2155 break;
2156 }
2157
2158 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2159 // If there is no edge then the nodes belong to different MIB contexts,
2160 // and we should skip this inlined context sequence. For example, this
2161 // particular inlined context may include stack ids A->B, and we may
2162 // indeed have nodes for both A and B, but it is possible that they were
2163 // never profiled in sequence in a single MIB for any allocation (i.e.
2164 // we might have profiled an allocation that involves the callsite A,
2165 // but through a different one of its callee callsites, and we might
2166 // have profiled an allocation that involves callsite B, but reached
2167 // from a different caller callsite).
2168 if (!Edge) {
2169 Skip = true;
2170 break;
2171 }
2172 PrevNode = CurNode;
2173
2174 // Update the context ids, which is the intersection of the ids along
2175 // all edges in the sequence.
2176 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2177
2178 // If we now have no context ids for clone, skip this call.
2179 if (StackSequenceContextIds.empty()) {
2180 Skip = true;
2181 break;
2182 }
2183 }
2184 if (Skip)
2185 continue;
2186
2187 // If some of this call's stack ids did not have corresponding nodes (due
2188 // to pruning), don't include any context ids for contexts that extend
2189 // beyond these nodes. Otherwise we would be matching part of unrelated /
2190 // not fully matching stack contexts. To do this, subtract any context ids
2191 // found in caller nodes of the last node found above.
2192 if (Ids.back() != getLastStackId(Call)) {
2193 for (const auto &PE : LastNode->CallerEdges) {
2194 set_subtract(StackSequenceContextIds, PE->getContextIds());
2195 if (StackSequenceContextIds.empty())
2196 break;
2197 }
2198 // If we now have no context ids for clone, skip this call.
2199 if (StackSequenceContextIds.empty())
2200 continue;
2201 }
2202
2203#ifndef NDEBUG
2204 // If the prior call had the same stack ids this set would not be empty.
2205 // Check if we already have a call that "matches" because it is located
2206 // in the same function. If the Calls list was sorted properly we should
2207 // not encounter this situation as all such entries should be adjacent
2208 // and processed in bulk further below.
2209 assert(!MatchingIdsFuncSet.contains(Func));
2210
2211 MatchingIdsFuncSet.insert(Func);
2212#endif
2213
2214 // Check if the next set of stack ids is the same (since the Calls vector
2215 // of tuples is sorted by the stack ids we can just look at the next one).
2216 // If so, save them in the CallToMatchingCall map so that they get
2217 // assigned to the same context node, and skip them.
2218 bool DuplicateContextIds = false;
2219 for (unsigned J = I + 1; J < Calls.size(); J++) {
2220 auto &CallCtxInfo = Calls[J];
2221 auto &NextIds = CallCtxInfo.StackIds;
2222 if (NextIds != Ids)
2223 break;
2224 auto *NextFunc = CallCtxInfo.Func;
2225 if (NextFunc != Func) {
2226 // We have another Call with the same ids but that cannot share this
2227 // node, must duplicate ids for it.
2228 DuplicateContextIds = true;
2229 break;
2230 }
2231 auto &NextCall = CallCtxInfo.Call;
2232 CallToMatchingCall[NextCall] = Call;
2233 // Update I so that it gets incremented correctly to skip this call.
2234 I = J;
2235 }
2236
2237 // If we don't have duplicate context ids, then we can assign all the
2238 // context ids computed for the original node sequence to this call.
2239 // If there are duplicate calls with the same stack ids then we synthesize
2240 // new context ids that are duplicates of the originals. These are
2241 // assigned to SavedContextIds, which is a reference into the map entry
2242 // for this call, allowing us to access these ids later on.
2243 OldToNewContextIds.reserve(NumEntries: OldToNewContextIds.size() +
2244 StackSequenceContextIds.size());
2245 SavedContextIds =
2246 DuplicateContextIds
2247 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2248 : StackSequenceContextIds;
2249 assert(!SavedContextIds.empty());
2250
2251 if (!DuplicateContextIds) {
2252 // Update saved last node's context ids to remove those that are
2253 // assigned to other calls, so that it is ready for the next call at
2254 // this stack id.
2255 set_subtract(S1&: LastNodeContextIds, S2: StackSequenceContextIds);
2256 if (LastNodeContextIds.empty())
2257 break;
2258 }
2259 }
2260 }
2261
2262 // Propagate the duplicate context ids over the graph.
2263 propagateDuplicateContextIds(OldToNewContextIds);
2264
2265 if (VerifyCCG)
2266 check();
2267
2268 // Now perform a post-order traversal over the graph, starting with the
2269 // allocation nodes, essentially processing nodes from callers to callees.
2270 // For any that contains an id in the map, update the graph to contain new
2271 // nodes representing any inlining at interior callsites. Note we move the
2272 // associated context ids over to the new nodes.
2273 DenseSet<const ContextNode *> Visited;
2274 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2275 ImportantContextIdInfo.keys());
2276 for (auto &Entry : AllocationCallToContextNodeMap)
2277 assignStackNodesPostOrder(Node: Entry.second, Visited, StackIdToMatchingCalls,
2278 CallToMatchingCall, ImportantContextIds);
2279
2280 fixupImportantContexts();
2281
2282 if (VerifyCCG)
2283 check();
2284}
2285
2286uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2287 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2288 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2289 return CallsiteContext.back();
2290}
2291
2292uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2293 assert(isa<CallsiteInfo *>(Call));
2294 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2295 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2296 // Need to convert index into stack id.
2297 return Index.getStackIdAtIndex(Index: CallsiteContext.back());
2298}
2299
// Suffix inserted into the names of function clones created by this pass;
// followed by the clone number (see getMemProfFuncName).
static const std::string MemProfCloneSuffix = ".memprof.";
2301
2302static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2303 // We use CloneNo == 0 to refer to the original version, which doesn't get
2304 // renamed with a suffix.
2305 if (!CloneNo)
2306 return Base.str();
2307 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2308}
2309
2310static bool isMemProfClone(const Function &F) {
2311 return F.getName().contains(Other: MemProfCloneSuffix);
2312}
2313
2314// Return the clone number of the given function by extracting it from the
2315// memprof suffix. Assumes the caller has already confirmed it is a memprof
2316// clone.
2317static unsigned getMemProfCloneNum(const Function &F) {
2318 assert(isMemProfClone(F));
2319 auto Pos = F.getName().find_last_of(C: '.');
2320 assert(Pos > 0);
2321 unsigned CloneNo;
2322 bool Err = F.getName().drop_front(N: Pos + 1).getAsInteger(Radix: 10, Result&: CloneNo);
2323 assert(!Err);
2324 (void)Err;
2325 return CloneNo;
2326}
2327
2328std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2329 const Instruction *Call,
2330 unsigned CloneNo) const {
2331 return (Twine(Call->getFunction()->getName()) + " -> " +
2332 cast<CallBase>(Val: Call)->getCalledFunction()->getName())
2333 .str();
2334}
2335
2336std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2337 const IndexCall &Call,
2338 unsigned CloneNo) const {
2339 auto VI = FSToVIMap.find(x: Func);
2340 assert(VI != FSToVIMap.end());
2341 std::string CallerName = getMemProfFuncName(Base: VI->second.name(), CloneNo);
2342 if (isa<AllocInfo *>(Val: Call))
2343 return CallerName + " -> alloc";
2344 else {
2345 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Val: Call);
2346 return CallerName + " -> " +
2347 getMemProfFuncName(Base: Callsite->Callee.name(),
2348 CloneNo: Callsite->Clones[CloneNo]);
2349 }
2350}
2351
2352std::vector<uint64_t>
2353ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2354 Instruction *Call) {
2355 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2356 Call->getMetadata(KindID: LLVMContext::MD_callsite));
2357 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2358 CallsiteContext);
2359}
2360
2361std::vector<uint64_t>
2362IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2363 assert(isa<CallsiteInfo *>(Call));
2364 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2365 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Val&: Call));
2366 return getStackIdsWithContextNodes<CallsiteInfo,
2367 SmallVector<unsigned>::const_iterator>(
2368 CallsiteContext);
2369}
2370
2371template <typename DerivedCCG, typename FuncTy, typename CallTy>
2372template <class NodeT, class IteratorT>
2373std::vector<uint64_t>
2374CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2375 CallStack<NodeT, IteratorT> &CallsiteContext) {
2376 std::vector<uint64_t> StackIds;
2377 for (auto IdOrIndex : CallsiteContext) {
2378 auto StackId = getStackId(IdOrIndex);
2379 ContextNode *Node = getNodeForStackId(StackId);
2380 if (!Node)
2381 break;
2382 StackIds.push_back(StackId);
2383 }
2384 return StackIds;
2385}
2386
2387ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2388 Module &M,
2389 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2390 : Mod(M), OREGetter(OREGetter) {
2391 // Map for keeping track of the largest cold contexts up to the number given
2392 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2393 // must be sorted.
2394 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2395 for (auto &F : M) {
2396 std::vector<CallInfo> CallsWithMetadata;
2397 for (auto &BB : F) {
2398 for (auto &I : BB) {
2399 if (!isa<CallBase>(Val: I))
2400 continue;
2401 if (auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof)) {
2402 CallsWithMetadata.push_back(x: &I);
2403 auto *AllocNode = addAllocNode(Call: &I, F: &F);
2404 auto *CallsiteMD = I.getMetadata(KindID: LLVMContext::MD_callsite);
2405 assert(CallsiteMD);
2406 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2407 // Add all of the MIBs and their stack nodes.
2408 for (auto &MDOp : MemProfMD->operands()) {
2409 auto *MIBMD = cast<const MDNode>(Val: MDOp);
2410 std::vector<ContextTotalSize> ContextSizeInfo;
2411 // Collect the context size information if it exists.
2412 if (MIBMD->getNumOperands() > 2) {
2413 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2414 MDNode *ContextSizePair =
2415 dyn_cast<MDNode>(Val: MIBMD->getOperand(I));
2416 assert(ContextSizePair->getNumOperands() == 2);
2417 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2418 MD: ContextSizePair->getOperand(I: 0))
2419 ->getZExtValue();
2420 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2421 MD: ContextSizePair->getOperand(I: 1))
2422 ->getZExtValue();
2423 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2424 }
2425 }
2426 MDNode *StackNode = getMIBStackNode(MIB: MIBMD);
2427 assert(StackNode);
2428 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2429 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2430 AllocNode, StackContext, CallsiteContext,
2431 AllocType: getMIBAllocType(MIB: MIBMD), ContextSizeInfo,
2432 TotalSizeToContextIdTopNCold);
2433 }
2434 // If exporting the graph to dot and an allocation id of interest was
2435 // specified, record all the context ids for this allocation node.
2436 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2437 DotAllocContextIds = AllocNode->getContextIds();
2438 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2439 // Memprof and callsite metadata on memory allocations no longer
2440 // needed.
2441 I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
2442 I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2443 }
2444 // For callsite metadata, add to list for this function for later use.
2445 else if (I.getMetadata(KindID: LLVMContext::MD_callsite)) {
2446 CallsWithMetadata.push_back(x: &I);
2447 }
2448 }
2449 }
2450 if (!CallsWithMetadata.empty())
2451 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2452 }
2453
2454 if (DumpCCG) {
2455 dbgs() << "CCG before updating call stack chains:\n";
2456 dbgs() << *this;
2457 }
2458
2459 if (ExportToDot)
2460 exportToDot(Label: "prestackupdate");
2461
2462 updateStackNodes();
2463
2464 if (ExportToDot)
2465 exportToDot(Label: "poststackupdate");
2466
2467 handleCallsitesWithMultipleTargets();
2468
2469 markBackedges();
2470
2471 // Strip off remaining callsite metadata, no longer needed.
2472 for (auto &FuncEntry : FuncToCallsWithMetadata)
2473 for (auto &Call : FuncEntry.second)
2474 Call.call()->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
2475}
2476
2477// Finds the set of GUIDs for weak aliasees that are prevailing in different
2478// modules than any of their aliases. We need to handle these specially.
2479DenseSet<GlobalValue::GUID>
2480IndexCallsiteContextGraph::findAliaseeGUIDsPrevailingInDifferentModule() {
2481 DenseSet<GlobalValue::GUID> AliaseeGUIDs;
2482 for (auto &I : Index) {
2483 auto VI = Index.getValueInfo(R: I);
2484 for (auto &S : VI.getSummaryList()) {
2485 // We only care about aliases to functions.
2486 auto *AS = dyn_cast<AliasSummary>(Val: S.get());
2487 if (!AS)
2488 continue;
2489 auto *AliaseeSummary = &AS->getAliasee();
2490 auto *AliaseeFS = dyn_cast<FunctionSummary>(Val: AliaseeSummary);
2491 if (!AliaseeFS)
2492 continue;
2493 // Skip this summary if it is not for the prevailing symbol for this GUID.
2494 // The linker doesn't resolve local linkage values so don't check whether
2495 // those are prevailing.
2496 if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
2497 !isPrevailing(VI.getGUID(), S.get()))
2498 continue;
2499 // Prevailing aliasee could be in a different module only if it is weak.
2500 if (!GlobalValue::isWeakForLinker(Linkage: AliaseeSummary->linkage()))
2501 continue;
2502 auto AliaseeGUID = AS->getAliaseeGUID();
2503 // If the aliasee copy in this module is not prevailing, record it.
2504 if (!isPrevailing(AliaseeGUID, AliaseeSummary))
2505 AliaseeGUIDs.insert(V: AliaseeGUID);
2506 }
2507 }
2508 AliaseesPrevailingInDiffModuleFromAlias += AliaseeGUIDs.size();
2509 return AliaseeGUIDs;
2510}
2511
2512IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2513 ModuleSummaryIndex &Index,
2514 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2515 isPrevailing)
2516 : Index(Index), isPrevailing(isPrevailing) {
2517 // Since we use the aliasee summary info to create the necessary clones for
2518 // its aliases, conservatively skip recording the aliasee function's callsites
2519 // in the CCG for any that are prevailing in a different module than one of
2520 // its aliases. We could record the necessary information to do this in the
2521 // summary, but this case should not be common.
2522 DenseSet<GlobalValue::GUID> GUIDsToSkip =
2523 findAliaseeGUIDsPrevailingInDifferentModule();
2524 // Map for keeping track of the largest cold contexts up to the number given
2525 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2526 // must be sorted.
2527 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2528 for (auto &I : Index) {
2529 auto VI = Index.getValueInfo(R: I);
2530 if (GUIDsToSkip.contains(V: VI.getGUID()))
2531 continue;
2532 for (auto &S : VI.getSummaryList()) {
2533 // We should only add the prevailing nodes. Otherwise we may try to clone
2534 // in a weak copy that won't be linked (and may be different than the
2535 // prevailing version).
2536 // We only keep the memprof summary on the prevailing copy now when
2537 // building the combined index, as a space optimization, however don't
2538 // rely on this optimization. The linker doesn't resolve local linkage
2539 // values so don't check whether those are prevailing.
2540 if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
2541 !isPrevailing(VI.getGUID(), S.get()))
2542 continue;
2543 auto *FS = dyn_cast<FunctionSummary>(Val: S.get());
2544 if (!FS)
2545 continue;
2546 std::vector<CallInfo> CallsWithMetadata;
2547 if (!FS->allocs().empty()) {
2548 for (auto &AN : FS->mutableAllocs()) {
2549 // This can happen because of recursion elimination handling that
2550 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2551 // We still added them to the summary because we need to be able to
2552 // correlate properly in applyImport in the backends.
2553 if (AN.MIBs.empty())
2554 continue;
2555 IndexCall AllocCall(&AN);
2556 CallsWithMetadata.push_back(x: AllocCall);
2557 auto *AllocNode = addAllocNode(Call: AllocCall, F: FS);
2558 // Pass an empty CallStack to the CallsiteContext (second)
2559 // parameter, since for ThinLTO we already collapsed out the inlined
2560 // stack ids on the allocation call during ModuleSummaryAnalysis.
2561 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2562 EmptyContext;
2563 unsigned I = 0;
2564 assert(!metadataMayIncludeContextSizeInfo() ||
2565 AN.ContextSizeInfos.size() == AN.MIBs.size());
2566 // Now add all of the MIBs and their stack nodes.
2567 for (auto &MIB : AN.MIBs) {
2568 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2569 StackContext(&MIB);
2570 std::vector<ContextTotalSize> ContextSizeInfo;
2571 if (!AN.ContextSizeInfos.empty()) {
2572 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2573 ContextSizeInfo.push_back(x: {.FullStackId: FullStackId, .TotalSize: TotalSize});
2574 }
2575 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2576 AllocNode, StackContext, CallsiteContext&: EmptyContext, AllocType: MIB.AllocType,
2577 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2578 I++;
2579 }
2580 // If exporting the graph to dot and an allocation id of interest was
2581 // specified, record all the context ids for this allocation node.
2582 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2583 DotAllocContextIds = AllocNode->getContextIds();
2584 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2585 // Initialize version 0 on the summary alloc node to the current alloc
2586 // type, unless it has both types in which case make it default, so
2587 // that in the case where we aren't able to clone the original version
2588 // always ends up with the default allocation behavior.
2589 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocTypes: AllocNode->AllocTypes);
2590 }
2591 }
2592 // For callsite metadata, add to list for this function for later use.
2593 if (!FS->callsites().empty())
2594 for (auto &SN : FS->mutableCallsites()) {
2595 IndexCall StackNodeCall(&SN);
2596 CallsWithMetadata.push_back(x: StackNodeCall);
2597 }
2598
2599 if (!CallsWithMetadata.empty())
2600 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2601
2602 if (!FS->allocs().empty() || !FS->callsites().empty())
2603 FSToVIMap[FS] = VI;
2604 }
2605 }
2606
2607 if (DumpCCG) {
2608 dbgs() << "CCG before updating call stack chains:\n";
2609 dbgs() << *this;
2610 }
2611
2612 if (ExportToDot)
2613 exportToDot(Label: "prestackupdate");
2614
2615 updateStackNodes();
2616
2617 if (ExportToDot)
2618 exportToDot(Label: "poststackupdate");
2619
2620 handleCallsitesWithMultipleTargets();
2621
2622 markBackedges();
2623}
2624
2625template <typename DerivedCCG, typename FuncTy, typename CallTy>
2626void CallsiteContextGraph<DerivedCCG, FuncTy,
2627 CallTy>::handleCallsitesWithMultipleTargets() {
2628 // Look for and workaround callsites that call multiple functions.
2629 // This can happen for indirect calls, which needs better handling, and in
2630 // more rare cases (e.g. macro expansion).
2631 // TODO: To fix this for indirect calls we will want to perform speculative
2632 // devirtualization using either the normal PGO info with ICP, or using the
2633 // information in the profiled MemProf contexts. We can do this prior to
2634 // this transformation for regular LTO, and for ThinLTO we can simulate that
2635 // effect in the summary and perform the actual speculative devirtualization
2636 // while cloning in the ThinLTO backend.
2637
2638 // Keep track of the new nodes synthesized for discovered tail calls missing
2639 // from the profiled contexts.
2640 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2641
2642 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2643 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2644 auto *Node = Entry.second;
2645 assert(Node->Clones.empty());
2646 // Check all node callees and see if in the same function.
2647 // We need to check all of the calls recorded in this Node, because in some
2648 // cases we may have had multiple calls with the same debug info calling
2649 // different callees. This can happen, for example, when an object is
2650 // constructed in the paramter list - the destructor call of the object has
2651 // the same debug info (line/col) as the call the object was passed to.
2652 // Here we will prune any that don't match all callee nodes.
2653 std::vector<CallInfo> AllCalls;
2654 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2655 AllCalls.push_back(Node->Call);
2656 llvm::append_range(AllCalls, Node->MatchingCalls);
2657
2658 // First see if we can partition the calls by callee function, creating new
2659 // nodes to host each set of calls calling the same callees. This is
2660 // necessary for support indirect calls with ThinLTO, for which we
2661 // synthesized CallsiteInfo records for each target. They will all have the
2662 // same callsite stack ids and would be sharing a context node at this
2663 // point. We need to perform separate cloning for each, which will be
2664 // applied along with speculative devirtualization in the ThinLTO backends
2665 // as needed. Note this does not currently support looking through tail
2666 // calls, it is unclear if we need that for indirect call targets.
2667 // First partition calls by callee func. Map indexed by func, value is
2668 // struct with list of matching calls, assigned node.
2669 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2670 continue;
2671
2672 auto It = AllCalls.begin();
2673 // Iterate through the calls until we find the first that matches.
2674 for (; It != AllCalls.end(); ++It) {
2675 auto ThisCall = *It;
2676 bool Match = true;
2677 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2678 ++EI) {
2679 auto Edge = *EI;
2680 if (!Edge->Callee->hasCall())
2681 continue;
2682 assert(NodeToCallingFunc.count(Edge->Callee));
2683 // Check if the called function matches that of the callee node.
2684 if (!calleesMatch(Call: ThisCall.call(), EI, TailCallToContextNodeMap)) {
2685 Match = false;
2686 break;
2687 }
2688 }
2689 // Found a call that matches the callee nodes, we can quit now.
2690 if (Match) {
2691 // If the first match is not the primary call on the Node, update it
2692 // now. We will update the list of matching calls further below.
2693 if (Node->Call != ThisCall) {
2694 Node->setCall(ThisCall);
2695 // We need to update the NonAllocationCallToContextNodeMap, but don't
2696 // want to do this during iteration over that map, so save the calls
2697 // that need updated entries.
2698 NewCallToNode.push_back({ThisCall, Node});
2699 }
2700 break;
2701 }
2702 }
2703 // We will update this list below (or leave it cleared if there was no
2704 // match found above).
2705 Node->MatchingCalls.clear();
2706 // If we hit the end of the AllCalls vector, no call matching the callee
2707 // nodes was found, clear the call information in the node.
2708 if (It == AllCalls.end()) {
2709 RemovedEdgesWithMismatchedCallees++;
2710 // Work around by setting Node to have a null call, so it gets
2711 // skipped during cloning. Otherwise assignFunctions will assert
2712 // because its data structures are not designed to handle this case.
2713 Node->setCall(CallInfo());
2714 continue;
2715 }
2716 // Now add back any matching calls that call the same function as the
2717 // matching primary call on Node.
2718 for (++It; It != AllCalls.end(); ++It) {
2719 auto ThisCall = *It;
2720 if (!sameCallee(Call1: Node->Call.call(), Call2: ThisCall.call()))
2721 continue;
2722 Node->MatchingCalls.push_back(ThisCall);
2723 }
2724 }
2725
2726 // Remove all mismatched nodes identified in the above loop from the node map
2727 // (checking whether they have a null call which is set above). For a
2728 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2729 // to do the removal via remove_if than by individually erasing entries above.
2730 // Also remove any entries if we updated the node's primary call above.
2731 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2732 return !it.second->hasCall() || it.second->Call != it.first;
2733 });
2734
2735 // Add entries for any new primary calls recorded above.
2736 for (auto &[Call, Node] : NewCallToNode)
2737 NonAllocationCallToContextNodeMap[Call] = Node;
2738
2739 // Add the new nodes after the above loop so that the iteration is not
2740 // invalidated.
2741 for (auto &[Call, Node] : TailCallToContextNodeMap)
2742 NonAllocationCallToContextNodeMap[Call] = Node;
2743}
2744
2745template <typename DerivedCCG, typename FuncTy, typename CallTy>
2746bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2747 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2748 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2749 // Struct to keep track of all the calls having the same callee function,
2750 // and the node we eventually assign to them. Eventually we will record the
2751 // context node assigned to this group of calls.
2752 struct CallsWithSameCallee {
2753 std::vector<CallInfo> Calls;
2754 ContextNode *Node = nullptr;
2755 };
2756
2757 // First partition calls by callee function. Build map from each function
2758 // to the list of matching calls.
2759 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2760 for (auto ThisCall : AllCalls) {
2761 auto *F = getCalleeFunc(Call: ThisCall.call());
2762 if (F)
2763 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2764 }
2765
2766 // Next, walk through all callee edges. For each callee node, get its
2767 // containing function and see if it was recorded in the above map (meaning we
2768 // have at least one matching call). Build another map from each callee node
2769 // with a matching call to the structure instance created above containing all
2770 // the calls.
2771 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2772 for (const auto &Edge : Node->CalleeEdges) {
2773 if (!Edge->Callee->hasCall())
2774 continue;
2775 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2776 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2777 CalleeNodeToCallInfo[Edge->Callee] =
2778 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2779 }
2780
2781 // If there are entries in the second map, then there were no matching
2782 // calls/callees, nothing to do here. Return so we can go to the handling that
2783 // looks through tail calls.
2784 if (CalleeNodeToCallInfo.empty())
2785 return false;
2786
2787 // Walk through all callee edges again. Any and all callee edges that didn't
2788 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2789 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2790 // ignored during cloning. If it is in the map, then we use the node recorded
2791 // in that entry (creating it if needed), and move the callee edge to it.
2792 // The first callee will use the original node instead of creating a new one.
2793 // Note that any of the original calls on this node (in AllCalls) that didn't
2794 // have a callee function automatically get dropped from the node as part of
2795 // this process.
2796 ContextNode *UnmatchedCalleesNode = nullptr;
2797 // Track whether we already assigned original node to a callee.
2798 bool UsedOrigNode = false;
2799 assert(NodeToCallingFunc[Node]);
2800 // Iterate over a copy of Node's callee edges, since we may need to remove
2801 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2802 // makes it less error-prone.
2803 auto CalleeEdges = Node->CalleeEdges;
2804 for (auto &Edge : CalleeEdges) {
2805 if (!Edge->Callee->hasCall())
2806 continue;
2807
2808 // Will be updated below to point to whatever (caller) node this callee edge
2809 // should be moved to.
2810 ContextNode *CallerNodeToUse = nullptr;
2811
2812 // Handle the case where there were no matching calls first. Move this
2813 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2814 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2815 if (!UnmatchedCalleesNode)
2816 UnmatchedCalleesNode =
2817 createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
2818 CallerNodeToUse = UnmatchedCalleesNode;
2819 } else {
2820 // Look up the information recorded for this callee node, and use the
2821 // recorded caller node (creating it if needed).
2822 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2823 if (!Info->Node) {
2824 // If we haven't assigned any callees to the original node use it.
2825 if (!UsedOrigNode) {
2826 Info->Node = Node;
2827 // Clear the set of matching calls which will be updated below.
2828 Node->MatchingCalls.clear();
2829 UsedOrigNode = true;
2830 } else
2831 Info->Node =
2832 createNewNode(/*IsAllocation=*/false, F: NodeToCallingFunc[Node]);
2833 assert(!Info->Calls.empty());
2834 // The first call becomes the primary call for this caller node, and the
2835 // rest go in the matching calls list.
2836 Info->Node->setCall(Info->Calls.front());
2837 llvm::append_range(Info->Node->MatchingCalls,
2838 llvm::drop_begin(Info->Calls));
2839 // Save the primary call to node correspondence so that we can update
2840 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2841 // caller of this function.
2842 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2843 }
2844 CallerNodeToUse = Info->Node;
2845 }
2846
2847 // Don't need to move edge if we are using the original node;
2848 if (CallerNodeToUse == Node)
2849 continue;
2850
2851 moveCalleeEdgeToNewCaller(Edge, NewCaller: CallerNodeToUse);
2852 }
2853 // Now that we are done moving edges, clean up any caller edges that ended
2854 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2855 // caller edges from Node are replicated onto the new callers, and it
2856 // simplifies the handling to leave them until we have moved all
2857 // edges/context ids.
2858 for (auto &I : CalleeNodeToCallInfo)
2859 removeNoneTypeCallerEdges(Node: I.second->Node);
2860 if (UnmatchedCalleesNode)
2861 removeNoneTypeCallerEdges(Node: UnmatchedCalleesNode);
2862 removeNoneTypeCallerEdges(Node);
2863
2864 return true;
2865}
2866
// Map an id-or-index value from a callsite context to an actual stack id.
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Module (IR) case this is already the Id.
  return IdOrIndex;
}
2871
2872uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2873 // In the Index case this is an index into the stack id list in the summary
2874 // index, convert it to an Id.
2875 return Index.getStackIdAtIndex(Index: IdOrIndex);
2876}
2877
// Checks that the profiled callee of the edge at EI matches the actual callee
// of Call in the IR/summary. If they instead match only through a chain of
// tail calls, new nodes are synthesized for the chain (memoized in
// TailCallToContextNodeMap) and spliced in between the edge's caller and
// callee, and the original edge is removed. Returns false if no unique match
// was found. On return EI is positioned so the caller's loop increment is
// correct.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  // Copy the shared_ptr so the edge object stays alive (and its fields
  // readable) even after the edge is removed from the graph below.
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, Func: ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain))
    return false;

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty())
    return true;

  // Helper that connects Caller to Callee with an edge carrying Edge's
  // context ids and alloc types, reusing an existing edge if one exists.
  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert_range(Edge->ContextIds);
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller, insert
      // the new edge before the current iterator position, and then increment
      // back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      // Merge in this edge's alloc types, since the node may be shared by
      // multiple mismatched callsites.
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NewNode = createNewNode(/*IsAllocation=*/false, F: Func, C: NewCall);
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

#ifndef NDEBUG
  // Save this because Edge's fields get cleared below when removed.
  auto *Caller = Edge->Caller;
#endif

  // Remove old edge
  removeEdgeFromGraph(Edge: Edge.get(), EI: &EI, /*CalleeIter=*/true);

  // To simplify the increment of EI in the caller, subtract one from EI.
  // In the final AddEdge call we would have either added a new callee edge,
  // to Edge->Caller, or found an existing one. Either way we are guaranteed
  // that there is at least one callee edge.
  assert(!Caller->CalleeEdges.empty());
  --EI;

  return true;
}
2965
// Recursively searches (depth-first, bounded by TailCallSearchDepth) for a
// chain of tail calls from CurCallee that reaches ProfiledCallee. On success
// the discovered callsites are appended to FoundCalleeChain. If more than one
// distinct chain reaches the profiled callee, FoundMultipleCalleeChains is
// set and false is returned, since cloning along a non-unique chain could be
// incorrect.
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  // Records one link of the discovered chain.
  auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
    FoundCalleeChain.push_back(x: {Callsite, F});
  };

  // CurCallee is expected to be either a Function or an alias whose aliasee
  // is a Function (asserted below).
  auto *CalleeFunc = dyn_cast<Function>(Val: CurCallee);
  if (!CalleeFunc) {
    auto *Alias = dyn_cast<GlobalAlias>(Val: CurCallee);
    assert(Alias);
    CalleeFunc = dyn_cast<Function>(Val: Alias->getAliasee());
    assert(CalleeFunc);
  }

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &BB : *CalleeFunc) {
    for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(Val: &I);
      if (!CB || !CB->isTailCall())
        continue;
      auto *CalledValue = CB->getCalledOperand();
      auto *CalledFunction = CB->getCalledFunction();
      if (CalledValue && !CalledFunction) {
        CalledValue = CalledValue->stripPointerCasts();
        // Stripping pointer casts can reveal a called function.
        CalledFunction = dyn_cast<Function>(Val: CalledValue);
      }
      // Check if this is an alias to a function. If so, get the
      // called aliasee for the checks below.
      if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
        assert(!CalledFunction &&
               "Expected null called function in callsite for alias");
        CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
      }
      if (!CalledFunction)
        continue;
      if (CalledFunction == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain.
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        // Update statistics about successful matches through tail calls.
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CalledFunction, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
3043
3044const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
3045 auto *CB = dyn_cast<CallBase>(Val: Call);
3046 if (!CB->getCalledOperand() || CB->isIndirectCall())
3047 return nullptr;
3048 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3049 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
3050 if (Alias)
3051 return dyn_cast<Function>(Val: Alias->getAliasee());
3052 return dyn_cast<Function>(Val: CalleeVal);
3053}
3054
3055bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3056 Instruction *Call, const Function *Func, const Function *CallerFunc,
3057 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3058 auto *CB = dyn_cast<CallBase>(Val: Call);
3059 if (!CB->getCalledOperand() || CB->isIndirectCall())
3060 return false;
3061 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3062 auto *CalleeFunc = dyn_cast<Function>(Val: CalleeVal);
3063 if (CalleeFunc == Func)
3064 return true;
3065 auto *Alias = dyn_cast<GlobalAlias>(Val: CalleeVal);
3066 if (Alias && Alias->getAliasee() == Func)
3067 return true;
3068
3069 // Recursively search for the profiled callee through tail calls starting with
3070 // the actual Callee. The discovered tail call chain is saved in
3071 // FoundCalleeChain, and we will fixup the graph to include these callsites
3072 // after returning.
3073 // FIXME: We will currently redo the same recursive walk if we find the same
3074 // mismatched callee from another callsite. We can improve this with more
3075 // bookkeeping of the created chain of new nodes for each mismatch.
3076 unsigned Depth = 1;
3077 bool FoundMultipleCalleeChains = false;
3078 if (!findProfiledCalleeThroughTailCalls(ProfiledCallee: Func, CurCallee: CalleeVal, Depth,
3079 FoundCalleeChain,
3080 FoundMultipleCalleeChains)) {
3081 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3082 << Func->getName() << " from " << CallerFunc->getName()
3083 << " that actually called " << CalleeVal->getName()
3084 << (FoundMultipleCalleeChains
3085 ? " (found multiple possible chains)"
3086 : "")
3087 << "\n");
3088 if (FoundMultipleCalleeChains)
3089 FoundProfiledCalleeNonUniquelyCount++;
3090 return false;
3091 }
3092
3093 return true;
3094}
3095
3096bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3097 Instruction *Call2) {
3098 auto *CB1 = cast<CallBase>(Val: Call1);
3099 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3100 return false;
3101 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3102 auto *CalleeFunc1 = dyn_cast<Function>(Val: CalleeVal1);
3103 auto *CB2 = cast<CallBase>(Val: Call2);
3104 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3105 return false;
3106 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3107 auto *CalleeFunc2 = dyn_cast<Function>(Val: CalleeVal2);
3108 return CalleeFunc1 == CalleeFunc2;
3109}
3110
// Summary-index analog of the Module version above: recursively searches
// (bounded by TailCallSearchDepth) for a chain of tail calls from CurCallee's
// function summary that reaches ProfiledCallee. Synthesized CallsiteInfo
// records for the discovered callsites are appended to FoundCalleeChain.
// Fails (setting FoundMultipleCalleeChains) if more than one distinct chain
// is found, since cloning along a non-unique chain could be incorrect.
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
    // Make a CallsiteInfo for each discovered callee, if one hasn't already
    // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(x: FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(x: Callee))
      // StackIds is empty (we don't have debug info available in the index for
      // these callsites)
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(args&: Callee, args: SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back(x: {NewCallsiteInfo, FS});
  };

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &S : CurCallee.getSummaryList()) {
    // Only consider local-linkage or prevailing copies of the callee.
    if (!GlobalValue::isLocalLinkage(Linkage: S->linkage()) &&
        !isPrevailing(CurCallee.getGUID(), S.get()))
      continue;
    auto *FS = dyn_cast<FunctionSummary>(Val: S->getBaseObject());
    if (!FS)
      continue;
    // If this summary is an alias, record the aliasee's ValueInfo in
    // FSToVIMap below instead of the alias itself.
    auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(Val: S.get());
    if (AS)
      FSVI = AS->getAliaseeVI();
    for (auto &CallEdge : FS->calls()) {
      if (!CallEdge.second.hasTailCall())
        continue;
      if (CallEdge.first == ProfiledCallee) {
        // Direct tail call of the profiled callee. Fail if we already found
        // another chain.
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        // Update statistics about successful matches through tail calls.
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CurCallee: CallEdge.first, Depth: Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // findProfiledCalleeThroughTailCalls should not have returned
        // true if FoundMultipleCalleeChains.
        assert(!FoundMultipleCalleeChains);
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (FoundMultipleCalleeChains)
        return false;
    }
  }

  return FoundSingleCalleeChain;
}
3189
3190const FunctionSummary *
3191IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3192 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
3193 if (Callee.getSummaryList().empty())
3194 return nullptr;
3195 return dyn_cast<FunctionSummary>(Val: Callee.getSummaryList()[0]->getBaseObject());
3196}
3197
// Returns true if the summary callee of Call matches the profiled callee
// Func, either directly, via an alias summary, or through a unique chain of
// tail calls discovered by findProfiledCalleeThroughTailCalls (recorded in
// FoundCalleeChain for later graph fixup).
bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Val&: Call)->Callee;
  // If there is no summary list then this is a call to an externally defined
  // symbol.
  AliasSummary *Alias =
      Callee.getSummaryList().empty()
          ? nullptr
          : dyn_cast<AliasSummary>(Val: Callee.getSummaryList()[0].get());
  // Func must already have been mapped to its ValueInfo.
  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];
  if (Callee == FuncVI ||
      // If callee is an alias, check the aliasee, since only function
      // summary base objects will contain the stack node summaries and thus
      // get a context node.
      (Alias && Alias->getAliaseeVI() == FuncVI))
    return true;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(
          ProfiledCallee: FuncVI, CurCallee: Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                      << " from " << FSToVIMap[CallerFunc]
                      << " that actually called " << Callee
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}
3243
3244bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3245 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call1)->Callee;
3246 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Val&: Call2)->Callee;
3247 return Callee1 == Callee2;
3248}
3249
3250template <typename DerivedCCG, typename FuncTy, typename CallTy>
3251void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3252 const {
3253 print(OS&: dbgs());
3254 dbgs() << "\n";
3255}
3256
// Prints a multi-line description of this node: its call (and any matching
// calls), node id, alloc types, sorted context ids, callee/caller edges, and
// clone relationships.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
    raw_ostream &OS) const {
  OS << "Node " << this << "\n";
  OS << "\t";
  printCall(OS);
  if (Recursive)
    OS << " (recursive)";
  OS << "\n";
  if (!MatchingCalls.empty()) {
    OS << "\tMatchingCalls:\n";
    for (auto &MatchingCall : MatchingCalls) {
      OS << "\t";
      MatchingCall.print(OS);
      OS << "\n";
    }
  }
  OS << "\tNodeId: " << NodeId << "\n";
  OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
  OS << "\tContextIds:";
  // Make a copy of the computed context ids that we can sort for stability.
  auto ContextIds = getContextIds();
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(first: SortedIds.begin(), last: SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
  OS << "\n";
  OS << "\tCalleeEdges:\n";
  for (auto &Edge : CalleeEdges)
    OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
       << ")\n";
  OS << "\tCallerEdges:\n";
  for (auto &Edge : CallerEdges)
    OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
       << ")\n";
  if (!Clones.empty()) {
    // This is an original node that has been cloned.
    OS << "\tClones: ";
    ListSeparator LS;
    for (auto *C : Clones)
      OS << LS << C << " NodeId: " << C->NodeId;
    OS << "\n";
  } else if (CloneOf) {
    // This node is itself a clone of another node.
    OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
  }
}
3302
3303template <typename DerivedCCG, typename FuncTy, typename CallTy>
3304void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3305 const {
3306 print(OS&: dbgs());
3307 dbgs() << "\n";
3308}
3309
// Prints a one-line description of this edge: its endpoints, whether it is a
// backedge, its alloc types, and its (sorted) context ids. No trailing
// newline is emitted.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
    raw_ostream &OS) const {
  OS << "Edge from Callee " << Callee << " to Caller: " << Caller
     << (IsBackedge ? " (BE)" : "")
     << " AllocTypes: " << getAllocTypeString(AllocTypes);
  OS << " ContextIds:";
  // Sort a copy of the context ids so the output is stable.
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(first: SortedIds.begin(), last: SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
}
3322
3323template <typename DerivedCCG, typename FuncTy, typename CallTy>
3324void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3325 print(OS&: dbgs());
3326}
3327
3328template <typename DerivedCCG, typename FuncTy, typename CallTy>
3329void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3330 raw_ostream &OS) const {
3331 OS << "Callsite Context Graph:\n";
3332 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3333 for (const auto Node : nodes<GraphType>(this)) {
3334 if (Node->isRemoved())
3335 continue;
3336 Node->print(OS);
3337 OS << "\n";
3338 }
3339}
3340
3341template <typename DerivedCCG, typename FuncTy, typename CallTy>
3342void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3343 raw_ostream &OS,
3344 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) const {
3345 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3346 for (const auto Node : nodes<GraphType>(this)) {
3347 if (Node->isRemoved())
3348 continue;
3349 if (!Node->IsAllocation)
3350 continue;
3351 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3352 auto AllocTypeFromCall = getAllocationCallType(Call: Node->Call);
3353 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3354 std::sort(first: SortedIds.begin(), last: SortedIds.end());
3355 for (auto Id : SortedIds) {
3356 auto TypeI = ContextIdToAllocationType.find(Val: Id);
3357 assert(TypeI != ContextIdToAllocationType.end());
3358 auto CSI = ContextIdToContextSizeInfos.find(Val: Id);
3359 if (CSI != ContextIdToContextSizeInfos.end()) {
3360 for (auto &Info : CSI->second) {
3361 std::string Msg =
3362 "MemProf hinting: " + getAllocTypeString(AllocTypes: (uint8_t)TypeI->second) +
3363 " full allocation context " + std::to_string(val: Info.FullStackId) +
3364 " with total size " + std::to_string(val: Info.TotalSize) + " is " +
3365 getAllocTypeString(Node->AllocTypes) + " after cloning";
3366 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3367 Msg += " marked " + getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall) +
3368 " due to cold byte percent";
3369 // Print the internal context id to aid debugging and visualization.
3370 Msg += " (internal context id " + std::to_string(val: Id) + ")";
3371 if (MemProfReportHintedSizes)
3372 OS << Msg << "\n";
3373 if (EmitRemark)
3374 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3375 }
3376 } else {
3377 // This is only emitted if the context size info is not present.
3378 std::string Msg =
3379 "MemProf hinting: " + getAllocTypeString(AllocTypes: (uint8_t)TypeI->second) +
3380 " is " + getAllocTypeString(Node->AllocTypes) + " after cloning";
3381 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3382 Msg += " marked " + getAllocTypeString(AllocTypes: (uint8_t)AllocTypeFromCall) +
3383 " due to cold byte percent";
3384 // Print the internal context id to aid debugging and visualization.
3385 Msg += " (internal context id " + std::to_string(val: Id) + ")";
3386 if (MemProfReportHintedSizes)
3387 OS << Msg << "\n";
3388 if (EmitRemark)
3389 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3390 }
3391 }
3392 }
3393}
3394
3395template <typename DerivedCCG, typename FuncTy, typename CallTy>
3396void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3397 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3398 for (const auto Node : nodes<GraphType>(this)) {
3399 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3400 for (auto &Edge : Node->CallerEdges)
3401 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3402 }
3403}
3404
// GraphTraits specialization that lets generic graph utilities (node
// iteration, dot export via WriteGraph) operate on a CallsiteContextGraph.
// The node list is the graph's NodeOwner vector, and a node's children are
// the callee endpoints of its callee edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;

  // Unwraps the owning unique_ptr held in NodeOwner to the raw node pointer
  // used as NodeRef.
  using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
  static NodeRef getNode(const NodePtrTy &P) { return P.get(); }

  using nodes_iterator =
      mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
                      decltype(&getNode)>;

  static nodes_iterator nodes_begin(GraphType G) {
    return nodes_iterator(G->NodeOwner.begin(), &getNode);
  }

  static nodes_iterator nodes_end(GraphType G) {
    return nodes_iterator(G->NodeOwner.end(), &getNode);
  }

  static NodeRef getEntryNode(GraphType G) {
    return G->NodeOwner.begin()->get();
  }

  // Maps a (shared) callee edge to its callee node for child iteration.
  using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
  static const ContextNode<DerivedCCG, FuncTy, CallTy> *
  GetCallee(const EdgePtrTy &P) {
    return P->Callee;
  }

  using ChildIteratorType =
      mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
                          DerivedCCG, FuncTy, CallTy>>>::const_iterator,
                      decltype(&GetCallee)>;

  static ChildIteratorType child_begin(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
  }

  static ChildIteratorType child_end(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
  }
};
3448
// DOTGraphTraits specialization controlling how a CallsiteContextGraph is
// rendered as a dot graph: node labels/colors/tooltips, edge styling, hiding
// of removed or out-of-scope nodes, and optional highlighting of contexts
// selected via the ContextIdForDot / AllocIdForDot options.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
    : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
    // If the user requested the full graph to be exported, but provided an
    // allocation id, or if the user gave a context id and requested more than
    // just a specific context to be exported, note that highlighting is
    // enabled.
    DoHighlight =
        (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
        (ContextIdForDot.getNumOccurrences() &&
         DotGraphScope != DotScope::Context);
  }

  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using GTraits = GraphTraits<GraphType>;
  using NodeRef = typename GTraits::NodeRef;
  using ChildIteratorType = typename GTraits::ChildIteratorType;

  // Node label: the original stack/alloc id and node id, followed by the
  // call(s) the node represents (or a null-call marker).
  static std::string getNodeLabel(NodeRef Node, GraphType G) {
    std::string LabelString =
        (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
         Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
            .str();
    LabelString += "\n";
    if (Node->hasCall()) {
      auto Func = G->NodeToCallingFunc.find(Node);
      assert(Func != G->NodeToCallingFunc.end());
      LabelString +=
          G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
      for (auto &MatchingCall : Node->MatchingCalls) {
        LabelString += "\n";
        LabelString += G->getLabel(Func->second, MatchingCall.call(),
                                   MatchingCall.cloneNo());
      }
    } else {
      LabelString += "null call";
      if (Node->Recursive)
        LabelString += " (recursive)";
      else
        LabelString += " (external)";
    }
    return LabelString;
  }

  static std::string getNodeAttributes(NodeRef Node, GraphType G) {
    auto ContextIds = Node->getContextIds();
    // If highlighting enabled, see if this node contains any of the context ids
    // of interest. If so, it will use a different color and a larger fontsize
    // (which makes the node larger as well).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
    }
    std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
                                   getContextIds(ContextIds) + "\"")
                                      .str();
    // Default fontsize is 14
    if (Highlight)
      AttributeString += ",fontsize=\"30\"";
    AttributeString +=
        (Twine(",fillcolor=\"") + getColor(AllocTypes: Node->AllocTypes, Highlight) + "\"")
            .str();
    // Clones are drawn with a blue dashed-bold border.
    if (Node->CloneOf) {
      AttributeString += ",color=\"blue\"";
      AttributeString += ",style=\"filled,bold,dashed\"";
    } else
      AttributeString += ",style=\"filled\"";
    return AttributeString;
  }

  static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
                                       GraphType G) {
    auto &Edge = *(ChildIter.getCurrent());
    // If highlighting enabled, see if this edge contains any of the context ids
    // of interest. If so, it will use a different color and a heavier arrow
    // size and weight (the larger weight makes the highlighted path
    // straighter).
    bool Highlight = false;
    if (DoHighlight) {
      assert(ContextIdForDot.getNumOccurrences() ||
             AllocIdForDot.getNumOccurrences());
      if (ContextIdForDot.getNumOccurrences())
        Highlight = Edge->ContextIds.contains(ContextIdForDot);
      else
        Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
    }
    auto Color = getColor(AllocTypes: Edge->AllocTypes, Highlight);
    std::string AttributeString =
        (Twine("tooltip=\"") + getContextIds(ContextIds: Edge->ContextIds) + "\"" +
         // fillcolor is the arrow head and color is the line
         Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
         "\"")
            .str();
    if (Edge->IsBackedge)
      AttributeString += ",style=\"dotted\"";
    // Default penwidth and weight are both 1.
    if (Highlight)
      AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
    return AttributeString;
  }

  // Since the NodeOwners list includes nodes that are no longer connected to
  // the graph, skip them here.
  static bool isNodeHidden(NodeRef Node, GraphType G) {
    if (Node->isRemoved())
      return true;
    // If a scope smaller than the full graph was requested, see if this node
    // contains any of the context ids of interest.
    if (DotGraphScope == DotScope::Alloc)
      return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
    if (DotGraphScope == DotScope::Context)
      return !Node->getContextIds().contains(ContextIdForDot);
    return false;
  }

private:
  // Renders a context id set for tooltips: sorted ids when small, otherwise
  // just a count.
  static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
    std::string IdString = "ContextIds:";
    if (ContextIds.size() < 100) {
      std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
      std::sort(first: SortedIds.begin(), last: SortedIds.end());
      for (auto Id : SortedIds)
        IdString += (" " + Twine(Id)).str();
    } else {
      IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
    }
    return IdString;
  }

  static std::string getColor(uint8_t AllocTypes, bool Highlight) {
    // If DoHighlight is not enabled, we want to use the highlight colors for
    // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
    // both compatible with the color scheme before highlighting was supported,
    // and for the NotCold+Cold color the non-highlight color is a bit more
    // readable.
    if (AllocTypes == (uint8_t)AllocationType::NotCold)
      // Color "brown1" actually looks like a lighter red.
      return !DoHighlight || Highlight ? "brown1" : "lightpink";
    if (AllocTypes == (uint8_t)AllocationType::Cold)
      return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
    if (AllocTypes ==
        ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
      return Highlight ? "magenta" : "mediumorchid1";
    return "gray";
  }

  // Unique dot node id derived from the node's address.
  static std::string getNodeId(NodeRef Node) {
    std::stringstream SStream;
    SStream << std::hex << "N0x" << (unsigned long long)Node;
    std::string Result = SStream.str();
    return Result;
  }

  // True if we should highlight a specific context or allocation's contexts in
  // the emitted graph.
  static bool DoHighlight;
};
3612
// Out-of-line definition of the static DoHighlight flag; defaults to false
// and is set by the DOTGraphTraits constructor based on the dot options.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool DOTGraphTraits<
    const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
    false;
3617
3618template <typename DerivedCCG, typename FuncTy, typename CallTy>
3619void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3620 std::string Label) const {
3621 WriteGraph(this, "", false, Label,
3622 DotFilePathPrefix + "ccg." + Label + ".dot");
3623}
3624
// Creates a new clone of Edge's callee node (inheriting its call, containing
// function, and matching calls) and moves Edge (or just the given subset of
// its context ids) onto the clone. Returns the new clone.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
    const std::shared_ptr<ContextEdge> &Edge,
    DenseSet<uint32_t> ContextIdsToMove) {
  ContextNode *Node = Edge->Callee;
  assert(NodeToCallingFunc.count(Node));
  // The clone shares the original node's call and containing function.
  ContextNode *Clone =
      createNewNode(IsAllocation: Node->IsAllocation, F: NodeToCallingFunc[Node], C: Node->Call);
  Node->addClone(Clone);
  Clone->MatchingCalls = Node->MatchingCalls;
  // Reuse the existing-clone path to do the actual edge/context id movement.
  moveEdgeToExistingCalleeClone(Edge, NewCallee: Clone, /*NewClone=*/true,
                                ContextIdsToMove);
  return Clone;
}
3640
3641template <typename DerivedCCG, typename FuncTy, typename CallTy>
3642void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3643 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3644 ContextNode *NewCallee, bool NewClone,
3645 DenseSet<uint32_t> ContextIdsToMove) {
3646 // NewCallee and Edge's current callee must be clones of the same original
3647 // node (Edge's current callee may be the original node too).
3648 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3649
3650 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3651
3652 ContextNode *OldCallee = Edge->Callee;
3653
3654 // We might already have an edge to the new callee from earlier cloning for a
3655 // different allocation. If one exists we will reuse it.
3656 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3657
3658 // Callers will pass an empty ContextIdsToMove set when they want to move the
3659 // edge. Copy in Edge's ids for simplicity.
3660 if (ContextIdsToMove.empty())
3661 ContextIdsToMove = Edge->getContextIds();
3662
3663 // If we are moving all of Edge's ids, then just move the whole Edge.
3664 // Otherwise only move the specified subset, to a new edge if needed.
3665 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3666 // First, update the alloc types on New Callee from Edge.
3667 // Do this before we potentially clear Edge's fields below!
3668 NewCallee->AllocTypes |= Edge->AllocTypes;
3669 // Moving the whole Edge.
3670 if (ExistingEdgeToNewCallee) {
3671 // Since we already have an edge to NewCallee, simply move the ids
3672 // onto it, and remove the existing Edge.
3673 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3674 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3675 assert(Edge->ContextIds == ContextIdsToMove);
3676 removeEdgeFromGraph(Edge: Edge.get());
3677 } else {
3678 // Otherwise just reconnect Edge to NewCallee.
3679 Edge->Callee = NewCallee;
3680 NewCallee->CallerEdges.push_back(Edge);
3681 // Remove it from callee where it was previously connected.
3682 OldCallee->eraseCallerEdge(Edge.get());
3683 // Don't need to update Edge's context ids since we are simply
3684 // reconnecting it.
3685 }
3686 } else {
3687 // Only moving a subset of Edge's ids.
3688 // Compute the alloc type of the subset of ids being moved.
3689 auto CallerEdgeAllocType = computeAllocType(ContextIds&: ContextIdsToMove);
3690 if (ExistingEdgeToNewCallee) {
3691 // Since we already have an edge to NewCallee, simply move the ids
3692 // onto it.
3693 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3694 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3695 } else {
3696 // Otherwise, create a new edge to NewCallee for the ids being moved.
3697 auto NewEdge = std::make_shared<ContextEdge>(
3698 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3699 Edge->Caller->CalleeEdges.push_back(NewEdge);
3700 NewCallee->CallerEdges.push_back(NewEdge);
3701 }
3702 // In either case, need to update the alloc types on NewCallee, and remove
3703 // those ids and update the alloc type on the original Edge.
3704 NewCallee->AllocTypes |= CallerEdgeAllocType;
3705 set_subtract(Edge->ContextIds, ContextIdsToMove);
3706 Edge->AllocTypes = computeAllocType(ContextIds&: Edge->ContextIds);
3707 }
3708 // Now walk the old callee node's callee edges and move Edge's context ids
3709 // over to the corresponding edge into the clone (which is created here if
3710 // this is a newly created clone).
3711 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3712 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3713 // If this is a direct recursion edge, use NewCallee (the clone) as the
3714 // callee as well, so that any edge updated/created here is also direct
3715 // recursive.
3716 if (CalleeToUse == OldCallee) {
3717 // If this is a recursive edge, see if we already moved a recursive edge
3718 // (which would have to have been this one) - if we were only moving a
3719 // subset of context ids it would still be on OldCallee.
3720 if (EdgeIsRecursive) {
3721 assert(OldCalleeEdge == Edge);
3722 continue;
3723 }
3724 CalleeToUse = NewCallee;
3725 }
3726 // The context ids moving to the new callee are the subset of this edge's
3727 // context ids and the context ids on the caller edge being moved.
3728 DenseSet<uint32_t> EdgeContextIdsToMove =
3729 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3730 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3731 OldCalleeEdge->AllocTypes =
3732 computeAllocType(ContextIds&: OldCalleeEdge->getContextIds());
3733 if (!NewClone) {
3734 // Update context ids / alloc type on corresponding edge to NewCallee.
3735 // There is a chance this may not exist if we are reusing an existing
3736 // clone, specifically during function assignment, where we would have
3737 // removed none type edges after creating the clone. If we can't find
3738 // a corresponding edge there, fall through to the cloning below.
3739 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3740 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3741 NewCalleeEdge->AllocTypes |= computeAllocType(ContextIds&: EdgeContextIdsToMove);
3742 continue;
3743 }
3744 }
3745 auto NewEdge = std::make_shared<ContextEdge>(
3746 CalleeToUse, NewCallee, computeAllocType(ContextIds&: EdgeContextIdsToMove),
3747 EdgeContextIdsToMove);
3748 NewCallee->CalleeEdges.push_back(NewEdge);
3749 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3750 }
3751 // Recompute the node alloc type now that its callee edges have been
3752 // updated (since we will compute from those edges).
3753 OldCallee->AllocTypes = OldCallee->computeAllocType();
3754 // OldCallee alloc type should be None iff its context id set is now empty.
3755 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3756 OldCallee->emptyContextIds());
3757 if (VerifyCCG) {
3758 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3759 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3760 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3761 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3762 /*CheckEdges=*/false);
3763 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3764 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3765 /*CheckEdges=*/false);
3766 }
3767}
3768
3769template <typename DerivedCCG, typename FuncTy, typename CallTy>
3770void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3771 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3772 ContextNode *NewCaller) {
3773 auto *OldCallee = Edge->Callee;
3774 auto *NewCallee = OldCallee;
3775 // If this edge was direct recursive, make any new/updated edge also direct
3776 // recursive to NewCaller.
3777 bool Recursive = Edge->Caller == Edge->Callee;
3778 if (Recursive)
3779 NewCallee = NewCaller;
3780
3781 ContextNode *OldCaller = Edge->Caller;
3782 OldCaller->eraseCalleeEdge(Edge.get());
3783
3784 // We might already have an edge to the new caller. If one exists we will
3785 // reuse it.
3786 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3787
3788 if (ExistingEdgeToNewCaller) {
3789 // Since we already have an edge to NewCaller, simply move the ids
3790 // onto it, and remove the existing Edge.
3791 ExistingEdgeToNewCaller->getContextIds().insert_range(
3792 Edge->getContextIds());
3793 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3794 Edge->ContextIds.clear();
3795 Edge->AllocTypes = (uint8_t)AllocationType::None;
3796 OldCallee->eraseCallerEdge(Edge.get());
3797 } else {
3798 // Otherwise just reconnect Edge to NewCaller.
3799 Edge->Caller = NewCaller;
3800 NewCaller->CalleeEdges.push_back(Edge);
3801 if (Recursive) {
3802 assert(NewCallee == NewCaller);
3803 // In the case of (direct) recursive edges, we update the callee as well
3804 // so that it becomes recursive on the new caller.
3805 Edge->Callee = NewCallee;
3806 NewCallee->CallerEdges.push_back(Edge);
3807 OldCallee->eraseCallerEdge(Edge.get());
3808 }
3809 // Don't need to update Edge's context ids since we are simply
3810 // reconnecting it.
3811 }
3812 // In either case, need to update the alloc types on New Caller.
3813 NewCaller->AllocTypes |= Edge->AllocTypes;
3814
3815 // Now walk the old caller node's caller edges and move Edge's context ids
3816 // over to the corresponding edge into the node (which is created here if
3817 // this is a newly created node). We can tell whether this is a newly created
3818 // node by seeing if it has any caller edges yet.
3819#ifndef NDEBUG
3820 bool IsNewNode = NewCaller->CallerEdges.empty();
3821#endif
3822 // If we just moved a direct recursive edge, presumably its context ids should
3823 // also flow out of OldCaller via some other non-recursive callee edge. We
3824 // don't want to remove the recursive context ids from other caller edges yet,
3825 // otherwise the context ids get into an inconsistent state on OldCaller.
3826 // We will update these context ids on the non-recursive caller edge when and
3827 // if they are updated on the non-recursive callee.
3828 if (!Recursive) {
3829 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3830 auto OldCallerCaller = OldCallerEdge->Caller;
3831 // The context ids moving to the new caller are the subset of this edge's
3832 // context ids and the context ids on the callee edge being moved.
3833 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3834 OldCallerEdge->getContextIds(), Edge->getContextIds());
3835 if (OldCaller == OldCallerCaller) {
3836 OldCallerCaller = NewCaller;
3837 // Don't actually move this one. The caller will move it directly via a
3838 // call to this function with this as the Edge if it is appropriate to
3839 // move to a diff node that has a matching callee (itself).
3840 continue;
3841 }
3842 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3843 OldCallerEdge->AllocTypes =
3844 computeAllocType(ContextIds&: OldCallerEdge->getContextIds());
3845 // In this function we expect that any pre-existing node already has edges
3846 // from the same callers as the old node. That should be true in the
3847 // current use case, where we will remove None-type edges after copying
3848 // over all caller edges from the callee.
3849 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3850 // Since we would have skipped caller edges when moving a direct recursive
3851 // edge, this may not hold true when recursive handling enabled.
3852 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3853 if (ExistingCallerEdge) {
3854 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3855 ExistingCallerEdge->AllocTypes |=
3856 computeAllocType(ContextIds&: EdgeContextIdsToMove);
3857 continue;
3858 }
3859 auto NewEdge = std::make_shared<ContextEdge>(
3860 NewCaller, OldCallerCaller, computeAllocType(ContextIds&: EdgeContextIdsToMove),
3861 EdgeContextIdsToMove);
3862 NewCaller->CallerEdges.push_back(NewEdge);
3863 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3864 }
3865 }
3866 // Recompute the node alloc type now that its caller edges have been
3867 // updated (since we will compute from those edges).
3868 OldCaller->AllocTypes = OldCaller->computeAllocType();
3869 // OldCaller alloc type should be None iff its context id set is now empty.
3870 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3871 OldCaller->emptyContextIds());
3872 if (VerifyCCG) {
3873 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3874 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3875 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3876 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3877 /*CheckEdges=*/false);
3878 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3879 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3880 /*CheckEdges=*/false);
3881 }
3882}
3883
3884template <typename DerivedCCG, typename FuncTy, typename CallTy>
3885void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3886 recursivelyRemoveNoneTypeCalleeEdges(
3887 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3888 auto Inserted = Visited.insert(Node);
3889 if (!Inserted.second)
3890 return;
3891
3892 removeNoneTypeCalleeEdges(Node);
3893
3894 for (auto *Clone : Node->Clones)
3895 recursivelyRemoveNoneTypeCalleeEdges(Node: Clone, Visited);
3896
3897 // The recursive call may remove some of this Node's caller edges.
3898 // Iterate over a copy and skip any that were removed.
3899 auto CallerEdges = Node->CallerEdges;
3900 for (auto &Edge : CallerEdges) {
3901 // Skip any that have been removed by an earlier recursive call.
3902 if (Edge->isRemoved()) {
3903 assert(!is_contained(Node->CallerEdges, Edge));
3904 continue;
3905 }
3906 recursivelyRemoveNoneTypeCalleeEdges(Node: Edge->Caller, Visited);
3907 }
3908}
3909
3910// This is the standard DFS based backedge discovery algorithm.
3911template <typename DerivedCCG, typename FuncTy, typename CallTy>
3912void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3913 // If we are cloning recursive contexts, find and mark backedges from all root
3914 // callers, using the typical DFS based backedge analysis.
3915 if (!CloneRecursiveContexts)
3916 return;
3917 DenseSet<const ContextNode *> Visited;
3918 DenseSet<const ContextNode *> CurrentStack;
3919 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3920 auto *Node = Entry.second;
3921 if (Node->isRemoved())
3922 continue;
3923 // It is a root if it doesn't have callers.
3924 if (!Node->CallerEdges.empty())
3925 continue;
3926 markBackedges(Node, Visited, CurrentStack);
3927 assert(CurrentStack.empty());
3928 }
3929}
3930
3931// Recursive helper for above markBackedges method.
3932template <typename DerivedCCG, typename FuncTy, typename CallTy>
3933void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3934 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3935 DenseSet<const ContextNode *> &CurrentStack) {
3936 auto I = Visited.insert(Node);
3937 // We should only call this for unvisited nodes.
3938 assert(I.second);
3939 (void)I;
3940 for (auto &CalleeEdge : Node->CalleeEdges) {
3941 auto *Callee = CalleeEdge->Callee;
3942 if (Visited.count(Callee)) {
3943 // Since this was already visited we need to check if it is currently on
3944 // the recursive stack in which case it is a backedge.
3945 if (CurrentStack.count(Callee))
3946 CalleeEdge->IsBackedge = true;
3947 continue;
3948 }
3949 CurrentStack.insert(Callee);
3950 markBackedges(Callee, Visited, CurrentStack);
3951 CurrentStack.erase(Callee);
3952 }
3953}
3954
3955template <typename DerivedCCG, typename FuncTy, typename CallTy>
3956void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3957 DenseSet<const ContextNode *> Visited;
3958 for (auto &Entry : AllocationCallToContextNodeMap) {
3959 Visited.clear();
3960 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3961 }
3962 Visited.clear();
3963 for (auto &Entry : AllocationCallToContextNodeMap)
3964 recursivelyRemoveNoneTypeCalleeEdges(Node: Entry.second, Visited);
3965 if (VerifyCCG)
3966 check();
3967}
3968
3969// helper function to check an AllocType is cold or notcold or both.
3970bool checkColdOrNotCold(uint8_t AllocType) {
3971 return (AllocType == (uint8_t)AllocationType::Cold) ||
3972 (AllocType == (uint8_t)AllocationType::NotCold) ||
3973 (AllocType ==
3974 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3975}
3976
3977template <typename DerivedCCG, typename FuncTy, typename CallTy>
3978void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3979 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3980 const DenseSet<uint32_t> &AllocContextIds) {
3981 if (VerifyNodes)
3982 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3983 assert(!Node->CloneOf);
3984
3985 // If Node as a null call, then either it wasn't found in the module (regular
3986 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3987 // cloning (e.g. recursion, calls multiple targets, etc).
3988 // Do this here so that we don't try to recursively clone callers below, which
3989 // isn't useful at least for this node.
3990 if (!Node->hasCall())
3991 return;
3992
3993 // No need to look at any callers if allocation type already unambiguous.
3994 if (hasSingleAllocType(Node->AllocTypes))
3995 return;
3996
3997#ifndef NDEBUG
3998 auto Insert =
3999#endif
4000 Visited.insert(Node);
4001 // We should not have visited this node yet.
4002 assert(Insert.second);
4003 // The recursive call to identifyClones may delete the current edge from the
4004 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
4005 // in an iterator and having recursive call erase from it. Other edges may
4006 // also get removed during the recursion, which will have null Callee and
4007 // Caller pointers (and are deleted later), so we skip those below.
4008 {
4009 auto CallerEdges = Node->CallerEdges;
4010 for (auto &Edge : CallerEdges) {
4011 // Skip any that have been removed by an earlier recursive call.
4012 if (Edge->isRemoved()) {
4013 assert(!is_contained(Node->CallerEdges, Edge));
4014 continue;
4015 }
4016 // Defer backedges. See comments further below where these edges are
4017 // handled during the cloning of this Node.
4018 if (Edge->IsBackedge) {
4019 // We should only mark these if cloning recursive contexts, where we
4020 // need to do this deferral.
4021 assert(CloneRecursiveContexts);
4022 continue;
4023 }
4024 // Ignore any caller we previously visited via another edge.
4025 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
4026 identifyClones(Edge->Caller, Visited, AllocContextIds);
4027 }
4028 }
4029 }
4030
4031 // Check if we reached an unambiguous call or have have only a single caller.
4032 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4033 return;
4034
4035 // We need to clone.
4036
4037 // Try to keep the original version as alloc type NotCold. This will make
4038 // cases with indirect calls or any other situation with an unknown call to
4039 // the original function get the default behavior. We do this by sorting the
4040 // CallerEdges of the Node we will clone by alloc type.
4041 //
4042 // Give NotCold edge the lowest sort priority so those edges are at the end of
4043 // the caller edges vector, and stay on the original version (since the below
4044 // code clones greedily until it finds all remaining edges have the same type
4045 // and leaves the remaining ones on the original Node).
4046 //
4047 // We shouldn't actually have any None type edges, so the sorting priority for
4048 // that is arbitrary, and we assert in that case below.
4049 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
4050 /*Cold*/ 1,
4051 /*NotColdCold*/ 2};
4052 llvm::stable_sort(Node->CallerEdges,
4053 [&](const std::shared_ptr<ContextEdge> &A,
4054 const std::shared_ptr<ContextEdge> &B) {
4055 // Nodes with non-empty context ids should be sorted
4056 // before those with empty context ids.
4057 if (A->ContextIds.empty())
4058 // Either B ContextIds are non-empty (in which case we
4059 // should return false because B < A), or B ContextIds
4060 // are empty, in which case they are equal, and we
4061 // should maintain the original relative ordering.
4062 return false;
4063 if (B->ContextIds.empty())
4064 return true;
4065
4066 if (A->AllocTypes == B->AllocTypes)
4067 // Use the first context id for each edge as a
4068 // tie-breaker.
4069 return *A->ContextIds.begin() < *B->ContextIds.begin();
4070 return AllocTypeCloningPriority[A->AllocTypes] <
4071 AllocTypeCloningPriority[B->AllocTypes];
4072 });
4073
4074 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4075
4076 DenseSet<uint32_t> RecursiveContextIds;
4077 assert(AllowRecursiveContexts || !CloneRecursiveContexts);
4078 // If we are allowing recursive callsites, but have also disabled recursive
4079 // contexts, look for context ids that show up in multiple caller edges.
4080 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
4081 DenseSet<uint32_t> AllCallerContextIds;
4082 for (auto &CE : Node->CallerEdges) {
4083 // Resize to the largest set of caller context ids, since we know the
4084 // final set will be at least that large.
4085 AllCallerContextIds.reserve(Size: CE->getContextIds().size());
4086 for (auto Id : CE->getContextIds())
4087 if (!AllCallerContextIds.insert(Id).second)
4088 RecursiveContextIds.insert(Id);
4089 }
4090 }
4091
4092 // Iterate until we find no more opportunities for disambiguating the alloc
4093 // types via cloning. In most cases this loop will terminate once the Node
4094 // has a single allocation type, in which case no more cloning is needed.
4095 // Iterate over a copy of Node's caller edges, since we may need to remove
4096 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4097 // makes it less error-prone.
4098 auto CallerEdges = Node->CallerEdges;
4099 for (auto &CallerEdge : CallerEdges) {
4100 // Skip any that have been removed by an earlier recursive call.
4101 if (CallerEdge->isRemoved()) {
4102 assert(!is_contained(Node->CallerEdges, CallerEdge));
4103 continue;
4104 }
4105 assert(CallerEdge->Callee == Node);
4106
4107 // See if cloning the prior caller edge left this node with a single alloc
4108 // type or a single caller. In that case no more cloning of Node is needed.
4109 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4110 break;
4111
4112 // If the caller was not successfully matched to a call in the IR/summary,
4113 // there is no point in trying to clone for it as we can't update that call.
4114 if (!CallerEdge->Caller->hasCall())
4115 continue;
4116
4117 // Only need to process the ids along this edge pertaining to the given
4118 // allocation.
4119 auto CallerEdgeContextsForAlloc =
4120 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4121 if (!RecursiveContextIds.empty())
4122 CallerEdgeContextsForAlloc =
4123 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4124 if (CallerEdgeContextsForAlloc.empty())
4125 continue;
4126
4127 auto CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
4128
4129 // Compute the node callee edge alloc types corresponding to the context ids
4130 // for this caller edge.
4131 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4132 CalleeEdgeAllocTypesForCallerEdge.reserve(n: Node->CalleeEdges.size());
4133 for (auto &CalleeEdge : Node->CalleeEdges)
4134 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4135 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
4136
4137 // Don't clone if doing so will not disambiguate any alloc types amongst
4138 // caller edges (including the callee edges that would be cloned).
4139 // Otherwise we will simply move all edges to the clone.
4140 //
4141 // First check if by cloning we will disambiguate the caller allocation
4142 // type from node's allocation type. Query allocTypeToUse so that we don't
4143 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4144 // neither of these should be None type.
4145 //
4146 // Then check if by cloning node at least one of the callee edges will be
4147 // disambiguated by splitting out different context ids.
4148 //
4149 // However, always do the cloning if this is a backedge, in which case we
4150 // have not yet cloned along this caller edge.
4151 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4152 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4153 if (!CallerEdge->IsBackedge &&
4154 allocTypeToUse(CallerAllocTypeForAlloc) ==
4155 allocTypeToUse(Node->AllocTypes) &&
4156 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4157 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4158 continue;
4159 }
4160
4161 if (CallerEdge->IsBackedge) {
4162 // We should only mark these if cloning recursive contexts, where we
4163 // need to do this deferral.
4164 assert(CloneRecursiveContexts);
4165 DeferredBackedges++;
4166 }
4167
4168 // If this is a backedge, we now do recursive cloning starting from its
4169 // caller since we may have moved unambiguous caller contexts to a clone
4170 // of this Node in a previous iteration of the current loop, giving more
4171 // opportunity for cloning through the backedge. Because we sorted the
4172 // caller edges earlier so that cold caller edges are first, we would have
4173 // visited and cloned this node for any unamibiguously cold non-recursive
4174 // callers before any ambiguous backedge callers. Note that we don't do this
4175 // if the caller is already cloned or visited during cloning (e.g. via a
4176 // different context path from the allocation).
4177 // TODO: Can we do better in the case where the caller was already visited?
4178 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4179 !Visited.count(CallerEdge->Caller)) {
4180 const auto OrigIdCount = CallerEdge->getContextIds().size();
4181 // Now do the recursive cloning of this backedge's caller, which was
4182 // deferred earlier.
4183 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4184 removeNoneTypeCalleeEdges(Node: CallerEdge->Caller);
4185 // See if the recursive call to identifyClones moved the context ids to a
4186 // new edge from this node to a clone of caller, and switch to looking at
4187 // that new edge so that we clone Node for the new caller clone.
4188 bool UpdatedEdge = false;
4189 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4190 for (auto E : Node->CallerEdges) {
4191 // Only interested in clones of the current edges caller.
4192 if (E->Caller->CloneOf != CallerEdge->Caller)
4193 continue;
4194 // See if this edge contains any of the context ids originally on the
4195 // current caller edge.
4196 auto CallerEdgeContextsForAllocNew =
4197 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4198 if (CallerEdgeContextsForAllocNew.empty())
4199 continue;
4200 // Make sure we don't pick a previously existing caller edge of this
4201 // Node, which would be processed on a different iteration of the
4202 // outer loop over the saved CallerEdges.
4203 if (llvm::is_contained(CallerEdges, E))
4204 continue;
4205 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4206 // are updated further below for all cases where we just invoked
4207 // identifyClones recursively.
4208 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4209 CallerEdge = E;
4210 UpdatedEdge = true;
4211 break;
4212 }
4213 }
4214 // If cloning removed this edge (and we didn't update it to a new edge
4215 // above), we're done with this edge. It's possible we moved all of the
4216 // context ids to an existing clone, in which case there's no need to do
4217 // further processing for them.
4218 if (CallerEdge->isRemoved())
4219 continue;
4220
4221 // Now we need to update the information used for the cloning decisions
4222 // further below, as we may have modified edges and their context ids.
4223
4224 // Note if we changed the CallerEdge above we would have already updated
4225 // the context ids.
4226 if (!UpdatedEdge) {
4227 CallerEdgeContextsForAlloc = set_intersection(
4228 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4229 if (CallerEdgeContextsForAlloc.empty())
4230 continue;
4231 }
4232 // Update the other information that depends on the edges and on the now
4233 // updated CallerEdgeContextsForAlloc.
4234 CallerAllocTypeForAlloc = computeAllocType(ContextIds&: CallerEdgeContextsForAlloc);
4235 CalleeEdgeAllocTypesForCallerEdge.clear();
4236 for (auto &CalleeEdge : Node->CalleeEdges) {
4237 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4238 Node1Ids: CalleeEdge->getContextIds(), Node2Ids: CallerEdgeContextsForAlloc));
4239 }
4240 }
4241
4242 // First see if we can use an existing clone. Check each clone and its
4243 // callee edges for matching alloc types.
4244 ContextNode *Clone = nullptr;
4245 for (auto *CurClone : Node->Clones) {
4246 if (allocTypeToUse(CurClone->AllocTypes) !=
4247 allocTypeToUse(CallerAllocTypeForAlloc))
4248 continue;
4249
4250 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4251 hasSingleAllocType(CallerAllocTypeForAlloc);
4252 // The above check should mean that if both have single alloc types that
4253 // they should be equal.
4254 assert(!BothSingleAlloc ||
4255 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4256
4257 // If either both have a single alloc type (which are the same), or if the
4258 // clone's callee edges have the same alloc types as those for the current
4259 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4260 // then we can reuse this clone.
4261 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4262 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4263 Clone = CurClone;
4264 break;
4265 }
4266 }
4267
4268 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4269 if (Clone)
4270 moveEdgeToExistingCalleeClone(Edge: CallerEdge, NewCallee: Clone, /*NewClone=*/false,
4271 ContextIdsToMove: CallerEdgeContextsForAlloc);
4272 else
4273 Clone = moveEdgeToNewCalleeClone(Edge: CallerEdge, ContextIdsToMove: CallerEdgeContextsForAlloc);
4274
4275 // Sanity check that no alloc types on clone or its edges are None.
4276 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4277 }
4278
4279 // We should still have some context ids on the original Node.
4280 assert(!Node->emptyContextIds());
4281
4282 // Sanity check that no alloc types on node or edges are None.
4283 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4284
4285 if (VerifyNodes)
4286 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4287}
4288
4289void ModuleCallsiteContextGraph::updateAllocationCall(
4290 CallInfo &Call, AllocationType AllocType) {
4291 std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocType);
4292 removeAnyExistingAmbiguousAttribute(CB: cast<CallBase>(Val: Call.call()));
4293 auto A = llvm::Attribute::get(Context&: Call.call()->getFunction()->getContext(),
4294 Kind: "memprof", Val: AllocTypeString);
4295 cast<CallBase>(Val: Call.call())->addFnAttr(Attr: A);
4296 OREGetter(Call.call()->getFunction())
4297 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4298 << ore::NV("AllocationCall", Call.call()) << " in clone "
4299 << ore::NV("Caller", Call.call()->getFunction())
4300 << " marked with memprof allocation attribute "
4301 << ore::NV("Attribute", AllocTypeString));
4302}
4303
4304void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4305 AllocationType AllocType) {
4306 auto *AI = cast<AllocInfo *>(Val: Call.call());
4307 assert(AI);
4308 assert(AI->Versions.size() > Call.cloneNo());
4309 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4310}
4311
4312AllocationType
4313ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4314 const auto *CB = cast<CallBase>(Val: Call.call());
4315 if (!CB->getAttributes().hasFnAttr(Kind: "memprof"))
4316 return AllocationType::None;
4317 return CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
4318 ? AllocationType::Cold
4319 : AllocationType::NotCold;
4320}
4321
4322AllocationType
4323IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4324 const auto *AI = cast<AllocInfo *>(Val: Call.call());
4325 assert(AI->Versions.size() > Call.cloneNo());
4326 return (AllocationType)AI->Versions[Call.cloneNo()];
4327}
4328
4329void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4330 FuncInfo CalleeFunc) {
4331 auto *CurF = getCalleeFunc(Call: CallerCall.call());
4332 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4333 if (isMemProfClone(F: *CurF)) {
4334 // If we already assigned this callsite to call a specific non-default
4335 // clone (i.e. not the original function which is clone 0), ensure that we
4336 // aren't trying to now update it to call a different clone, which is
4337 // indicative of a bug in the graph or function assignment.
4338 auto CurCalleeCloneNo = getMemProfCloneNum(F: *CurF);
4339 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4340 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4341 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4342 << "\n");
4343 MismatchedCloneAssignments++;
4344 }
4345 }
4346 if (NewCalleeCloneNo > 0)
4347 cast<CallBase>(Val: CallerCall.call())->setCalledFunction(CalleeFunc.func());
4348 OREGetter(CallerCall.call()->getFunction())
4349 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4350 << ore::NV("Call", CallerCall.call()) << " in clone "
4351 << ore::NV("Caller", CallerCall.call()->getFunction())
4352 << " assigned to call function clone "
4353 << ore::NV("Callee", CalleeFunc.func()));
4354}
4355
4356void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4357 FuncInfo CalleeFunc) {
4358 auto *CI = cast<CallsiteInfo *>(Val: CallerCall.call());
4359 assert(CI &&
4360 "Caller cannot be an allocation which should not have profiled calls");
4361 assert(CI->Clones.size() > CallerCall.cloneNo());
4362 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4363 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4364 // If we already assigned this callsite to call a specific non-default
4365 // clone (i.e. not the original function which is clone 0), ensure that we
4366 // aren't trying to now update it to call a different clone, which is
4367 // indicative of a bug in the graph or function assignment.
4368 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4369 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4370 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4371 << "\n");
4372 MismatchedCloneAssignments++;
4373 }
4374 CurCalleeCloneNo = NewCalleeCloneNo;
4375}
4376
4377// Update the debug information attached to NewFunc to use the clone Name. Note
4378// this needs to be done for both any existing DISubprogram for the definition,
4379// as well as any separate declaration DISubprogram.
4380static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4381 assert(Name == NewFunc->getName());
4382 auto *SP = NewFunc->getSubprogram();
4383 if (!SP)
4384 return;
4385 auto *MDName = MDString::get(Context&: NewFunc->getParent()->getContext(), Str: Name);
4386 SP->replaceLinkageName(LN: MDName);
4387 DISubprogram *Decl = SP->getDeclaration();
4388 if (!Decl)
4389 return;
4390 TempDISubprogram NewDecl = Decl->clone();
4391 NewDecl->replaceLinkageName(LN: MDName);
4392 SP->replaceDeclaration(Decl: MDNode::replaceWithUniqued(N: std::move(NewDecl)));
4393}
4394
4395CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4396 Instruction *>::FuncInfo
4397ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4398 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4399 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4400 // Use existing LLVM facilities for cloning and obtaining Call in clone
4401 ValueToValueMapTy VMap;
4402 auto *NewFunc = CloneFunction(F: Func.func(), VMap);
4403 std::string Name = getMemProfFuncName(Base: Func.func()->getName(), CloneNo);
4404 assert(!Func.func()->getParent()->getFunction(Name));
4405 NewFunc->setName(Name);
4406 updateSubprogramLinkageName(NewFunc, Name);
4407 for (auto &Inst : CallsWithMetadataInFunc) {
4408 // This map always has the initial version in it.
4409 assert(Inst.cloneNo() == 0);
4410 CallMap[Inst] = {cast<Instruction>(Val&: VMap[Inst.call()]), CloneNo};
4411 }
4412 OREGetter(Func.func())
4413 .emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4414 << "created clone " << ore::NV("NewFunction", NewFunc));
4415 return {NewFunc, CloneNo};
4416}
4417
4418CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4419 IndexCall>::FuncInfo
4420IndexCallsiteContextGraph::cloneFunctionForCallsite(
4421 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4422 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4423 // Check how many clones we have of Call (and therefore function).
4424 // The next clone number is the current size of versions array.
4425 // Confirm this matches the CloneNo provided by the caller, which is based on
4426 // the number of function clones we have.
4427 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4428 ? cast<AllocInfo *>(Call.call())->Versions.size()
4429 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4430 // Walk all the instructions in this function. Create a new version for
4431 // each (by adding an entry to the Versions/Clones summary array), and copy
4432 // over the version being called for the function clone being cloned here.
4433 // Additionally, add an entry to the CallMap for the new function clone,
4434 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4435 // to the new call clone.
4436 for (auto &Inst : CallsWithMetadataInFunc) {
4437 // This map always has the initial version in it.
4438 assert(Inst.cloneNo() == 0);
4439 if (auto *AI = dyn_cast<AllocInfo *>(Val: Inst.call())) {
4440 assert(AI->Versions.size() == CloneNo);
4441 // We assign the allocation type later (in updateAllocationCall), just add
4442 // an entry for it here.
4443 AI->Versions.push_back(Elt: 0);
4444 } else {
4445 auto *CI = cast<CallsiteInfo *>(Val: Inst.call());
4446 assert(CI && CI->Clones.size() == CloneNo);
4447 // We assign the clone number later (in updateCall), just add an entry for
4448 // it here.
4449 CI->Clones.push_back(Elt: 0);
4450 }
4451 CallMap[Inst] = {Inst.call(), CloneNo};
4452 }
4453 return {Func.func(), CloneNo};
4454}
4455
4456// We perform cloning for each allocation node separately. However, this
4457// sometimes results in a situation where the same node calls multiple
4458// clones of the same callee, created for different allocations. This
4459// causes issues when assigning functions to these clones, as each node can
4460// in reality only call a single callee clone.
4461//
4462// To address this, before assigning functions, merge callee clone nodes as
4463// needed using a post order traversal from the allocations. We attempt to
4464// use existing clones as the merge node when legal, and to share them
4465// among callers with the same properties (callers calling the same set of
4466// callee clone nodes for the same allocations).
4467//
4468// Without this fix, in some cases incorrect function assignment will lead
4469// to calling the wrong allocation clone.
4470template <typename DerivedCCG, typename FuncTy, typename CallTy>
4471void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4472 if (!MergeClones)
4473 return;
4474
4475 // Generate a map from context id to the associated allocation node for use
4476 // when merging clones.
4477 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4478 for (auto &Entry : AllocationCallToContextNodeMap) {
4479 auto *Node = Entry.second;
4480 for (auto Id : Node->getContextIds())
4481 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4482 for (auto *Clone : Node->Clones) {
4483 for (auto Id : Clone->getContextIds())
4484 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4485 }
4486 }
4487
4488 // Post order traversal starting from allocations to ensure each callsite
4489 // calls a single clone of its callee. Callee nodes that are clones of each
4490 // other are merged (via new merge nodes if needed) to achieve this.
4491 DenseSet<const ContextNode *> Visited;
4492 for (auto &Entry : AllocationCallToContextNodeMap) {
4493 auto *Node = Entry.second;
4494
4495 mergeClones(Node, Visited, ContextIdToAllocationNode);
4496
4497 // Make a copy so the recursive post order traversal that may create new
4498 // clones doesn't mess up iteration. Note that the recursive traversal
4499 // itself does not call mergeClones on any of these nodes, which are all
4500 // (clones of) allocations.
4501 auto Clones = Node->Clones;
4502 for (auto *Clone : Clones)
4503 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4504 }
4505
4506 if (DumpCCG) {
4507 dbgs() << "CCG after merging:\n";
4508 dbgs() << *this;
4509 }
4510 if (ExportToDot)
4511 exportToDot(Label: "aftermerge");
4512
4513 if (VerifyCCG) {
4514 check();
4515 }
4516}
4517
// Recursive helper for above mergeClones method.
//
// Performs a post order traversal over the caller subgraph reachable from
// Node, then invokes mergeNodeCalleeClones on Node once all of its callers
// have been handled. Each node is processed at most once (tracked via
// Visited).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
    DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;

  // Iteratively perform merging on this node to handle new caller nodes created
  // during the recursive traversal. We could do something more elegant such as
  // maintain a worklist, but this is a simple approach that doesn't cause a
  // measurable compile time effect, as most nodes don't have many caller
  // edges to check.
  bool FoundUnvisited = true;
  unsigned Iters = 0;
  while (FoundUnvisited) {
    Iters++;
    FoundUnvisited = false;
    // Make a copy since the recursive call may move a caller edge to a new
    // callee, messing up the iterator.
    auto CallerEdges = Node->CallerEdges;
    for (auto CallerEdge : CallerEdges) {
      // Skip any caller edge moved onto a different callee during recursion.
      if (CallerEdge->Callee != Node)
        continue;
      // If we found an unvisited caller, note that we should check the caller
      // edges again as mergeClones may add or change caller nodes.
      if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
        FoundUnvisited = true;
      mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
    }
  }

  // Record statistics about the fixed-point iteration behavior above.
  TotalMergeInvokes++;
  TotalMergeIters += Iters;
  if (Iters > MaxMergeIters)
    MaxMergeIters = Iters;

  // Merge for this node after we handle its callers.
  mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
}
4560
4561template <typename DerivedCCG, typename FuncTy, typename CallTy>
4562void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4563 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4564 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4565 // Ignore Node if we moved all of its contexts to clones.
4566 if (Node->emptyContextIds())
4567 return;
4568
4569 // First identify groups of clones among Node's callee edges, by building
4570 // a map from each callee base node to the associated callee edges from Node.
4571 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4572 OrigNodeToCloneEdges;
4573 for (const auto &E : Node->CalleeEdges) {
4574 auto *Callee = E->Callee;
4575 if (!Callee->CloneOf && Callee->Clones.empty())
4576 continue;
4577 ContextNode *Base = Callee->getOrigNode();
4578 OrigNodeToCloneEdges[Base].push_back(E);
4579 }
4580
4581 // Helper for callee edge sorting below. Return true if A's callee has fewer
4582 // caller edges than B, or if A is a clone and B is not, or if A's first
4583 // context id is smaller than B's.
4584 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4585 const std::shared_ptr<ContextEdge> &B) {
4586 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4587 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4588 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4589 return true;
4590 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4591 return false;
4592 // Use the first context id for each edge as a
4593 // tie-breaker.
4594 return *A->ContextIds.begin() < *B->ContextIds.begin();
4595 };
4596
4597 // Process each set of callee clones called by Node, performing the needed
4598 // merging.
4599 for (auto Entry : OrigNodeToCloneEdges) {
4600 // CalleeEdges is the set of edges from Node reaching callees that are
4601 // mutual clones of each other.
4602 auto &CalleeEdges = Entry.second;
4603 auto NumCalleeClones = CalleeEdges.size();
4604 // A single edge means there is no merging needed.
4605 if (NumCalleeClones == 1)
4606 continue;
4607 // Sort the CalleeEdges calling this group of clones in ascending order of
4608 // their caller edge counts, putting the original non-clone node first in
4609 // cases of a tie. This simplifies finding an existing node to use as the
4610 // merge node.
4611 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4612
4613 /// Find other callers of the given set of callee edges that can
4614 /// share the same callee merge node. See the comments at this method
4615 /// definition for details.
4616 DenseSet<ContextNode *> OtherCallersToShareMerge;
4617 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4618 OtherCallersToShareMerge);
4619
4620 // Now do the actual merging. Identify existing or create a new MergeNode
4621 // during the first iteration. Move each callee over, along with edges from
4622 // other callers we've determined above can share the same merge node.
4623 ContextNode *MergeNode = nullptr;
4624 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4625 for (auto CalleeEdge : CalleeEdges) {
4626 auto *OrigCallee = CalleeEdge->Callee;
4627 // If we don't have a MergeNode yet (only happens on the first iteration,
4628 // as a new one will be created when we go to move the first callee edge
4629 // over as needed), see if we can use this callee.
4630 if (!MergeNode) {
4631 // If there are no other callers, simply use this callee.
4632 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4633 MergeNode = OrigCallee;
4634 NonNewMergedNodes++;
4635 continue;
4636 }
4637 // Otherwise, if we have identified other caller nodes that can share
4638 // the merge node with Node, see if all of OrigCallee's callers are
4639 // going to share the same merge node. In that case we can use callee
4640 // (since all of its callers would move to the new merge node).
4641 if (!OtherCallersToShareMerge.empty()) {
4642 bool MoveAllCallerEdges = true;
4643 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4644 if (CalleeCallerE == CalleeEdge)
4645 continue;
4646 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4647 MoveAllCallerEdges = false;
4648 break;
4649 }
4650 }
4651 // If we are going to move all callers over, we can use this callee as
4652 // the MergeNode.
4653 if (MoveAllCallerEdges) {
4654 MergeNode = OrigCallee;
4655 NonNewMergedNodes++;
4656 continue;
4657 }
4658 }
4659 }
4660 // Move this callee edge, creating a new merge node if necessary.
4661 if (MergeNode) {
4662 assert(MergeNode != OrigCallee);
4663 moveEdgeToExistingCalleeClone(Edge: CalleeEdge, NewCallee: MergeNode,
4664 /*NewClone*/ false);
4665 } else {
4666 MergeNode = moveEdgeToNewCalleeClone(Edge: CalleeEdge);
4667 NewMergedNodes++;
4668 }
4669 // Now move all identified edges from other callers over to the merge node
4670 // as well.
4671 if (!OtherCallersToShareMerge.empty()) {
4672 // Make and iterate over a copy of OrigCallee's caller edges because
4673 // some of these will be moved off of the OrigCallee and that would mess
4674 // up the iteration from OrigCallee.
4675 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4676 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4677 if (CalleeCallerE == CalleeEdge)
4678 continue;
4679 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4680 continue;
4681 CallerToMoveCount[CalleeCallerE->Caller]++;
4682 moveEdgeToExistingCalleeClone(Edge: CalleeCallerE, NewCallee: MergeNode,
4683 /*NewClone*/ false);
4684 }
4685 }
4686 removeNoneTypeCalleeEdges(Node: OrigCallee);
4687 removeNoneTypeCalleeEdges(Node: MergeNode);
4688 }
4689 }
4690}
4691
// Look for other nodes that have edges to the same set of callee
// clones as the current Node. Those can share the eventual merge node
// (reducing cloning and binary size overhead) iff:
// - they have edges to the same set of callee clones
// - each callee edge reaches a subset of the same allocations as Node's
//   corresponding edge to the same callee clone.
// The second requirement is to ensure that we don't undo any of the
// necessary cloning to distinguish contexts with different allocation
// behavior.
// FIXME: This is somewhat conservative, as we really just need to ensure
// that they don't reach the same allocations as contexts on edges from Node
// going to any of the *other* callee clones being merged. However, that
// requires more tracking and checking to get right.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    findOtherCallersToShareMerge(
        ContextNode *Node,
        std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
        DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
        DenseSet<ContextNode *> &OtherCallersToShareMerge) {
  auto NumCalleeClones = CalleeEdges.size();
  // This map counts how many edges to the same callee clone exist for other
  // caller nodes of each callee clone.
  DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
  // Counts the number of other caller nodes that have edges to all callee
  // clones that don't violate the allocation context checking.
  unsigned PossibleOtherCallerNodes = 0;

  // We only need to look at other Caller nodes if the first callee edge has
  // multiple callers (recall they are sorted in ascending order above).
  if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
    return;

  // For each callee edge:
  // - Collect the count of other caller nodes calling the same callees.
  // - Collect the alloc nodes reached by contexts on each callee edge.
  DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // For each other caller of the same callee, increment the count of
    // edges reaching the same callee clone.
    for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
      // Node's own edge to this callee is not an "other" caller; it must be
      // the same edge we are currently walking.
      if (CalleeCallerEdges->Caller == Node) {
        assert(CalleeCallerEdges == CalleeEdge);
        continue;
      }
      OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
      // If this caller edge now reaches all of the same callee clones,
      // increment the count of candidate other caller nodes.
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
          NumCalleeClones)
        PossibleOtherCallerNodes++;
    }
    // Collect the alloc nodes reached by contexts on each callee edge, for
    // later analysis.
    for (auto Id : CalleeEdge->getContextIds()) {
      auto *Alloc = ContextIdToAllocationNode.lookup(Id);
      if (!Alloc) {
        // FIXME: unclear why this happens occasionally, presumably
        // imperfect graph updates possibly with recursion.
        MissingAllocForContextId++;
        continue;
      }
      CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
    }
  }

  // Now walk the callee edges again, and make sure that for each candidate
  // caller node all of its edges to the callees reach the same allocs (or
  // a subset) as those along the corresponding callee edge from Node.
  for (auto CalleeEdge : CalleeEdges) {
    assert(CalleeEdge->Callee->CallerEdges.size() > 1);
    // Stop if we do not have any (more) candidate other caller nodes.
    if (!PossibleOtherCallerNodes)
      break;
    auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
    // Check each other caller of this callee clone.
    for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
      // Not interested in the callee edge from Node itself.
      if (CalleeCallerE == CalleeEdge)
        continue;
      // Skip any callers that didn't have callee edges to all the same
      // callee clones.
      if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
          NumCalleeClones)
        continue;
      // Make sure that each context along edge from candidate caller node
      // reaches an allocation also reached by this callee edge from Node.
      for (auto Id : CalleeCallerE->getContextIds()) {
        auto *Alloc = ContextIdToAllocationNode.lookup(Id);
        // Contexts with no known allocation were skipped above as well.
        if (!Alloc)
          continue;
        // If not, simply reset the map entry to 0 so caller is ignored, and
        // reduce the count of candidate other caller nodes.
        if (!CurCalleeAllocNodes.contains(Alloc)) {
          OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
          PossibleOtherCallerNodes--;
          break;
        }
      }
    }
  }

  if (!PossibleOtherCallerNodes)
    return;

  // Build the set of other caller nodes that can use the same callee merge
  // node.
  for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
    if (Count != NumCalleeClones)
      continue;
    OtherCallersToShareMerge.insert(OtherCaller);
  }
}
4806
4807// This method assigns cloned callsites to functions, cloning the functions as
4808// needed. The assignment is greedy and proceeds roughly as follows:
4809//
4810// For each function Func:
4811// For each call with graph Node having clones:
4812// Initialize ClonesWorklist to Node and its clones
4813// Initialize NodeCloneCount to 0
4814// While ClonesWorklist is not empty:
4815// Clone = pop front ClonesWorklist
4816// NodeCloneCount++
4817// If Func has been cloned less than NodeCloneCount times:
4818// If NodeCloneCount is 1:
4819// Assign Clone to original Func
4820// Continue
4821// Create a new function clone
4822// If other callers not assigned to call a function clone yet:
4823// Assign them to call new function clone
4824// Continue
4825// Assign any other caller calling the cloned version to new clone
4826//
4827// For each caller of Clone:
4828// If caller is assigned to call a specific function clone:
4829// If we cannot assign Clone to that function clone:
4830// Create new callsite Clone NewClone
4831// Add NewClone to ClonesWorklist
4832// Continue
4833// Assign Clone to existing caller's called function clone
4834// Else:
4835// If Clone not already assigned to a function clone:
4836// Assign to first function clone without assignment
4837// Assign caller to selected function clone
4838// For each call with graph Node having clones:
4839// If number func clones > number call's callsite Node clones:
4840// Record func CallInfo clones without Node clone in UnassignedCallClones
4841// For callsite Nodes in DFS order from allocations:
4842// If IsAllocation:
4843// Update allocation with alloc type
4844// Else:
//      For Call, all MatchingCalls, and associated UnassignedCallClones:
4846// Update call to call recorded callee clone
4847//
4848template <typename DerivedCCG, typename FuncTy, typename CallTy>
4849bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4850 bool Changed = false;
4851
4852 mergeClones();
4853
4854 // Keep track of the assignment of nodes (callsites) to function clones they
4855 // call.
4856 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4857
4858 // Update caller node to call function version CalleeFunc, by recording the
4859 // assignment in CallsiteToCalleeFuncCloneMap.
4860 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4861 const FuncInfo &CalleeFunc) {
4862 assert(Caller->hasCall());
4863 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4864 };
4865
4866 // Information for a single clone of this Func.
4867 struct FuncCloneInfo {
4868 // The function clone.
4869 FuncInfo FuncClone;
4870 // Remappings of each call of interest (from original uncloned call to the
4871 // corresponding cloned call in this function clone).
4872 DenseMap<CallInfo, CallInfo> CallMap;
4873 };
4874
4875 // Map to keep track of information needed to update calls in function clones
4876 // when their corresponding callsite node was not itself cloned for that
4877 // function clone. Because of call context pruning (i.e. we only keep as much
4878 // caller information as needed to distinguish hot vs cold), we may not have
4879 // caller edges coming to each callsite node from all possible function
4880 // callers. A function clone may get created for other callsites in the
4881 // function for which there are caller edges that were not pruned. Any other
  // callsites in that function clone, which were not themselves cloned for
4883 // that function clone, should get updated the same way as the corresponding
4884 // callsite in the original function (which may call a clone of its callee).
4885 //
4886 // We build this map after completing function cloning for each function, so
4887 // that we can record the information from its call maps before they are
4888 // destructed. The map will be used as we update calls to update any still
4889 // unassigned call clones. Note that we may create new node clones as we clone
4890 // other functions, so later on we check which node clones were still not
4891 // created. To this end, the inner map is a map from function clone number to
4892 // the list of calls cloned for that function (can be more than one due to the
4893 // Node's MatchingCalls array).
4894 //
4895 // The alternative is creating new callsite clone nodes below as we clone the
  // function, but that is trickier to get right and likely more overhead.
4897 //
4898 // Inner map is a std::map so sorted by key (clone number), in order to get
4899 // ordered remarks in the full LTO case.
4900 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4901 UnassignedCallClones;
4902
4903 // Walk all functions for which we saw calls with memprof metadata, and handle
4904 // cloning for each of its calls.
4905 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4906 FuncInfo OrigFunc(Func);
4907 // Map from each clone number of OrigFunc to information about that function
4908 // clone (the function clone FuncInfo and call remappings). The index into
4909 // the vector is the clone number, as function clones are created and
4910 // numbered sequentially.
4911 std::vector<FuncCloneInfo> FuncCloneInfos;
4912 for (auto &Call : CallsWithMetadata) {
4913 ContextNode *Node = getNodeForInst(C: Call);
4914 // Skip call if we do not have a node for it (all uses of its stack ids
4915 // were either on inlined chains or pruned from the MIBs), or if we did
4916 // not create any clones for it.
4917 if (!Node || Node->Clones.empty())
4918 continue;
4919 assert(Node->hasCall() &&
4920 "Not having a call should have prevented cloning");
4921
4922 // Track the assignment of function clones to clones of the current
4923 // callsite Node being handled.
4924 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4925
4926 // Assign callsite version CallsiteClone to function version FuncClone,
4927 // and also assign (possibly cloned) Call to CallsiteClone.
4928 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4929 CallInfo &Call,
4930 ContextNode *CallsiteClone,
4931 bool IsAlloc) {
4932 // Record the clone of callsite node assigned to this function clone.
4933 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4934
4935 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4936 DenseMap<CallInfo, CallInfo> &CallMap =
4937 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4938 CallInfo CallClone(Call);
4939 if (auto It = CallMap.find(Call); It != CallMap.end())
4940 CallClone = It->second;
4941 CallsiteClone->setCall(CallClone);
4942 // Need to do the same for all matching calls.
4943 for (auto &MatchingCall : Node->MatchingCalls) {
4944 CallInfo CallClone(MatchingCall);
4945 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4946 CallClone = It->second;
4947 // Updates the call in the list.
4948 MatchingCall = CallClone;
4949 }
4950 };
4951
4952 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4953 // performs the necessary fixups (removing none type edges, and
4954 // importantly, propagating any function call assignment of the original
4955 // node to the new clone).
4956 auto MoveEdgeToNewCalleeCloneAndSetUp =
4957 [&](const std::shared_ptr<ContextEdge> &Edge) {
4958 ContextNode *OrigCallee = Edge->Callee;
4959 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4960 removeNoneTypeCalleeEdges(Node: NewClone);
4961 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4962 // If the original Callee was already assigned to call a specific
4963 // function version, make sure its new clone is assigned to call
4964 // that same function clone.
4965 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4966 RecordCalleeFuncOfCallsite(
4967 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4968 return NewClone;
4969 };
4970
4971 // Keep track of the clones of callsite Node that need to be assigned to
4972 // function clones. This list may be expanded in the loop body below if we
4973 // find additional cloning is required.
4974 std::deque<ContextNode *> ClonesWorklist;
4975 // Ignore original Node if we moved all of its contexts to clones.
4976 if (!Node->emptyContextIds())
4977 ClonesWorklist.push_back(Node);
4978 llvm::append_range(ClonesWorklist, Node->Clones);
4979
4980 // Now walk through all of the clones of this callsite Node that we need,
4981 // and determine the assignment to a corresponding clone of the current
4982 // function (creating new function clones as needed).
4983 unsigned NodeCloneCount = 0;
4984 while (!ClonesWorklist.empty()) {
4985 ContextNode *Clone = ClonesWorklist.front();
4986 ClonesWorklist.pop_front();
4987 NodeCloneCount++;
4988 if (VerifyNodes)
4989 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4990
4991 // Need to create a new function clone if we have more callsite clones
4992 // than existing function clones, which would have been assigned to an
4993 // earlier clone in the list (we assign callsite clones to function
4994 // clones greedily).
4995 if (FuncCloneInfos.size() < NodeCloneCount) {
4996 // If this is the first callsite copy, assign to original function.
4997 if (NodeCloneCount == 1) {
4998 // Since FuncCloneInfos is empty in this case, no clones have
4999 // been created for this function yet, and no callers should have
5000 // been assigned a function clone for this callee node yet.
5001 assert(llvm::none_of(
5002 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5003 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5004 }));
5005 // Initialize with empty call map, assign Clone to original function
5006 // and its callers, and skip to the next clone.
5007 FuncCloneInfos.push_back(
5008 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
5009 AssignCallsiteCloneToFuncClone(
5010 OrigFunc, Call, Clone,
5011 AllocationCallToContextNodeMap.count(Call));
5012 for (auto &CE : Clone->CallerEdges) {
5013 // Ignore any caller that does not have a recorded callsite Call.
5014 if (!CE->Caller->hasCall())
5015 continue;
5016 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
5017 }
5018 continue;
5019 }
5020
5021 // First locate which copy of OrigFunc to clone again. If a caller
5022 // of this callsite clone was already assigned to call a particular
5023 // function clone, we need to redirect all of those callers to the
5024 // new function clone, and update their other callees within this
5025 // function.
5026 FuncInfo PreviousAssignedFuncClone;
5027 auto EI = llvm::find_if(
5028 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5029 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5030 });
5031 bool CallerAssignedToCloneOfFunc = false;
5032 if (EI != Clone->CallerEdges.end()) {
5033 const std::shared_ptr<ContextEdge> &Edge = *EI;
5034 PreviousAssignedFuncClone =
5035 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5036 CallerAssignedToCloneOfFunc = true;
5037 }
5038
5039 // Clone function and save it along with the CallInfo map created
5040 // during cloning in the FuncCloneInfos.
5041 DenseMap<CallInfo, CallInfo> NewCallMap;
5042 unsigned CloneNo = FuncCloneInfos.size();
5043 assert(CloneNo > 0 && "Clone 0 is the original function, which "
5044 "should already exist in the map");
5045 FuncInfo NewFuncClone = cloneFunctionForCallsite(
5046 Func&: OrigFunc, Call, CallMap&: NewCallMap, CallsWithMetadataInFunc&: CallsWithMetadata, CloneNo);
5047 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
5048 FunctionClonesAnalysis++;
5049 Changed = true;
5050
5051 // If no caller callsites were already assigned to a clone of this
5052 // function, we can simply assign this clone to the new func clone
5053 // and update all callers to it, then skip to the next clone.
5054 if (!CallerAssignedToCloneOfFunc) {
5055 AssignCallsiteCloneToFuncClone(
5056 NewFuncClone, Call, Clone,
5057 AllocationCallToContextNodeMap.count(Call));
5058 for (auto &CE : Clone->CallerEdges) {
5059 // Ignore any caller that does not have a recorded callsite Call.
5060 if (!CE->Caller->hasCall())
5061 continue;
5062 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5063 }
5064 continue;
5065 }
5066
5067 // We may need to do additional node cloning in this case.
5068 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5069 // that were previously assigned to call PreviousAssignedFuncClone,
5070 // to record that they now call NewFuncClone.
5071 // The none type edge removal may remove some of this Clone's caller
5072 // edges, if it is reached via another of its caller's callees.
5073 // Iterate over a copy and skip any that were removed.
5074 auto CallerEdges = Clone->CallerEdges;
5075 for (auto CE : CallerEdges) {
5076 // Skip any that have been removed on an earlier iteration.
5077 if (CE->isRemoved()) {
5078 assert(!is_contained(Clone->CallerEdges, CE));
5079 continue;
5080 }
5081 assert(CE);
5082 // Ignore any caller that does not have a recorded callsite Call.
5083 if (!CE->Caller->hasCall())
5084 continue;
5085
5086 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5087 // We subsequently fall through to later handling that
5088 // will perform any additional cloning required for
5089 // callers that were calling other function clones.
5090 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5091 PreviousAssignedFuncClone)
5092 continue;
5093
5094 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5095
5096 // If we are cloning a function that was already assigned to some
5097 // callers, then essentially we are creating new callsite clones
5098 // of the other callsites in that function that are reached by those
5099 // callers. Clone the other callees of the current callsite's caller
5100 // that were already assigned to PreviousAssignedFuncClone
5101 // accordingly. This is important since we subsequently update the
5102 // calls from the nodes in the graph and their assignments to callee
5103 // functions recorded in CallsiteToCalleeFuncCloneMap.
5104 // The none type edge removal may remove some of this caller's
5105 // callee edges, if it is reached via another of its callees.
5106 // Iterate over a copy and skip any that were removed.
5107 auto CalleeEdges = CE->Caller->CalleeEdges;
5108 for (auto CalleeEdge : CalleeEdges) {
5109 // Skip any that have been removed on an earlier iteration when
5110 // cleaning up newly None type callee edges.
5111 if (CalleeEdge->isRemoved()) {
5112 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5113 continue;
5114 }
5115 assert(CalleeEdge);
5116 ContextNode *Callee = CalleeEdge->Callee;
5117 // Skip the current callsite, we are looking for other
5118 // callsites Caller calls, as well as any that does not have a
5119 // recorded callsite Call.
5120 if (Callee == Clone || !Callee->hasCall())
5121 continue;
5122 // Skip direct recursive calls. We don't need/want to clone the
5123 // caller node again, and this loop will not behave as expected if
5124 // we tried.
5125 if (Callee == CalleeEdge->Caller)
5126 continue;
5127 ContextNode *NewClone =
5128 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5129 // Moving the edge may have resulted in some none type
5130 // callee edges on the original Callee.
5131 removeNoneTypeCalleeEdges(Node: Callee);
5132 // Update NewClone with the new Call clone of this callsite's Call
5133 // created for the new function clone created earlier.
5134 // Recall that we have already ensured when building the graph
5135 // that each caller can only call callsites within the same
5136 // function, so we are guaranteed that Callee Call is in the
5137 // current OrigFunc.
5138 // CallMap is set up as indexed by original Call at clone 0.
5139 CallInfo OrigCall(Callee->getOrigNode()->Call);
5140 OrigCall.setCloneNo(0);
5141 DenseMap<CallInfo, CallInfo> &CallMap =
5142 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5143 assert(CallMap.count(OrigCall));
5144 CallInfo NewCall(CallMap[OrigCall]);
5145 assert(NewCall);
5146 NewClone->setCall(NewCall);
5147 // Need to do the same for all matching calls.
5148 for (auto &MatchingCall : NewClone->MatchingCalls) {
5149 CallInfo OrigMatchingCall(MatchingCall);
5150 OrigMatchingCall.setCloneNo(0);
5151 assert(CallMap.count(OrigMatchingCall));
5152 CallInfo NewCall(CallMap[OrigMatchingCall]);
5153 assert(NewCall);
5154 // Updates the call in the list.
5155 MatchingCall = NewCall;
5156 }
5157 }
5158 }
5159 // Fall through to handling below to perform the recording of the
5160 // function for this callsite clone. This enables handling of cases
5161 // where the callers were assigned to different clones of a function.
5162 }
5163
5164 auto FindFirstAvailFuncClone = [&]() {
5165 // Find first function in FuncCloneInfos without an assigned
5166 // clone of this callsite Node. We should always have one
5167 // available at this point due to the earlier cloning when the
5168 // FuncCloneInfos size was smaller than the clone number.
5169 for (auto &CF : FuncCloneInfos) {
5170 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5171 return CF.FuncClone;
5172 }
5173 llvm_unreachable(
5174 "Expected an available func clone for this callsite clone");
5175 };
5176
5177 // See if we can use existing function clone. Walk through
5178 // all caller edges to see if any have already been assigned to
5179 // a clone of this callsite's function. If we can use it, do so. If not,
5180 // because that function clone is already assigned to a different clone
5181 // of this callsite, then we need to clone again.
5182 // Basically, this checking is needed to handle the case where different
5183 // caller functions/callsites may need versions of this function
5184 // containing different mixes of callsite clones across the different
5185 // callsites within the function. If that happens, we need to create
5186 // additional function clones to handle the various combinations.
5187 //
5188 // Keep track of any new clones of this callsite created by the
5189 // following loop, as well as any existing clone that we decided to
5190 // assign this clone to.
5191 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5192 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5193 // Iterate over a copy of Clone's caller edges, since we may need to
5194 // remove edges in the moveEdgeTo* methods, and this simplifies the
5195 // handling and makes it less error-prone.
5196 auto CloneCallerEdges = Clone->CallerEdges;
5197 for (auto &Edge : CloneCallerEdges) {
5198 // Skip removed edges (due to direct recursive edges updated when
5199 // updating callee edges when moving an edge and subsequently
5200 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5201 if (Edge->isRemoved())
5202 continue;
5203 // Ignore any caller that does not have a recorded callsite Call.
5204 if (!Edge->Caller->hasCall())
5205 continue;
5206 // If this caller already assigned to call a version of OrigFunc, need
5207 // to ensure we can assign this callsite clone to that function clone.
5208 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5209 FuncInfo FuncCloneCalledByCaller =
5210 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5211 // First we need to confirm that this function clone is available
5212 // for use by this callsite node clone.
5213 //
5214 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5215 // its callsite clones, one of those callsite clones X could have
5216 // been assigned to the same function clone called by Edge's caller
5217 // - if Edge's caller calls another callsite within Node's original
5218 // function, and that callsite has another caller reaching clone X.
5219 // We need to clone Node again in this case.
5220 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5221 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5222 Clone) ||
5223 // Detect when we have multiple callers of this callsite that
5224 // have already been assigned to specific, and different, clones
5225 // of OrigFunc (due to other unrelated callsites in Func they
5226 // reach via call contexts). Is this Clone of callsite Node
5227 // assigned to a different clone of OrigFunc? If so, clone Node
5228 // again.
5229 (FuncCloneAssignedToCurCallsiteClone &&
5230 FuncCloneAssignedToCurCallsiteClone !=
5231 FuncCloneCalledByCaller)) {
5232 // We need to use a different newly created callsite clone, in
5233 // order to assign it to another new function clone on a
5234 // subsequent iteration over the Clones array (adjusted below).
5235 // Note we specifically do not reset the
5236 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5237 // when this new clone is processed later we know which version of
5238 // the function to copy (so that other callsite clones we have
5239 // assigned to that function clone are properly cloned over). See
5240 // comments in the function cloning handling earlier.
5241
5242 // Check if we already have cloned this callsite again while
5243 // walking through caller edges, for a caller calling the same
5244 // function clone. If so, we can move this edge to that new clone
5245 // rather than creating yet another new clone.
5246 if (FuncCloneToNewCallsiteCloneMap.count(
5247 FuncCloneCalledByCaller)) {
5248 ContextNode *NewClone =
5249 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5250 moveEdgeToExistingCalleeClone(Edge, NewCallee: NewClone);
5251 // Cleanup any none type edges cloned over.
5252 removeNoneTypeCalleeEdges(Node: NewClone);
5253 } else {
5254 // Create a new callsite clone.
5255 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5256 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5257 NewClone;
5258 // Add to list of clones and process later.
5259 ClonesWorklist.push_back(NewClone);
5260 }
5261 // Moving the caller edge may have resulted in some none type
5262 // callee edges.
5263 removeNoneTypeCalleeEdges(Node: Clone);
5264 // We will handle the newly created callsite clone in a subsequent
5265 // iteration over this Node's Clones.
5266 continue;
5267 }
5268
5269 // Otherwise, we can use the function clone already assigned to this
5270 // caller.
5271 if (!FuncCloneAssignedToCurCallsiteClone) {
5272 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5273 // Assign Clone to FuncCloneCalledByCaller
5274 AssignCallsiteCloneToFuncClone(
5275 FuncCloneCalledByCaller, Call, Clone,
5276 AllocationCallToContextNodeMap.count(Call));
5277 } else
5278 // Don't need to do anything - callsite is already calling this
5279 // function clone.
5280 assert(FuncCloneAssignedToCurCallsiteClone ==
5281 FuncCloneCalledByCaller);
5282
5283 } else {
5284 // We have not already assigned this caller to a version of
5285 // OrigFunc. Do the assignment now.
5286
5287 // First check if we have already assigned this callsite clone to a
5288 // clone of OrigFunc for another caller during this iteration over
5289 // its caller edges.
5290 if (!FuncCloneAssignedToCurCallsiteClone) {
5291 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5292 assert(FuncCloneAssignedToCurCallsiteClone);
5293 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5294 AssignCallsiteCloneToFuncClone(
5295 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5296 AllocationCallToContextNodeMap.count(Call));
5297 } else
5298 assert(FuncCloneToCurNodeCloneMap
5299 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5300 // Update callers to record function version called.
5301 RecordCalleeFuncOfCallsite(Edge->Caller,
5302 FuncCloneAssignedToCurCallsiteClone);
5303 }
5304 }
5305 // If we didn't assign a function clone to this callsite clone yet, e.g.
5306 // none of its callers has a non-null call, do the assignment here.
5307 // We want to ensure that every callsite clone is assigned to some
5308 // function clone, so that the call updates below work as expected.
5309 // In particular if this is the original callsite, we want to ensure it
5310 // is assigned to the original function, otherwise the original function
5311 // will appear available for assignment to other callsite clones,
5312 // leading to unintended effects. For one, the unknown and not updated
5313 // callers will call into cloned paths leading to the wrong hints,
5314 // because they still call the original function (clone 0). Also,
5315 // because all callsites start out as being clone 0 by default, we can't
5316 // easily distinguish between callsites explicitly assigned to clone 0
5317 // vs those never assigned, which can lead to multiple updates of the
5318 // calls when invoking updateCall below, with mismatched clone values.
5319 // TODO: Add a flag to the callsite nodes or some other mechanism to
5320 // better distinguish and identify callsite clones that are not getting
5321 // assigned to function clones as expected.
5322 if (!FuncCloneAssignedToCurCallsiteClone) {
5323 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5324 assert(FuncCloneAssignedToCurCallsiteClone &&
5325 "No available func clone for this callsite clone");
5326 AssignCallsiteCloneToFuncClone(
5327 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5328 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5329 }
5330 }
5331 if (VerifyCCG) {
5332 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5333 for (const auto &PE : Node->CalleeEdges)
5334 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5335 for (const auto &CE : Node->CallerEdges)
5336 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5337 for (auto *Clone : Node->Clones) {
5338 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5339 for (const auto &PE : Clone->CalleeEdges)
5340 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5341 for (const auto &CE : Clone->CallerEdges)
5342 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5343 }
5344 }
5345 }
5346
5347 if (FuncCloneInfos.size() < 2)
5348 continue;
5349
5350 // In this case there is more than just the original function copy.
5351 // Record call clones of any callsite nodes in the function that did not
5352 // themselves get cloned for all of the function clones.
5353 for (auto &Call : CallsWithMetadata) {
5354 ContextNode *Node = getNodeForInst(C: Call);
5355 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5356 continue;
5357 // If Node has enough clones already to cover all function clones, we can
5358 // skip it. Need to add one for the original copy.
5359 // Use >= in case there were clones that were skipped due to having empty
5360 // context ids
5361 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5362 continue;
5363 // First collect all function clones we cloned this callsite node for.
5364 // They may not be sequential due to empty clones e.g.
5365 DenseSet<unsigned> NodeCallClones;
5366 for (auto *C : Node->Clones)
5367 NodeCallClones.insert(C->Call.cloneNo());
5368 unsigned I = 0;
5369 // Now check all the function clones.
5370 for (auto &FC : FuncCloneInfos) {
5371 // Function clones should be sequential.
5372 assert(FC.FuncClone.cloneNo() == I);
5373 // Skip the first clone which got the original call.
5374 // Also skip any other clones created for this Node.
5375 if (++I == 1 || NodeCallClones.contains(V: I)) {
5376 continue;
5377 }
5378 // Record the call clones created for this callsite in this function
5379 // clone.
5380 auto &CallVector = UnassignedCallClones[Node][I];
5381 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5382 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5383 CallInfo CallClone = It->second;
5384 CallVector.push_back(CallClone);
5385 } else {
5386 // All but the original clone (skipped earlier) should have an entry
5387 // for all calls.
5388 assert(false && "Expected to find call in CallMap");
5389 }
5390 // Need to do the same for all matching calls.
5391 for (auto &MatchingCall : Node->MatchingCalls) {
5392 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5393 CallInfo CallClone = It->second;
5394 CallVector.push_back(CallClone);
5395 } else {
5396 // All but the original clone (skipped earlier) should have an entry
5397 // for all calls.
5398 assert(false && "Expected to find call in CallMap");
5399 }
5400 }
5401 }
5402 }
5403 }
5404
5405 uint8_t BothTypes =
5406 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5407
5408 auto UpdateCalls = [&](ContextNode *Node,
5409 DenseSet<const ContextNode *> &Visited,
5410 auto &&UpdateCalls) {
5411 auto Inserted = Visited.insert(Node);
5412 if (!Inserted.second)
5413 return;
5414
5415 for (auto *Clone : Node->Clones)
5416 UpdateCalls(Clone, Visited, UpdateCalls);
5417
5418 for (auto &Edge : Node->CallerEdges)
5419 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5420
5421 // Skip if either no call to update, or if we ended up with no context ids
5422 // (we moved all edges onto other clones).
5423 if (!Node->hasCall() || Node->emptyContextIds())
5424 return;
5425
5426 if (Node->IsAllocation) {
5427 auto AT = allocTypeToUse(Node->AllocTypes);
5428 // If the allocation type is ambiguous, and more aggressive hinting
5429 // has been enabled via the MinClonedColdBytePercent flag, see if this
5430 // allocation should be hinted cold anyway because its fraction cold bytes
5431 // allocated is at least the given threshold.
5432 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5433 !ContextIdToContextSizeInfos.empty()) {
5434 uint64_t TotalCold = 0;
5435 uint64_t Total = 0;
5436 for (auto Id : Node->getContextIds()) {
5437 auto TypeI = ContextIdToAllocationType.find(Id);
5438 assert(TypeI != ContextIdToAllocationType.end());
5439 auto CSI = ContextIdToContextSizeInfos.find(Id);
5440 if (CSI != ContextIdToContextSizeInfos.end()) {
5441 for (auto &Info : CSI->second) {
5442 Total += Info.TotalSize;
5443 if (TypeI->second == AllocationType::Cold)
5444 TotalCold += Info.TotalSize;
5445 }
5446 }
5447 }
5448 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5449 AT = AllocationType::Cold;
5450 }
5451 updateAllocationCall(Call&: Node->Call, AllocType: AT);
5452 assert(Node->MatchingCalls.empty());
5453 return;
5454 }
5455
5456 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5457 return;
5458
5459 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5460 updateCall(CallerCall&: Node->Call, CalleeFunc);
5461 // Update all the matching calls as well.
5462 for (auto &Call : Node->MatchingCalls)
5463 updateCall(CallerCall&: Call, CalleeFunc);
5464
5465 // Now update all calls recorded earlier that are still in function clones
5466 // which don't have a clone of this callsite node.
5467 if (!UnassignedCallClones.contains(Node))
5468 return;
5469 DenseSet<unsigned> NodeCallClones;
5470 for (auto *C : Node->Clones)
5471 NodeCallClones.insert(C->Call.cloneNo());
5472 // Note that we already confirmed Node is in this map a few lines above.
5473 auto &ClonedCalls = UnassignedCallClones[Node];
5474 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5475 // Should start at 1 as we never create an entry for original node.
5476 assert(CloneNo > 0);
5477 // If we subsequently created a clone, skip this one.
5478 if (NodeCallClones.contains(V: CloneNo))
5479 continue;
5480 // Use the original Node's CalleeFunc.
5481 for (auto &Call : CallVector)
5482 updateCall(CallerCall&: Call, CalleeFunc);
5483 }
5484 };
5485
5486 // Performs DFS traversal starting from allocation nodes to update calls to
5487 // reflect cloning decisions recorded earlier. For regular LTO this will
5488 // update the actual calls in the IR to call the appropriate function clone
5489 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5490 // are recorded in the summary entries.
5491 DenseSet<const ContextNode *> Visited;
5492 for (auto &Entry : AllocationCallToContextNodeMap)
5493 UpdateCalls(Entry.second, Visited, UpdateCalls);
5494
5495 return Changed;
5496}
5497
5498// Compute a SHA1 hash of the callsite and alloc version information of clone I
5499// in the summary, to use in detection of duplicate clones.
5500uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5501 SHA1 Hasher;
5502 // Update hash with any callsites that call non-default (non-zero) callee
5503 // versions.
5504 for (auto &SN : FS->callsites()) {
5505 // In theory all callsites and allocs in this function should have the same
5506 // number of clone entries, but handle any discrepancies gracefully below
5507 // for NDEBUG builds.
5508 assert(
5509 SN.Clones.size() > I &&
5510 "Callsite summary has fewer entries than other summaries in function");
5511 if (SN.Clones.size() <= I || !SN.Clones[I])
5512 continue;
5513 uint8_t Data[sizeof(SN.Clones[I])];
5514 support::endian::write32le(P: Data, V: SN.Clones[I]);
5515 Hasher.update(Data);
5516 }
5517 // Update hash with any allocs that have non-default (non-None) hints.
5518 for (auto &AN : FS->allocs()) {
5519 // In theory all callsites and allocs in this function should have the same
5520 // number of clone entries, but handle any discrepancies gracefully below
5521 // for NDEBUG builds.
5522 assert(AN.Versions.size() > I &&
5523 "Alloc summary has fewer entries than other summaries in function");
5524 if (AN.Versions.size() <= I ||
5525 (AllocationType)AN.Versions[I] == AllocationType::None)
5526 continue;
5527 Hasher.update(Data: ArrayRef<uint8_t>(&AN.Versions[I], 1));
5528 }
5529 return support::endian::read64le(P: Hasher.result().data());
5530}
5531
5532static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5533 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5534 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5535 &FuncToAliasMap,
5536 FunctionSummary *FS) {
5537 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5538 // We might have created this when adjusting callsite in another
5539 // function. It should be a declaration.
5540 assert(DeclGV->isDeclaration());
5541 NewGV->takeName(V: DeclGV);
5542 DeclGV->replaceAllUsesWith(V: NewGV);
5543 DeclGV->eraseFromParent();
5544 };
5545
5546 // Handle aliases to this function, and create analogous alias clones to the
5547 // provided clone of this function.
5548 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5549 if (!FuncToAliasMap.count(x: &F))
5550 return;
5551 for (auto *A : FuncToAliasMap[&F]) {
5552 std::string AliasName = getMemProfFuncName(Base: A->getName(), CloneNo: I);
5553 auto *PrevA = M.getNamedAlias(Name: AliasName);
5554 auto *NewA = GlobalAlias::create(Ty: A->getValueType(),
5555 AddressSpace: A->getType()->getPointerAddressSpace(),
5556 Linkage: A->getLinkage(), Name: AliasName, Aliasee: NewF);
5557 NewA->copyAttributesFrom(Src: A);
5558 if (PrevA)
5559 TakeDeclNameAndReplace(PrevA, NewA);
5560 }
5561 };
5562
5563 // The first "clone" is the original copy, we should only call this if we
5564 // needed to create new clones.
5565 assert(NumClones > 1);
5566 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5567 VMaps.reserve(N: NumClones - 1);
5568 FunctionsClonedThinBackend++;
5569
5570 // Map of hash of callsite/alloc versions to the instantiated function clone
5571 // (possibly the original) implementing those calls. Used to avoid
5572 // instantiating duplicate function clones.
5573 // FIXME: Ideally the thin link would not generate such duplicate clones to
5574 // start with, but right now it happens due to phase ordering in the function
5575 // assignment and possible new clones that produces. We simply make each
5576 // duplicate an alias to the matching instantiated clone recorded in the map
5577 // (except for available_externally which are made declarations as they would
5578 // be aliases in the prevailing module, and available_externally aliases are
5579 // not well supported right now).
5580 DenseMap<uint64_t, Function *> HashToFunc;
5581
5582 // Save the hash of the original function version.
5583 HashToFunc[ComputeHash(FS, I: 0)] = &F;
5584
5585 for (unsigned I = 1; I < NumClones; I++) {
5586 VMaps.emplace_back(Args: std::make_unique<ValueToValueMapTy>());
5587 std::string Name = getMemProfFuncName(Base: F.getName(), CloneNo: I);
5588 auto Hash = ComputeHash(FS, I);
5589 // If this clone would duplicate a previously seen clone, don't generate the
5590 // duplicate clone body, just make an alias to satisfy any (potentially
5591 // cross-module) references.
5592 if (HashToFunc.contains(Val: Hash)) {
5593 FunctionCloneDuplicatesThinBackend++;
5594 auto *Func = HashToFunc[Hash];
5595 if (Func->hasAvailableExternallyLinkage()) {
5596 // Skip these as EliminateAvailableExternallyPass does not handle
5597 // available_externally aliases correctly and we end up with an
5598 // available_externally alias to a declaration. Just create a
5599 // declaration for now as we know we will have a definition in another
5600 // module.
5601 auto Decl = M.getOrInsertFunction(Name, T: Func->getFunctionType());
5602 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5603 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5604 continue;
5605 }
5606 auto *PrevF = M.getFunction(Name);
5607 auto *Alias = GlobalAlias::create(Name, Aliasee: Func);
5608 if (PrevF)
5609 TakeDeclNameAndReplace(PrevF, Alias);
5610 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5611 << "created clone alias " << ore::NV("Alias", Alias));
5612
5613 // Now handle aliases to this function, and clone those as well.
5614 CloneFuncAliases(Func, I);
5615 continue;
5616 }
5617 auto *NewF = CloneFunction(F: &F, VMap&: *VMaps.back());
5618 HashToFunc[Hash] = NewF;
5619 FunctionClonesThinBackend++;
5620 // Strip memprof and callsite metadata from clone as they are no longer
5621 // needed.
5622 for (auto &BB : *NewF) {
5623 for (auto &Inst : BB) {
5624 Inst.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
5625 Inst.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
5626 }
5627 }
5628 auto *PrevF = M.getFunction(Name);
5629 if (PrevF)
5630 TakeDeclNameAndReplace(PrevF, NewF);
5631 else
5632 NewF->setName(Name);
5633 updateSubprogramLinkageName(NewFunc: NewF, Name);
5634 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5635 << "created clone " << ore::NV("NewFunction", NewF));
5636
5637 // Now handle aliases to this function, and clone those as well.
5638 CloneFuncAliases(NewF, I);
5639 }
5640 return VMaps;
5641}
5642
5643// Locate the summary for F. This is complicated by the fact that it might
5644// have been internalized or promoted.
5645static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5646 const ModuleSummaryIndex *ImportSummary,
5647 const Function *CallingFunc = nullptr) {
5648 // FIXME: Ideally we would retain the original GUID in some fashion on the
5649 // function (e.g. as metadata), but for now do our best to locate the
5650 // summary without that information.
5651 ValueInfo TheFnVI = ImportSummary->getValueInfo(GUID: F.getGUID());
5652 if (!TheFnVI)
5653 // See if theFn was internalized, by checking index directly with
5654 // original name (this avoids the name adjustment done by getGUID() for
5655 // internal symbols).
5656 TheFnVI = ImportSummary->getValueInfo(
5657 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: F.getName()));
5658 if (TheFnVI)
5659 return TheFnVI;
5660 // Now query with the original name before any promotion was performed.
5661 StringRef OrigName =
5662 ModuleSummaryIndex::getOriginalNameBeforePromote(Name: F.getName());
5663 // When this pass is enabled, we always add thinlto_src_file provenance
5664 // metadata to imported function definitions, which allows us to recreate the
5665 // original internal symbol's GUID.
5666 auto SrcFileMD = F.getMetadata(Kind: "thinlto_src_file");
5667 // If this is a call to an imported/promoted local for which we didn't import
5668 // the definition, the metadata will not exist on the declaration. However,
5669 // since we are doing this early, before any inlining in the LTO backend, we
5670 // can simply look at the metadata on the calling function which must have
5671 // been from the same module if F was an internal symbol originally.
5672 if (!SrcFileMD && F.isDeclaration()) {
5673 // We would only call this for a declaration for a direct callsite, in which
5674 // case the caller would have provided the calling function pointer.
5675 assert(CallingFunc);
5676 SrcFileMD = CallingFunc->getMetadata(Kind: "thinlto_src_file");
5677 // If this is a promoted local (OrigName != F.getName()), since this is a
5678 // declaration, it must be imported from a different module and therefore we
5679 // should always find the metadata on its calling function. Any call to a
5680 // promoted local that came from this module should still be a definition.
5681 assert(SrcFileMD || OrigName == F.getName());
5682 }
5683 StringRef SrcFile = M.getSourceFileName();
5684 if (SrcFileMD)
5685 SrcFile = dyn_cast<MDString>(Val: SrcFileMD->getOperand(I: 0))->getString();
5686 std::string OrigId = GlobalValue::getGlobalIdentifier(
5687 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5688 TheFnVI = ImportSummary->getValueInfo(
5689 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5690 // Internal func in original module may have gotten a numbered suffix if we
5691 // imported an external function with the same name. This happens
5692 // automatically during IR linking for naming conflicts. It would have to
5693 // still be internal in that case (otherwise it would have been renamed on
5694 // promotion in which case we wouldn't have a naming conflict).
5695 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5696 F.getName().contains(C: '.')) {
5697 OrigName = F.getName().rsplit(Separator: '.').first;
5698 OrigId = GlobalValue::getGlobalIdentifier(
5699 Name: OrigName, Linkage: GlobalValue::InternalLinkage, FileName: SrcFile);
5700 TheFnVI = ImportSummary->getValueInfo(
5701 GUID: GlobalValue::getGUIDAssumingExternalLinkage(GlobalName: OrigId));
5702 }
5703 // The only way we may not have a VI is if this is a declaration created for
5704 // an imported reference. For distributed ThinLTO we may not have a VI for
5705 // such declarations in the distributed summary.
5706 assert(TheFnVI || F.isDeclaration());
5707 return TheFnVI;
5708}
5709
5710bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5711 Module &M) {
5712 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5713 Symtab = std::make_unique<InstrProfSymtab>();
5714 // Don't add canonical names, to avoid multiple functions to the symtab
5715 // when they both have the same root name with "." suffixes stripped.
5716 // If we pick the wrong one then this could lead to incorrect ICP and calling
5717 // a memprof clone that we don't actually create (resulting in linker unsats).
5718 // What this means is that the GUID of the function (or its PGOFuncName
5719 // metadata) *must* match that in the VP metadata to allow promotion.
5720 // In practice this should not be a limitation, since local functions should
5721 // have PGOFuncName metadata and global function names shouldn't need any
5722 // special handling (they should not get the ".llvm.*" suffix that the
5723 // canonicalization handling is attempting to strip).
5724 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5725 std::string SymtabFailure = toString(E: std::move(E));
5726 M.getContext().emitError(ErrorStr: "Failed to create symtab: " + SymtabFailure);
5727 return false;
5728 }
5729 return true;
5730}
5731
#ifndef NDEBUG
// Sanity check that the MIB stack ids match between the summary and
// instruction metadata.
//
// Walks the summary MIBs in lock-step with the memprof metadata's MIB
// operands (the former were generated from the latter during
// ModuleSummaryAnalysis, so they must correspond 1:1), and verifies that each
// recorded stack id index resolves to the same frame id as the metadata's
// call stack, after skipping the prefix shared with CallsiteContext and any
// directly-recursive duplicate frames.
static void checkAllocContextIds(
    const AllocInfo &AllocNode, const MDNode *MemProfMD,
    const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
    const ModuleSummaryIndex *ImportSummary) {
  auto MIBIter = AllocNode.MIBs.begin();
  for (auto &MDOp : MemProfMD->operands()) {
    // One summary MIB per metadata MIB operand.
    assert(MIBIter != AllocNode.MIBs.end());
    auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
    auto *MIBMD = cast<const MDNode>(MDOp);
    MDNode *StackMDNode = getMIBStackNode(MIBMD);
    assert(StackMDNode);
    CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
    // Only compare the portion of the stack beyond the frames already
    // implied by the callsite's own context.
    auto ContextIterBegin =
        StackContext.beginAfterSharedPrefix(CallsiteContext);
    // Skip the checking on the first iteration: seed LastStackContextId with
    // a value guaranteed to differ from the first entry so the
    // duplicate-frame skip below cannot trigger spuriously.
    uint64_t LastStackContextId =
        (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
                                                                           : 0;
    for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
         ++ContextIter) {
      // If this is a direct recursion, simply skip the duplicate
      // entries, to be consistent with how the summary ids were
      // generated during ModuleSummaryAnalysis.
      if (LastStackContextId == *ContextIter)
        continue;
      LastStackContextId = *ContextIter;
      // Each non-duplicate frame must have a matching stack id index in the
      // summary, resolving to the same frame id.
      assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
      assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
             *ContextIter);
      StackIdIndexIter++;
    }
    MIBIter++;
  }
}
#endif
5770
5771bool MemProfContextDisambiguation::applyImport(Module &M) {
5772 assert(ImportSummary);
5773 bool Changed = false;
5774
5775 // We also need to clone any aliases that reference cloned functions, because
5776 // the modified callsites may invoke via the alias. Keep track of the aliases
5777 // for each function.
5778 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5779 FuncToAliasMap;
5780 for (auto &A : M.aliases()) {
5781 auto *Aliasee = A.getAliaseeObject();
5782 if (auto *F = dyn_cast<Function>(Val: Aliasee))
5783 FuncToAliasMap[F].insert(Ptr: &A);
5784 }
5785
5786 if (!initializeIndirectCallPromotionInfo(M))
5787 return false;
5788
5789 for (auto &F : M) {
5790 if (F.isDeclaration() || isMemProfClone(F))
5791 continue;
5792
5793 OptimizationRemarkEmitter ORE(&F);
5794
5795 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5796 bool ClonesCreated = false;
5797 unsigned NumClonesCreated = 0;
5798 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5799 // We should at least have version 0 which is the original copy.
5800 assert(NumClones > 0);
5801 // If only one copy needed use original.
5802 if (NumClones == 1)
5803 return;
5804 // If we already performed cloning of this function, confirm that the
5805 // requested number of clones matches (the thin link should ensure the
5806 // number of clones for each constituent callsite is consistent within
5807 // each function), before returning.
5808 if (ClonesCreated) {
5809 assert(NumClonesCreated == NumClones);
5810 return;
5811 }
5812 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5813 // The first "clone" is the original copy, which doesn't have a VMap.
5814 assert(VMaps.size() == NumClones - 1);
5815 Changed = true;
5816 ClonesCreated = true;
5817 NumClonesCreated = NumClones;
5818 };
5819
5820 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5821 Function *CalledFunction, FunctionSummary *FS) {
5822 // Perform cloning if not yet done.
5823 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5824
5825 assert(!isMemProfClone(*CalledFunction));
5826
5827 // Because we update the cloned calls by calling setCalledOperand (see
5828 // comment below), out of an abundance of caution make sure the called
5829 // function was actually the called operand (or its aliasee). We also
5830 // strip pointer casts when looking for calls (to match behavior during
5831 // summary generation), however, with opaque pointers in theory this
5832 // should not be an issue. Note we still clone the current function
5833 // (containing this call) above, as that could be needed for its callers.
5834 auto *GA = dyn_cast_or_null<GlobalAlias>(Val: CB->getCalledOperand());
5835 if (CalledFunction != CB->getCalledOperand() &&
5836 (!GA || CalledFunction != GA->getAliaseeObject())) {
5837 SkippedCallsCloning++;
5838 return;
5839 }
5840 // Update the calls per the summary info.
5841 // Save orig name since it gets updated in the first iteration
5842 // below.
5843 auto CalleeOrigName = CalledFunction->getName();
5844 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5845 // If the VMap is empty, this clone was a duplicate of another and was
5846 // created as an alias or a declaration.
5847 if (J > 0 && VMaps[J - 1]->empty())
5848 continue;
5849 // Do nothing if this version calls the original version of its
5850 // callee.
5851 if (!StackNode.Clones[J])
5852 continue;
5853 auto NewF = M.getOrInsertFunction(
5854 Name: getMemProfFuncName(Base: CalleeOrigName, CloneNo: StackNode.Clones[J]),
5855 T: CalledFunction->getFunctionType());
5856 CallBase *CBClone;
5857 // Copy 0 is the original function.
5858 if (!J)
5859 CBClone = CB;
5860 else
5861 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
5862 // Set the called operand directly instead of calling setCalledFunction,
5863 // as the latter mutates the function type on the call. In rare cases
5864 // we may have a slightly different type on a callee function
5865 // declaration due to it being imported from a different module with
5866 // incomplete types. We really just want to change the name of the
5867 // function to the clone, and not make any type changes.
5868 CBClone->setCalledOperand(NewF.getCallee());
5869 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5870 << ore::NV("Call", CBClone) << " in clone "
5871 << ore::NV("Caller", CBClone->getFunction())
5872 << " assigned to call function clone "
5873 << ore::NV("Callee", NewF.getCallee()));
5874 }
5875 };
5876
5877 // Locate the summary for F.
5878 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5879 // If not found, this could be an imported local (see comment in
5880 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5881 // module (where it would have been promoted to global scope so should
5882 // satisfy any reference in this module).
5883 if (!TheFnVI)
5884 continue;
5885
5886 auto *GVSummary =
5887 ImportSummary->findSummaryInModule(VI: TheFnVI, ModuleId: M.getModuleIdentifier());
5888 if (!GVSummary) {
5889 // Must have been imported, use the summary which matches the definition。
5890 // (might be multiple if this was a linkonce_odr).
5891 auto SrcModuleMD = F.getMetadata(Kind: "thinlto_src_module");
5892 assert(SrcModuleMD &&
5893 "enable-import-metadata is needed to emit thinlto_src_module");
5894 StringRef SrcModule =
5895 dyn_cast<MDString>(Val: SrcModuleMD->getOperand(I: 0))->getString();
5896 for (auto &GVS : TheFnVI.getSummaryList()) {
5897 if (GVS->modulePath() == SrcModule) {
5898 GVSummary = GVS.get();
5899 break;
5900 }
5901 }
5902 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5903 }
5904
5905 // If this was an imported alias skip it as we won't have the function
5906 // summary, and it should be cloned in the original module.
5907 if (isa<AliasSummary>(Val: GVSummary))
5908 continue;
5909
5910 auto *FS = cast<FunctionSummary>(Val: GVSummary->getBaseObject());
5911
5912 if (FS->allocs().empty() && FS->callsites().empty())
5913 continue;
5914
5915 auto SI = FS->callsites().begin();
5916 auto AI = FS->allocs().begin();
5917
5918 // To handle callsite infos synthesized for tail calls which have missing
5919 // frames in the profiled context, map callee VI to the synthesized callsite
5920 // info.
5921 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5922 // Iterate the callsites for this function in reverse, since we place all
5923 // those synthesized for tail calls at the end.
5924 for (auto CallsiteIt = FS->callsites().rbegin();
5925 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5926 auto &Callsite = *CallsiteIt;
5927 // Stop as soon as we see a non-synthesized callsite info (see comment
5928 // above loop). All the entries added for discovered tail calls have empty
5929 // stack ids.
5930 if (!Callsite.StackIdIndices.empty())
5931 break;
5932 MapTailCallCalleeVIToCallsite.insert(KV: {Callsite.Callee, Callsite});
5933 }
5934
5935 // Keeps track of needed ICP for the function.
5936 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5937
5938 // Assume for now that the instructions are in the exact same order
5939 // as when the summary was created, but confirm this is correct by
5940 // matching the stack ids.
5941 for (auto &BB : F) {
5942 for (auto &I : BB) {
5943 auto *CB = dyn_cast<CallBase>(Val: &I);
5944 // Same handling as when creating module summary.
5945 if (!mayHaveMemprofSummary(CB))
5946 continue;
5947
5948 auto *CalledValue = CB->getCalledOperand();
5949 auto *CalledFunction = CB->getCalledFunction();
5950 if (CalledValue && !CalledFunction) {
5951 CalledValue = CalledValue->stripPointerCasts();
5952 // Stripping pointer casts can reveal a called function.
5953 CalledFunction = dyn_cast<Function>(Val: CalledValue);
5954 }
5955 // Check if this is an alias to a function. If so, get the
5956 // called aliasee for the checks below.
5957 if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) {
5958 assert(!CalledFunction &&
5959 "Expected null called function in callsite for alias");
5960 CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject());
5961 }
5962
5963 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5964 I.getMetadata(KindID: LLVMContext::MD_callsite));
5965 auto *MemProfMD = I.getMetadata(KindID: LLVMContext::MD_memprof);
5966
5967 // Include allocs that were already assigned a memprof function
5968 // attribute in the statistics. Only do this for those that do not have
5969 // memprof metadata, since we add an "ambiguous" memprof attribute by
5970 // default.
5971 if (CB->getAttributes().hasFnAttr(Kind: "memprof") && !MemProfMD) {
5972 CB->getAttributes().getFnAttr(Kind: "memprof").getValueAsString() == "cold"
5973 ? AllocTypeColdThinBackend++
5974 : AllocTypeNotColdThinBackend++;
5975 OrigAllocsThinBackend++;
5976 AllocVersionsThinBackend++;
5977 if (!MaxAllocVersionsThinBackend)
5978 MaxAllocVersionsThinBackend = 1;
5979 continue;
5980 }
5981
5982 if (MemProfMD) {
5983 // Consult the next alloc node.
5984 assert(AI != FS->allocs().end());
5985 auto &AllocNode = *(AI++);
5986
5987#ifndef NDEBUG
5988 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5989 ImportSummary);
5990#endif
5991
5992 // Perform cloning if not yet done.
5993 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5994
5995 OrigAllocsThinBackend++;
5996 AllocVersionsThinBackend += AllocNode.Versions.size();
5997 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5998 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5999
6000 // If there is only one version that means we didn't end up
6001 // considering this function for cloning, and in that case the alloc
6002 // will still be none type or should have gotten the default NotCold.
6003 // Skip that after calling clone helper since that does some sanity
6004 // checks that confirm we haven't decided yet that we need cloning.
6005 // We might have a single version that is cold due to the
6006 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
6007 // case.
6008 if (AllocNode.Versions.size() == 1 &&
6009 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
6010 assert((AllocationType)AllocNode.Versions[0] ==
6011 AllocationType::NotCold ||
6012 (AllocationType)AllocNode.Versions[0] ==
6013 AllocationType::None);
6014 UnclonableAllocsThinBackend++;
6015 continue;
6016 }
6017
6018 // All versions should have a singular allocation type.
6019 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
6020 return Type == ((uint8_t)AllocationType::NotCold |
6021 (uint8_t)AllocationType::Cold);
6022 }));
6023
6024 // Update the allocation types per the summary info.
6025 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
6026 // If the VMap is empty, this clone was a duplicate of another and
6027 // was created as an alias or a declaration.
6028 if (J > 0 && VMaps[J - 1]->empty())
6029 continue;
6030 // Ignore any that didn't get an assigned allocation type.
6031 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
6032 continue;
6033 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
6034 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
6035 : AllocTypeNotColdThinBackend++;
6036 std::string AllocTypeString = getAllocTypeAttributeString(Type: AllocTy);
6037 auto A = llvm::Attribute::get(Context&: F.getContext(), Kind: "memprof",
6038 Val: AllocTypeString);
6039 CallBase *CBClone;
6040 // Copy 0 is the original function.
6041 if (!J)
6042 CBClone = CB;
6043 else
6044 // Since VMaps are only created for new clones, we index with
6045 // clone J-1 (J==0 is the original clone and does not have a VMaps
6046 // entry).
6047 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
6048 removeAnyExistingAmbiguousAttribute(CB: CBClone);
6049 CBClone->addFnAttr(Attr: A);
6050 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
6051 << ore::NV("AllocationCall", CBClone) << " in clone "
6052 << ore::NV("Caller", CBClone->getFunction())
6053 << " marked with memprof allocation attribute "
6054 << ore::NV("Attribute", AllocTypeString));
6055 }
6056 } else if (!CallsiteContext.empty()) {
6057 if (!CalledFunction) {
6058#ifndef NDEBUG
6059 // We should have skipped inline assembly calls.
6060 auto *CI = dyn_cast<CallInst>(CB);
6061 assert(!CI || !CI->isInlineAsm());
6062#endif
6063 // We should have skipped direct calls via a Constant.
6064 assert(CalledValue && !isa<Constant>(CalledValue));
6065
6066 // This is an indirect call, see if we have profile information and
6067 // whether any clones were recorded for the profiled targets (that
6068 // we synthesized CallsiteInfo summary records for when building the
6069 // index).
6070 auto NumClones =
6071 recordICPInfo(CB, AllCallsites: FS->callsites(), SI, ICallAnalysisInfo);
6072
6073 // Perform cloning if not yet done. This is done here in case
6074 // we don't need to do ICP, but might need to clone this
6075 // function as it is the target of other cloned calls.
6076 if (NumClones)
6077 CloneFuncIfNeeded(NumClones, FS);
6078 }
6079
6080 else {
6081 // Consult the next callsite node.
6082 assert(SI != FS->callsites().end());
6083 auto &StackNode = *(SI++);
6084
6085#ifndef NDEBUG
6086 // Sanity check that the stack ids match between the summary and
6087 // instruction metadata.
6088 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6089 for (auto StackId : CallsiteContext) {
6090 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6091 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6092 StackId);
6093 StackIdIndexIter++;
6094 }
6095#endif
6096
6097 CloneCallsite(StackNode, CB, CalledFunction, FS);
6098 }
6099 } else if (CB->isTailCall() && CalledFunction) {
6100 // Locate the synthesized callsite info for the callee VI, if any was
6101 // created, and use that for cloning.
6102 ValueInfo CalleeVI =
6103 findValueInfoForFunc(F: *CalledFunction, M, ImportSummary, CallingFunc: &F);
6104 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(Val: CalleeVI)) {
6105 auto Callsite = MapTailCallCalleeVIToCallsite.find(Val: CalleeVI);
6106 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6107 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6108 }
6109 }
6110 }
6111 }
6112
6113 // Now do any promotion required for cloning.
6114 performICP(M, AllCallsites: FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6115 }
6116
6117 // We skip some of the functions and instructions above, so remove all the
6118 // metadata in a single sweep here.
6119 for (auto &F : M) {
6120 // We can skip memprof clones because createFunctionClones already strips
6121 // the metadata from the newly created clones.
6122 if (F.isDeclaration() || isMemProfClone(F))
6123 continue;
6124 for (auto &BB : F) {
6125 for (auto &I : BB) {
6126 if (!isa<CallBase>(Val: I))
6127 continue;
6128 I.setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
6129 I.setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
6130 }
6131 }
6132 }
6133
6134 return Changed;
6135}
6136
6137unsigned MemProfContextDisambiguation::recordICPInfo(
6138 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6139 ArrayRef<CallsiteInfo>::iterator &SI,
6140 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6141 // First see if we have profile information for this indirect call.
6142 uint32_t NumCandidates;
6143 uint64_t TotalCount;
6144 auto CandidateProfileData =
6145 ICallAnalysis->getPromotionCandidatesForInstruction(
6146 I: CB, TotalCount, NumCandidates, MaxNumValueData: MaxSummaryIndirectEdges);
6147 if (CandidateProfileData.empty())
6148 return 0;
6149
6150 // Iterate through all of the candidate profiled targets along with the
6151 // CallsiteInfo summary records synthesized for them when building the index,
6152 // and see if any are cloned and/or refer to clones.
6153 bool ICPNeeded = false;
6154 unsigned NumClones = 0;
6155 size_t CallsiteInfoStartIndex = std::distance(first: AllCallsites.begin(), last: SI);
6156 for (const auto &Candidate : CandidateProfileData) {
6157#ifndef NDEBUG
6158 auto CalleeValueInfo =
6159#endif
6160 ImportSummary->getValueInfo(GUID: Candidate.Value);
6161 // We might not have a ValueInfo if this is a distributed
6162 // ThinLTO backend and decided not to import that function.
6163 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6164 assert(SI != AllCallsites.end());
6165 auto &StackNode = *(SI++);
6166 // See if any of the clones of the indirect callsite for this
6167 // profiled target should call a cloned version of the profiled
6168 // target. We only need to do the ICP here if so.
6169 ICPNeeded |= llvm::any_of(Range: StackNode.Clones,
6170 P: [](unsigned CloneNo) { return CloneNo != 0; });
6171 // Every callsite in the same function should have been cloned the same
6172 // number of times.
6173 assert(!NumClones || NumClones == StackNode.Clones.size());
6174 NumClones = StackNode.Clones.size();
6175 }
6176 if (!ICPNeeded)
6177 return NumClones;
6178 // Save information for ICP, which is performed later to avoid messing up the
6179 // current function traversal.
6180 ICallAnalysisInfo.push_back(Elt: {.CB: CB, .CandidateProfileData: CandidateProfileData.vec(), .NumCandidates: NumCandidates,
6181 .TotalCount: TotalCount, .CallsiteInfoStartIndex: CallsiteInfoStartIndex});
6182 return NumClones;
6183}
6184
6185void MemProfContextDisambiguation::performICP(
6186 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6187 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6188 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6189 OptimizationRemarkEmitter &ORE) {
6190 // Now do any promotion required for cloning. Specifically, for each
6191 // recorded ICP candidate (which was only recorded because one clone of that
6192 // candidate should call a cloned target), we perform ICP (speculative
6193 // devirtualization) for each clone of the callsite, and update its callee
6194 // to the appropriate clone. Note that the ICP compares against the original
6195 // version of the target, which is what is in the vtable.
6196 for (auto &Info : ICallAnalysisInfo) {
6197 auto *CB = Info.CB;
6198 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6199 auto TotalCount = Info.TotalCount;
6200 unsigned NumPromoted = 0;
6201 unsigned NumClones = 0;
6202
6203 for (auto &Candidate : Info.CandidateProfileData) {
6204 auto &StackNode = AllCallsites[CallsiteIndex++];
6205
6206 // All calls in the same function must have the same number of clones.
6207 assert(!NumClones || NumClones == StackNode.Clones.size());
6208 NumClones = StackNode.Clones.size();
6209
6210 // See if the target is in the module. If it wasn't imported, it is
6211 // possible that this profile could have been collected on a different
6212 // target (or version of the code), and we need to be conservative
6213 // (similar to what is done in the ICP pass).
6214 Function *TargetFunction = Symtab->getFunction(FuncMD5Hash: Candidate.Value);
6215 if (TargetFunction == nullptr ||
6216 // Any ThinLTO global dead symbol removal should have already
6217 // occurred, so it should be safe to promote when the target is a
6218 // declaration.
6219 // TODO: Remove internal option once more fully tested.
6220 (MemProfRequireDefinitionForPromotion &&
6221 TargetFunction->isDeclaration())) {
6222 ORE.emit(RemarkBuilder: [&]() {
6223 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6224 << "Memprof cannot promote indirect call: target with md5sum "
6225 << ore::NV("target md5sum", Candidate.Value) << " not found";
6226 });
6227 // FIXME: See if we can use the new declaration importing support to
6228 // at least get the declarations imported for this case. Hot indirect
6229 // targets should have been imported normally, however.
6230 continue;
6231 }
6232
6233 // Check if legal to promote
6234 const char *Reason = nullptr;
6235 if (!isLegalToPromote(CB: *CB, Callee: TargetFunction, FailureReason: &Reason)) {
6236 ORE.emit(RemarkBuilder: [&]() {
6237 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6238 << "Memprof cannot promote indirect call to "
6239 << ore::NV("TargetFunction", TargetFunction)
6240 << " with count of " << ore::NV("TotalCount", TotalCount)
6241 << ": " << Reason;
6242 });
6243 continue;
6244 }
6245
6246 assert(!isMemProfClone(*TargetFunction));
6247
6248 // Handle each call clone, applying ICP so that each clone directly
6249 // calls the specified callee clone, guarded by the appropriate ICP
6250 // check.
6251 CallBase *CBClone = CB;
6252 for (unsigned J = 0; J < NumClones; J++) {
6253 // If the VMap is empty, this clone was a duplicate of another and was
6254 // created as an alias or a declaration.
6255 if (J > 0 && VMaps[J - 1]->empty())
6256 continue;
6257 // Copy 0 is the original function.
6258 if (J > 0)
6259 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
6260 // We do the promotion using the original name, so that the comparison
6261 // is against the name in the vtable. Then just below, change the new
6262 // direct call to call the cloned function.
6263 auto &DirectCall =
6264 pgo::promoteIndirectCall(CB&: *CBClone, F: TargetFunction, Count: Candidate.Count,
6265 TotalCount, AttachProfToDirectCall: isSamplePGO, ORE: &ORE);
6266 auto *TargetToUse = TargetFunction;
6267 // Call original if this version calls the original version of its
6268 // callee.
6269 if (StackNode.Clones[J]) {
6270 TargetToUse =
6271 cast<Function>(Val: M.getOrInsertFunction(
6272 Name: getMemProfFuncName(Base: TargetFunction->getName(),
6273 CloneNo: StackNode.Clones[J]),
6274 T: TargetFunction->getFunctionType())
6275 .getCallee());
6276 }
6277 DirectCall.setCalledFunction(TargetToUse);
6278 // During matching we generate synthetic VP metadata for indirect calls
6279 // not already having any, from the memprof profile's callee GUIDs. If
6280 // we subsequently promote and inline those callees, we currently lose
6281 // the ability to generate this synthetic VP metadata. Optionally apply
6282 // a noinline attribute to promoted direct calls, where the threshold is
6283 // set to capture synthetic VP metadata targets which get a count of 1.
6284 if (MemProfICPNoInlineThreshold &&
6285 Candidate.Count < MemProfICPNoInlineThreshold)
6286 DirectCall.setIsNoInline();
6287 ORE.emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6288 << ore::NV("Call", CBClone) << " in clone "
6289 << ore::NV("Caller", CBClone->getFunction())
6290 << " promoted and assigned to call function clone "
6291 << ore::NV("Callee", TargetToUse));
6292 }
6293
6294 // Update TotalCount (all clones should get same count above)
6295 TotalCount -= Candidate.Count;
6296 NumPromoted++;
6297 }
6298 // Adjust the MD.prof metadata for all clones, now that we have the new
6299 // TotalCount and the number promoted.
6300 CallBase *CBClone = CB;
6301 for (unsigned J = 0; J < NumClones; J++) {
6302 // If the VMap is empty, this clone was a duplicate of another and was
6303 // created as an alias or a declaration.
6304 if (J > 0 && VMaps[J - 1]->empty())
6305 continue;
6306 // Copy 0 is the original function.
6307 if (J > 0)
6308 CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]);
6309 // First delete the old one.
6310 CBClone->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
6311 // If all promoted, we don't need the MD.prof metadata.
6312 // Otherwise we need update with the un-promoted records back.
6313 if (TotalCount != 0)
6314 annotateValueSite(
6315 M, Inst&: *CBClone, VDs: ArrayRef(Info.CandidateProfileData).slice(N: NumPromoted),
6316 Sum: TotalCount, ValueKind: IPVK_IndirectCallTarget, MaxMDCount: Info.NumCandidates);
6317 }
6318 }
6319}
6320
6321template <typename DerivedCCG, typename FuncTy, typename CallTy>
6322bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process(
6323 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark,
6324 bool AllowExtraAnalysis) {
6325 if (DumpCCG) {
6326 dbgs() << "CCG before cloning:\n";
6327 dbgs() << *this;
6328 }
6329 if (ExportToDot)
6330 exportToDot(Label: "postbuild");
6331
6332 if (VerifyCCG) {
6333 check();
6334 }
6335
6336 identifyClones();
6337
6338 if (VerifyCCG) {
6339 check();
6340 }
6341
6342 if (DumpCCG) {
6343 dbgs() << "CCG after cloning:\n";
6344 dbgs() << *this;
6345 }
6346 if (ExportToDot)
6347 exportToDot(Label: "cloned");
6348
6349 bool Changed = assignFunctions();
6350
6351 if (DumpCCG) {
6352 dbgs() << "CCG after assigning function clones:\n";
6353 dbgs() << *this;
6354 }
6355 if (ExportToDot)
6356 exportToDot(Label: "clonefuncassign");
6357
6358 if (MemProfReportHintedSizes || AllowExtraAnalysis)
6359 printTotalSizes(OS&: errs(), EmitRemark);
6360
6361 return Changed;
6362}
6363
6364bool MemProfContextDisambiguation::processModule(
6365 Module &M,
6366 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6367
6368 // If we have an import summary, then the cloning decisions were made during
6369 // the thin link on the index. Apply them and return.
6370 if (ImportSummary)
6371 return applyImport(M);
6372
6373 // TODO: If/when other types of memprof cloning are enabled beyond just for
6374 // hot and cold, we will need to change this to individually control the
6375 // AllocationType passed to addStackNodesForMIB during CCG construction.
6376 // Note that we specifically check this after applying imports above, so that
6377 // the option isn't needed to be passed to distributed ThinLTO backend
6378 // clang processes, which won't necessarily have visibility into the linker
6379 // dependences. Instead the information is communicated from the LTO link to
6380 // the backends via the combined summary index.
6381 if (!SupportsHotColdNew)
6382 return false;
6383
6384 ModuleCallsiteContextGraph CCG(M, OREGetter);
6385 // TODO: Set up remarks for regular LTO. We need to decide what function to
6386 // use in the callback.
6387 return CCG.process();
6388}
6389
6390MemProfContextDisambiguation::MemProfContextDisambiguation(
6391 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6392 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6393 // Check the dot graph printing options once here, to make sure we have valid
6394 // and expected combinations.
6395 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6396 llvm::report_fatal_error(
6397 reason: "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6398 if (DotGraphScope == DotScope::Context &&
6399 !ContextIdForDot.getNumOccurrences())
6400 llvm::report_fatal_error(
6401 reason: "-memprof-dot-scope=context requires -memprof-dot-context-id");
6402 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6403 ContextIdForDot.getNumOccurrences())
6404 llvm::report_fatal_error(
6405 reason: "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6406 "-memprof-dot-context-id");
6407 if (ImportSummary) {
6408 // The MemProfImportSummary should only be used for testing ThinLTO
6409 // distributed backend handling via opt, in which case we don't have a
6410 // summary from the pass pipeline.
6411 assert(MemProfImportSummary.empty());
6412 return;
6413 }
6414 if (MemProfImportSummary.empty())
6415 return;
6416
6417 auto ReadSummaryFile =
6418 errorOrToExpected(EO: MemoryBuffer::getFile(Filename: MemProfImportSummary));
6419 if (!ReadSummaryFile) {
6420 logAllUnhandledErrors(E: ReadSummaryFile.takeError(), OS&: errs(),
6421 ErrorBanner: "Error loading file '" + MemProfImportSummary +
6422 "': ");
6423 return;
6424 }
6425 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(Buffer: **ReadSummaryFile);
6426 if (!ImportSummaryForTestingOrErr) {
6427 logAllUnhandledErrors(E: ImportSummaryForTestingOrErr.takeError(), OS&: errs(),
6428 ErrorBanner: "Error parsing file '" + MemProfImportSummary +
6429 "': ");
6430 return;
6431 }
6432 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6433 ImportSummary = ImportSummaryForTesting.get();
6434}
6435
6436PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6437 ModuleAnalysisManager &AM) {
6438 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
6439 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6440 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: *F);
6441 };
6442 if (!processModule(M, OREGetter))
6443 return PreservedAnalyses::all();
6444 return PreservedAnalyses::none();
6445}
6446
6447void MemProfContextDisambiguation::run(
6448 ModuleSummaryIndex &Index,
6449 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6450 isPrevailing,
6451 LLVMContext &Ctx,
6452 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) {
6453 // TODO: If/when other types of memprof cloning are enabled beyond just for
6454 // hot and cold, we will need to change this to individually control the
6455 // AllocationType passed to addStackNodesForMIB during CCG construction.
6456 // The index was set from the option, so these should be in sync.
6457 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6458 if (!SupportsHotColdNew)
6459 return;
6460
6461 bool AllowExtraAnalysis =
6462 OptimizationRemarkEmitter::allowExtraAnalysis(Ctx, DEBUG_TYPE);
6463
6464 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6465 CCG.process(EmitRemark, AllowExtraAnalysis);
6466}
6467
6468// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6469// when we don't have an index that has recorded that we are linking with
6470// allocation libraries containing the necessary APIs for downstream
6471// transformations.
6472PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) {
6473 // The profile matcher applies hotness attributes directly for allocations,
6474 // and those will cause us to generate calls to the hot/cold interfaces
6475 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6476 // link then assume we don't want these calls (e.g. not linking with
6477 // the appropriate library, or otherwise trying to disable this behavior).
6478 bool Changed = false;
6479 for (auto &F : M) {
6480 for (auto &BB : F) {
6481 for (auto &I : BB) {
6482 auto *CI = dyn_cast<CallBase>(Val: &I);
6483 if (!CI)
6484 continue;
6485 if (CI->hasFnAttr(Kind: "memprof")) {
6486 CI->removeFnAttr(Kind: "memprof");
6487 Changed = true;
6488 }
6489 if (!CI->hasMetadata(KindID: LLVMContext::MD_callsite)) {
6490 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6491 continue;
6492 }
6493 // Strip off all memprof metadata as it is no longer needed.
6494 // Importantly, this avoids the addition of new memprof attributes
6495 // after inlining propagation.
6496 CI->setMetadata(KindID: LLVMContext::MD_memprof, Node: nullptr);
6497 CI->setMetadata(KindID: LLVMContext::MD_callsite, Node: nullptr);
6498 Changed = true;
6499 }
6500 }
6501 }
6502 if (!Changed)
6503 return PreservedAnalyses::all();
6504 return PreservedAnalyses::none();
6505}
6506