1 | //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements support for context disambiguation of allocation |
10 | // calls for profile guided heap optimization. Specifically, it uses Memprof |
11 | // profiles which indicate context specific allocation behavior (currently |
12 | // distinguishing cold vs hot memory allocations). Cloning is performed to |
13 | // expose the cold allocation call contexts, and the allocation calls are |
14 | // subsequently annotated with an attribute for later transformation. |
15 | // |
16 | // The transformations can be performed either directly on IR (regular LTO), or |
17 | // on a ThinLTO index (and later applied to the IR during the ThinLTO backend). |
// Both types of LTO operate on the same base graph representation, which
19 | // uses CRTP to support either IR or Index formats. |
20 | // |
21 | //===----------------------------------------------------------------------===// |
22 | |
23 | #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" |
24 | #include "llvm/ADT/DenseMap.h" |
25 | #include "llvm/ADT/DenseSet.h" |
26 | #include "llvm/ADT/MapVector.h" |
27 | #include "llvm/ADT/SetOperations.h" |
28 | #include "llvm/ADT/SmallPtrSet.h" |
29 | #include "llvm/ADT/SmallSet.h" |
30 | #include "llvm/ADT/SmallVector.h" |
31 | #include "llvm/ADT/Statistic.h" |
32 | #include "llvm/Analysis/MemoryProfileInfo.h" |
33 | #include "llvm/Analysis/ModuleSummaryAnalysis.h" |
34 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
35 | #include "llvm/Bitcode/BitcodeReader.h" |
36 | #include "llvm/IR/Instructions.h" |
37 | #include "llvm/IR/Module.h" |
38 | #include "llvm/IR/ModuleSummaryIndex.h" |
39 | #include "llvm/Pass.h" |
40 | #include "llvm/Support/CommandLine.h" |
41 | #include "llvm/Support/GraphWriter.h" |
42 | #include "llvm/Support/InterleavedRange.h" |
43 | #include "llvm/Support/raw_ostream.h" |
44 | #include "llvm/Transforms/IPO.h" |
45 | #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
46 | #include "llvm/Transforms/Utils/Cloning.h" |
47 | #include "llvm/Transforms/Utils/Instrumentation.h" |
48 | #include <deque> |
49 | #include <sstream> |
50 | #include <unordered_map> |
51 | #include <vector> |
52 | using namespace llvm; |
53 | using namespace llvm::memprof; |
54 | |
55 | #define DEBUG_TYPE "memprof-context-disambiguation" |
56 | |
STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");
STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
STATISTIC(MissingAllocForContextId,
          "Number of missing alloc nodes for context ids");
98 | |
static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

// How much of the graph to export to dot.
enum DotScope {
  All,     // The full CCG graph.
  Alloc,   // Only contexts for the specified allocation.
  Context, // Only the specified context.
};

static cl::opt<DotScope> DotGraphScope(
    "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
    cl::Hidden, cl::init(DotScope::All),
    cl::values(
        clEnumValN(DotScope::All, "all", "Export full callsite graph"),
        clEnumValN(DotScope::Alloc, "alloc",
                   "Export only nodes with contexts feeding given "
                   "-memprof-dot-alloc-id"),
        clEnumValN(DotScope::Context, "context",
                   "Export only nodes with given -memprof-dot-context-id")));

static cl::opt<unsigned>
    AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
                  cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
                           "or to highlight if -memprof-dot-scope=all"));

static cl::opt<unsigned> ContextIdForDot(
    "memprof-dot-context-id", cl::init(0), cl::Hidden,
    cl::desc("Id of context to export if -memprof-dot-scope=context or to "
             "highlight otherwise"));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

// Optionally enable cloning of callsites involved with recursive cycles.
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

static cl::opt<bool> CloneRecursiveContexts(
    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));

// Generally this is needed for correct assignment of allocation clones to
// function clones; however, allow it to be disabled for debugging while the
// functionality is new and being tested more widely.
static cl::opt<bool>
    MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
                cl::desc("Merge clones before assigning functions"));

// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling requires a little bit of
// compile-time overhead and doesn't affect correctness; it will just inflate
// the cold hinted bytes reporting a bit when -memprof-report-hinted-sizes is
// enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts having recursive cycles"));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

static cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));
199 | } // namespace llvm |
200 | |
201 | extern cl::opt<bool> MemProfReportHintedSizes; |
202 | extern cl::opt<unsigned> MinClonedColdBytePercent; |
203 | |
204 | namespace { |
205 | /// CRTP base for graphs built from either IR or ThinLTO summary index. |
206 | /// |
207 | /// The graph represents the call contexts in all memprof metadata on allocation |
208 | /// calls, with nodes for the allocations themselves, as well as for the calls |
209 | /// in each context. The graph is initially built from the allocation memprof |
210 | /// metadata (or summary) MIBs. It is then updated to match calls with callsite |
211 | /// metadata onto the nodes, updating it to reflect any inlining performed on |
212 | /// those calls. |
213 | /// |
214 | /// Each MIB (representing an allocation's call context with allocation |
215 | /// behavior) is assigned a unique context id during the graph build. The edges |
216 | /// and nodes in the graph are decorated with the context ids they carry. This |
217 | /// is used to correctly update the graph when cloning is performed so that we |
218 | /// can uniquify the context for a single (possibly cloned) allocation. |
219 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
220 | class CallsiteContextGraph { |
221 | public: |
222 | CallsiteContextGraph() = default; |
223 | CallsiteContextGraph(const CallsiteContextGraph &) = default; |
224 | CallsiteContextGraph(CallsiteContextGraph &&) = default; |
225 | |
226 | /// Main entry point to perform analysis and transformations on graph. |
227 | bool process(); |
228 | |
229 | /// Perform cloning on the graph necessary to uniquely identify the allocation |
230 | /// behavior of an allocation based on its context. |
231 | void identifyClones(); |
232 | |
233 | /// Assign callsite clones to functions, cloning functions as needed to |
234 | /// accommodate the combinations of their callsite clones reached by callers. |
235 | /// For regular LTO this clones functions and callsites in the IR, but for |
236 | /// ThinLTO the cloning decisions are noted in the summaries and later applied |
237 | /// in applyImport. |
238 | bool assignFunctions(); |
239 | |
240 | void dump() const; |
241 | void print(raw_ostream &OS) const; |
242 | void printTotalSizes(raw_ostream &OS) const; |
243 | |
244 | friend raw_ostream &operator<<(raw_ostream &OS, |
245 | const CallsiteContextGraph &CCG) { |
246 | CCG.print(OS); |
247 | return OS; |
248 | } |
249 | |
250 | friend struct GraphTraits< |
251 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
252 | friend struct DOTGraphTraits< |
253 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
254 | |
255 | void exportToDot(std::string Label) const; |
256 | |
257 | /// Represents a function clone via FuncTy pointer and clone number pair. |
258 | struct FuncInfo final |
259 | : public std::pair<FuncTy *, unsigned /*Clone number*/> { |
260 | using Base = std::pair<FuncTy *, unsigned>; |
261 | FuncInfo(const Base &B) : Base(B) {} |
262 | FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} |
263 | explicit operator bool() const { return this->first != nullptr; } |
264 | FuncTy *func() const { return this->first; } |
265 | unsigned cloneNo() const { return this->second; } |
266 | }; |
267 | |
268 | /// Represents a callsite clone via CallTy and clone number pair. |
269 | struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> { |
270 | using Base = std::pair<CallTy, unsigned>; |
271 | CallInfo(const Base &B) : Base(B) {} |
272 | CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) |
273 | : Base(Call, CloneNo) {} |
274 | explicit operator bool() const { return (bool)this->first; } |
275 | CallTy call() const { return this->first; } |
276 | unsigned cloneNo() const { return this->second; } |
277 | void setCloneNo(unsigned N) { this->second = N; } |
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
291 | friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { |
292 | Call.print(OS); |
293 | return OS; |
294 | } |
295 | }; |
296 | |
297 | struct ContextEdge; |
298 | |
299 | /// Node in the Callsite Context Graph |
300 | struct ContextNode { |
    // Keep this for now: even in the IR case, where we have an Instruction *,
    // whether the call is an allocation is not as immediately discoverable.
    // Used for printing richer information when dumping the graph.
304 | bool IsAllocation; |
305 | |
306 | // Keeps track of when the Call was reset to null because there was |
307 | // recursion. |
308 | bool Recursive = false; |
309 | |
310 | // This will be formed by ORing together the AllocationType enum values |
311 | // for contexts including this node. |
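    // For example, a node reached by both cold and not-cold contexts will have
    // AllocTypes == ((uint8_t)AllocationType::NotCold |
    //                (uint8_t)AllocationType::Cold).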
312 | uint8_t AllocTypes = 0; |
313 | |
314 | // The corresponding allocation or interior call. This is the primary call |
315 | // for which we have created this node. |
316 | CallInfo Call; |
317 | |
318 | // List of other calls that can be treated the same as the primary call |
319 | // through cloning. I.e. located in the same function and have the same |
320 | // (possibly pruned) stack ids. They will be updated the same way as the |
321 | // primary call when assigning to function clones. |
322 | SmallVector<CallInfo, 0> MatchingCalls; |
323 | |
324 | // For alloc nodes this is a unique id assigned when constructed, and for |
325 | // callsite stack nodes it is the original stack id when the node is |
326 | // constructed from the memprof MIB metadata on the alloc nodes. Note that |
327 | // this is only used when matching callsite metadata onto the stack nodes |
328 | // created when processing the allocation memprof MIBs, and for labeling |
329 | // nodes in the dot graph. Therefore we don't bother to assign a value for |
330 | // clones. |
331 | uint64_t OrigStackOrAllocId = 0; |
332 | |
333 | // Edges to all callees in the profiled call stacks. |
334 | // TODO: Should this be a map (from Callee node) for more efficient lookup? |
335 | std::vector<std::shared_ptr<ContextEdge>> CalleeEdges; |
336 | |
337 | // Edges to all callers in the profiled call stacks. |
338 | // TODO: Should this be a map (from Caller node) for more efficient lookup? |
339 | std::vector<std::shared_ptr<ContextEdge>> CallerEdges; |
340 | |
    // Returns true if we need to look at the caller edges, in addition to the
    // callee edges, when determining the node's context ids and allocation
    // type.
343 | bool useCallerEdgesForContextInfo() const { |
344 | // Typically if the callee edges are empty either the caller edges are |
345 | // also empty, or this is an allocation (leaf node). However, if we are |
346 | // allowing recursive callsites and contexts this will be violated for |
347 | // incompletely cloned recursive cycles. |
348 | assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation || |
349 | (AllowRecursiveCallsites && AllowRecursiveContexts)); |
      // When cloning for a recursive context we might be in the midst of
      // cloning for a recurrence and have moved context ids off of a caller
      // edge onto the clone, but not yet off of the incoming caller (back)
      // edge. If we don't look at those we miss the fact that this node still
      // has context ids of interest.
355 | return IsAllocation || CloneRecursiveContexts; |
356 | } |
357 | |
358 | // Compute the context ids for this node from the union of its edge context |
359 | // ids. |
360 | DenseSet<uint32_t> getContextIds() const { |
361 | unsigned Count = 0; |
362 | // Compute the number of ids for reserve below. In general we only need to |
363 | // look at one set of edges, typically the callee edges, since other than |
364 | // allocations and in some cases during recursion cloning, all the context |
365 | // ids on the callers should also flow out via callee edges. |
366 | for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges) |
367 | Count += Edge->getContextIds().size(); |
368 | DenseSet<uint32_t> ContextIds; |
      ContextIds.reserve(Count);
370 | auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
371 | CalleeEdges, useCallerEdgesForContextInfo() |
372 | ? CallerEdges |
373 | : std::vector<std::shared_ptr<ContextEdge>>()); |
374 | for (const auto &Edge : Edges) |
375 | ContextIds.insert_range(Edge->getContextIds()); |
376 | return ContextIds; |
377 | } |
378 | |
379 | // Compute the allocation type for this node from the OR of its edge |
380 | // allocation types. |
381 | uint8_t computeAllocType() const { |
382 | uint8_t BothTypes = |
383 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
384 | uint8_t AllocType = (uint8_t)AllocationType::None; |
385 | auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
386 | CalleeEdges, useCallerEdgesForContextInfo() |
387 | ? CallerEdges |
388 | : std::vector<std::shared_ptr<ContextEdge>>()); |
389 | for (const auto &Edge : Edges) { |
390 | AllocType |= Edge->AllocTypes; |
391 | // Bail early if alloc type reached both, no further refinement. |
392 | if (AllocType == BothTypes) |
393 | return AllocType; |
394 | } |
395 | return AllocType; |
396 | } |
397 | |
398 | // The context ids set for this node is empty if its edge context ids are |
399 | // also all empty. |
400 | bool emptyContextIds() const { |
401 | auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
402 | CalleeEdges, useCallerEdgesForContextInfo() |
403 | ? CallerEdges |
404 | : std::vector<std::shared_ptr<ContextEdge>>()); |
405 | for (const auto &Edge : Edges) { |
406 | if (!Edge->getContextIds().empty()) |
407 | return false; |
408 | } |
409 | return true; |
410 | } |
411 | |
412 | // List of clones of this ContextNode, initially empty. |
413 | std::vector<ContextNode *> Clones; |
414 | |
415 | // If a clone, points to the original uncloned node. |
416 | ContextNode *CloneOf = nullptr; |
417 | |
418 | ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} |
419 | |
420 | ContextNode(bool IsAllocation, CallInfo C) |
421 | : IsAllocation(IsAllocation), Call(C) {} |
422 | |
423 | void addClone(ContextNode *Clone) { |
424 | if (CloneOf) { |
425 | CloneOf->Clones.push_back(Clone); |
426 | Clone->CloneOf = CloneOf; |
427 | } else { |
428 | Clones.push_back(Clone); |
429 | assert(!Clone->CloneOf); |
430 | Clone->CloneOf = this; |
431 | } |
432 | } |
433 | |
434 | ContextNode *getOrigNode() { |
435 | if (!CloneOf) |
436 | return this; |
437 | return CloneOf; |
438 | } |
439 | |
440 | void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
441 | unsigned int ContextId); |
442 | |
443 | ContextEdge *findEdgeFromCallee(const ContextNode *Callee); |
444 | ContextEdge *findEdgeFromCaller(const ContextNode *Caller); |
445 | void eraseCalleeEdge(const ContextEdge *Edge); |
446 | void eraseCallerEdge(const ContextEdge *Edge); |
447 | |
448 | void setCall(CallInfo C) { Call = C; } |
449 | |
450 | bool hasCall() const { return (bool)Call.call(); } |
451 | |
452 | void printCall(raw_ostream &OS) const { Call.print(OS); } |
453 | |
454 | // True if this node was effectively removed from the graph, in which case |
455 | // it should have an allocation type of None and empty context ids. |
456 | bool isRemoved() const { |
457 | // Typically if the callee edges are empty either the caller edges are |
458 | // also empty, or this is an allocation (leaf node). However, if we are |
459 | // allowing recursive callsites and contexts this will be violated for |
460 | // incompletely cloned recursive cycles. |
461 | assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
462 | (AllocTypes == (uint8_t)AllocationType::None) == |
463 | emptyContextIds()); |
464 | return AllocTypes == (uint8_t)AllocationType::None; |
465 | } |
466 | |
467 | void dump() const; |
468 | void print(raw_ostream &OS) const; |
469 | |
470 | friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { |
471 | Node.print(OS); |
472 | return OS; |
473 | } |
474 | }; |
475 | |
476 | /// Edge in the Callsite Context Graph from a ContextNode N to a caller or |
477 | /// callee. |
478 | struct ContextEdge { |
479 | ContextNode *Callee; |
480 | ContextNode *Caller; |
481 | |
482 | // This will be formed by ORing together the AllocationType enum values |
483 | // for contexts including this edge. |
484 | uint8_t AllocTypes = 0; |
485 | |
486 | // Set just before initiating cloning when cloning of recursive contexts is |
487 | // enabled. Used to defer cloning of backedges until we have done cloning of |
488 | // the callee node for non-backedge caller edges. This exposes cloning |
489 | // opportunities through the backedge of the cycle. |
490 | // TODO: Note that this is not updated during cloning, and it is unclear |
491 | // whether that would be needed. |
492 | bool IsBackedge = false; |
493 | |
494 | // The set of IDs for contexts including this edge. |
495 | DenseSet<uint32_t> ContextIds; |
496 | |
497 | ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, |
498 | DenseSet<uint32_t> ContextIds) |
499 | : Callee(Callee), Caller(Caller), AllocTypes(AllocType), |
500 | ContextIds(std::move(ContextIds)) {} |
501 | |
502 | DenseSet<uint32_t> &getContextIds() { return ContextIds; } |
503 | |
504 | // Helper to clear the fields of this edge when we are removing it from the |
505 | // graph. |
506 | inline void clear() { |
507 | ContextIds.clear(); |
508 | AllocTypes = (uint8_t)AllocationType::None; |
509 | Caller = nullptr; |
510 | Callee = nullptr; |
511 | } |
512 | |
513 | // Check if edge was removed from the graph. This is useful while iterating |
514 | // over a copy of edge lists when performing operations that mutate the |
515 | // graph in ways that might remove one of the edges. |
516 | inline bool isRemoved() const { |
517 | if (Callee || Caller) |
518 | return false; |
519 | // Any edges that have been removed from the graph but are still in a |
520 | // shared_ptr somewhere should have all fields null'ed out by clear() |
521 | // above. |
522 | assert(AllocTypes == (uint8_t)AllocationType::None); |
523 | assert(ContextIds.empty()); |
524 | return true; |
525 | } |
526 | |
527 | void dump() const; |
528 | void print(raw_ostream &OS) const; |
529 | |
530 | friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { |
531 | Edge.print(OS); |
532 | return OS; |
533 | } |
534 | }; |
535 | |
536 | /// Helpers to remove edges that have allocation type None (due to not |
537 | /// carrying any context ids) after transformations. |
538 | void removeNoneTypeCalleeEdges(ContextNode *Node); |
539 | void removeNoneTypeCallerEdges(ContextNode *Node); |
540 | void |
541 | recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node, |
542 | DenseSet<const ContextNode *> &Visited); |
543 | |
544 | protected: |
545 | /// Get a list of nodes corresponding to the stack ids in the given callsite |
546 | /// context. |
547 | template <class NodeT, class IteratorT> |
548 | std::vector<uint64_t> |
549 | getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext); |
550 | |
551 | /// Adds nodes for the given allocation and any stack ids on its memprof MIB |
552 | /// metadata (or summary). |
553 | ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); |
554 | |
555 | /// Adds nodes for the given MIB stack ids. |
556 | template <class NodeT, class IteratorT> |
557 | void addStackNodesForMIB(ContextNode *AllocNode, |
558 | CallStack<NodeT, IteratorT> &StackContext, |
559 | CallStack<NodeT, IteratorT> &CallsiteContext, |
560 | AllocationType AllocType, |
561 | ArrayRef<ContextTotalSize> ContextSizeInfo); |
562 | |
563 | /// Matches all callsite metadata (or summary) to the nodes created for |
564 | /// allocation memprof MIB metadata, synthesizing new nodes to reflect any |
565 | /// inlining performed on those callsite instructions. |
566 | void updateStackNodes(); |
567 | |
568 | /// Update graph to conservatively handle any callsite stack nodes that target |
569 | /// multiple different callee target functions. |
570 | void handleCallsitesWithMultipleTargets(); |
571 | |
572 | /// Mark backedges via the standard DFS based backedge algorithm. |
573 | void markBackedges(); |
574 | |
575 | /// Merge clones generated during cloning for different allocations but that |
576 | /// are called by the same caller node, to ensure proper function assignment. |
577 | void mergeClones(); |
578 | |
579 | // Try to partition calls on the given node (already placed into the AllCalls |
580 | // array) by callee function, creating new copies of Node as needed to hold |
581 | // calls with different callees, and moving the callee edges appropriately. |
582 | // Returns true if partitioning was successful. |
583 | bool partitionCallsByCallee( |
584 | ContextNode *Node, ArrayRef<CallInfo> AllCalls, |
585 | std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode); |
586 | |
587 | /// Save lists of calls with MemProf metadata in each function, for faster |
588 | /// iteration. |
589 | MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata; |
590 | |
591 | /// Map from callsite node to the enclosing caller function. |
592 | std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc; |
593 | |
594 | // When exporting to dot, and an allocation id is specified, contains the |
595 | // context ids on that allocation. |
596 | DenseSet<uint32_t> DotAllocContextIds; |
597 | |
598 | private: |
599 | using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator; |
600 | |
601 | // Structure to keep track of information for each call as we are matching |
602 | // non-allocation callsites onto context nodes created from the allocation |
603 | // call metadata / summary contexts. |
604 | struct CallContextInfo { |
605 | // The callsite we're trying to match. |
606 | CallTy Call; |
    // The callsite's stack ids that have a context node in the graph.
608 | std::vector<uint64_t> StackIds; |
609 | // The function containing this callsite. |
610 | const FuncTy *Func; |
611 | // Initially empty, if needed this will be updated to contain the context |
612 | // ids for use in a new context node created for this callsite. |
613 | DenseSet<uint32_t> ContextIds; |
614 | }; |
615 | |
616 | /// Helper to remove edge from graph, updating edge iterator if it is provided |
617 | /// (in which case CalleeIter indicates which edge list is being iterated). |
618 | /// This will also perform the necessary clearing of the ContextEdge members |
619 | /// to enable later checking if the edge has been removed (since we may have |
620 | /// other copies of the shared_ptr in existence, and in fact rely on this to |
621 | /// enable removal while iterating over a copy of a node's edge list). |
622 | void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr, |
623 | bool CalleeIter = true); |
624 | |
625 | /// Assigns the given Node to calls at or inlined into the location with |
626 | /// the Node's stack id, after post order traversing and processing its |
627 | /// caller nodes. Uses the call information recorded in the given |
628 | /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences |
629 | /// as needed. Called by updateStackNodes which sets up the given |
630 | /// StackIdToMatchingCalls map. |
631 | void assignStackNodesPostOrder( |
632 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
633 | DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls, |
634 | DenseMap<CallInfo, CallInfo> &CallToMatchingCall); |
635 | |
636 | /// Duplicates the given set of context ids, updating the provided |
637 | /// map from each original id with the newly generated context ids, |
638 | /// and returning the new duplicated id set. |
639 | DenseSet<uint32_t> duplicateContextIds( |
640 | const DenseSet<uint32_t> &StackSequenceContextIds, |
641 | DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
642 | |
643 | /// Propagates all duplicated context ids across the graph. |
644 | void propagateDuplicateContextIds( |
645 | const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
646 | |
647 | /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, |
648 | /// else to its callers. Also updates OrigNode's edges to remove any context |
649 | /// ids moved to the newly created edge. |
650 | void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, |
651 | bool TowardsCallee, |
652 | DenseSet<uint32_t> RemainingContextIds); |
653 | |
654 | /// Get the stack id corresponding to the given Id or Index (for IR this will |
655 | /// return itself, for a summary index this will return the id recorded in the |
656 | /// index for that stack id index value). |
657 | uint64_t getStackId(uint64_t IdOrIndex) const { |
658 | return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex); |
659 | } |
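
  // Note: getStackId and several of the helpers below dispatch through the
  // CRTP DerivedCCG parameter to the derived graph class (the IR-based
  // ModuleCallsiteContextGraph or the summary-based IndexCallsiteContextGraph
  // defined later in this file), which provides the format-specific
  // implementation.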
660 | |
661 | /// Returns true if the given call targets the callee of the given edge, or if |
662 | /// we were able to identify the call chain through intermediate tail calls. |
663 | /// In the latter case new context nodes are added to the graph for the |
664 | /// identified tail calls, and their synthesized nodes are added to |
665 | /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for |
666 | /// the updated edges and to prepare it for an increment in the caller. |
667 | bool |
668 | calleesMatch(CallTy Call, EdgeIter &EI, |
669 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap); |
670 | |
671 | // Return the callee function of the given call, or nullptr if it can't be |
672 | // determined |
673 | const FuncTy *getCalleeFunc(CallTy Call) { |
674 | return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call); |
675 | } |
676 | |
677 | /// Returns true if the given call targets the given function, or if we were |
678 | /// able to identify the call chain through intermediate tail calls (in which |
679 | /// case FoundCalleeChain will be populated). |
680 | bool calleeMatchesFunc( |
681 | CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc, |
682 | std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) { |
683 | return static_cast<DerivedCCG *>(this)->calleeMatchesFunc( |
684 | Call, Func, CallerFunc, FoundCalleeChain); |
685 | } |
686 | |
687 | /// Returns true if both call instructions have the same callee. |
688 | bool sameCallee(CallTy Call1, CallTy Call2) { |
689 | return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2); |
690 | } |
691 | |
692 | /// Get a list of nodes corresponding to the stack ids in the given |
693 | /// callsite's context. |
694 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { |
695 | return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall( |
696 | Call); |
697 | } |
698 | |
699 | /// Get the last stack id in the context for callsite. |
700 | uint64_t getLastStackId(CallTy Call) { |
701 | return static_cast<DerivedCCG *>(this)->getLastStackId(Call); |
702 | } |
703 | |
704 | /// Update the allocation call to record type of allocated memory. |
705 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { |
706 | AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; |
707 | static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType); |
708 | } |
709 | |
710 | /// Get the AllocationType assigned to the given allocation instruction clone. |
711 | AllocationType getAllocationCallType(const CallInfo &Call) const { |
712 | return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call); |
713 | } |
714 | |
715 | /// Update non-allocation call to invoke (possibly cloned) function |
716 | /// CalleeFunc. |
717 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { |
718 | static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc); |
719 | } |
720 | |
  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
723 | /// Assigns new clones to clone number CloneNo. |
724 | FuncInfo cloneFunctionForCallsite( |
725 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
726 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
727 | return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( |
728 | Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); |
729 | } |
730 | |
731 | /// Gets a label to use in the dot graph for the given call clone in the given |
732 | /// function. |
733 | std::string getLabel(const FuncTy *Func, const CallTy Call, |
734 | unsigned CloneNo) const { |
735 | return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo); |
736 | } |
737 | |
738 | // Create and return a new ContextNode. |
739 | ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr, |
740 | CallInfo C = CallInfo()) { |
741 | NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C)); |
742 | auto *NewNode = NodeOwner.back().get(); |
743 | if (F) |
744 | NodeToCallingFunc[NewNode] = F; |
745 | return NewNode; |
746 | } |
747 | |
748 | /// Helpers to find the node corresponding to the given call or stackid. |
749 | ContextNode *getNodeForInst(const CallInfo &C); |
750 | ContextNode *getNodeForAlloc(const CallInfo &C); |
751 | ContextNode *getNodeForStackId(uint64_t StackId); |
752 | |
753 | /// Computes the alloc type corresponding to the given context ids, by |
754 | /// unioning their recorded alloc types. |
755 | uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const; |
756 | |
757 | /// Returns the allocation type of the intersection of the contexts of two |
758 | /// nodes (based on their provided context id sets), optimized for the case |
759 | /// when Node1Ids is smaller than Node2Ids. |
760 | uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids, |
761 | const DenseSet<uint32_t> &Node2Ids) const; |
762 | |
763 | /// Returns the allocation type of the intersection of the contexts of two |
764 | /// nodes (based on their provided context id sets). |
765 | uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids, |
766 | const DenseSet<uint32_t> &Node2Ids) const; |
767 | |
768 | /// Create a clone of Edge's callee and move Edge to that new callee node, |
769 | /// performing the necessary context id and allocation type updates. |
770 | /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are |
771 | /// moved to an edge to the new callee. |
772 | ContextNode * |
773 | moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
774 | DenseSet<uint32_t> ContextIdsToMove = {}); |
775 | |
776 | /// Change the callee of Edge to existing callee clone NewCallee, performing |
777 | /// the necessary context id and allocation type updates. |
778 | /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are |
779 | /// moved to an edge to the new callee. |
780 | void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
781 | ContextNode *NewCallee, |
782 | bool NewClone = false, |
783 | DenseSet<uint32_t> ContextIdsToMove = {}); |
784 | |
785 | /// Change the caller of the edge at the given callee edge iterator to be |
786 | /// NewCaller, performing the necessary context id and allocation type |
787 | /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but |
788 | /// a simplified version of it as we always move the given edge and all of its |
789 | /// context ids. |
790 | void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge, |
791 | ContextNode *NewCaller); |
792 | |
793 | /// Recursive helper for marking backedges via DFS. |
794 | void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
795 | DenseSet<const ContextNode *> &CurrentStack); |
796 | |
797 | /// Recursive helper for merging clones. |
798 | void |
799 | mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
800 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode); |
801 | /// Main worker for merging callee clones for a given node. |
802 | void mergeNodeCalleeClones( |
803 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
804 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode); |
805 | /// Helper to find other callers of the given set of callee edges that can |
806 | /// share the same callee merge node. |
807 | void findOtherCallersToShareMerge( |
808 | ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges, |
809 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode, |
810 | DenseSet<ContextNode *> &OtherCallersToShareMerge); |
811 | |
812 | /// Recursively perform cloning on the graph for the given Node and its |
813 | /// callers, in order to uniquely identify the allocation behavior of an |
814 | /// allocation given its context. The context ids of the allocation being |
815 | /// processed are given in AllocContextIds. |
816 | void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
817 | const DenseSet<uint32_t> &AllocContextIds); |
818 | |
819 | /// Map from each context ID to the AllocationType assigned to that context. |
820 | DenseMap<uint32_t, AllocationType> ContextIdToAllocationType; |
821 | |
822 | /// Map from each contextID to the profiled full contexts and their total |
823 | /// sizes (there may be more than one due to context trimming), |
824 | /// optionally populated when requested (via MemProfReportHintedSizes or |
825 | /// MinClonedColdBytePercent). |
826 | DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos; |
827 | |
828 | /// Identifies the context node created for a stack id when adding the MIB |
829 | /// contexts to the graph. This is used to locate the context nodes when |
830 | /// trying to assign the corresponding callsites with those stack ids to these |
831 | /// nodes. |
832 | DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; |
833 | |
834 | /// Maps to track the calls to their corresponding nodes in the graph. |
835 | MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; |
836 | MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; |
837 | |
838 | /// Owner of all ContextNode unique_ptrs. |
839 | std::vector<std::unique_ptr<ContextNode>> NodeOwner; |
840 | |
841 | /// Perform sanity checks on graph when requested. |
842 | void check() const; |
843 | |
844 | /// Keeps track of the last unique context id assigned. |
845 | unsigned int LastContextId = 0; |
846 | }; |
847 | |
848 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
849 | using ContextNode = |
850 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode; |
851 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
852 | using ContextEdge = |
853 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge; |
854 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
855 | using FuncInfo = |
856 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo; |
857 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
858 | using CallInfo = |
859 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo; |
860 | |
861 | /// CRTP derived class for graphs built from IR (regular LTO). |
862 | class ModuleCallsiteContextGraph |
863 | : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
864 | Instruction *> { |
865 | public: |
866 | ModuleCallsiteContextGraph( |
867 | Module &M, |
868 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); |
869 | |
870 | private: |
871 | friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
872 | Instruction *>; |
873 | |
874 | uint64_t getStackId(uint64_t IdOrIndex) const; |
875 | const Function *getCalleeFunc(Instruction *Call); |
876 | bool calleeMatchesFunc( |
877 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
878 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain); |
879 | bool sameCallee(Instruction *Call1, Instruction *Call2); |
880 | bool findProfiledCalleeThroughTailCalls( |
881 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
882 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
883 | bool &FoundMultipleCalleeChains); |
884 | uint64_t getLastStackId(Instruction *Call); |
885 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); |
886 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
887 | AllocationType getAllocationCallType(const CallInfo &Call) const; |
888 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
889 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
890 | Instruction *>::FuncInfo |
891 | cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
892 | std::map<CallInfo, CallInfo> &CallMap, |
893 | std::vector<CallInfo> &CallsWithMetadataInFunc, |
894 | unsigned CloneNo); |
895 | std::string getLabel(const Function *Func, const Instruction *Call, |
896 | unsigned CloneNo) const; |
897 | |
898 | const Module &Mod; |
899 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; |
900 | }; |
901 | |
902 | /// Represents a call in the summary index graph, which can either be an |
903 | /// allocation or an interior callsite node in an allocation's context. |
904 | /// Holds a pointer to the corresponding data structure in the index. |
905 | struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { |
906 | IndexCall() : PointerUnion() {} |
907 | IndexCall(std::nullptr_t) : IndexCall() {} |
908 | IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} |
909 | IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} |
910 | IndexCall(PointerUnion PT) : PointerUnion(PT) {} |
911 | |
912 | IndexCall *operator->() { return this; } |
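  // The operator-> above lets shared template code treat an IndexCall like the
  // Instruction * used by the IR graph (e.g. call()->print(OS) in
  // CallInfo::print), since CallTy is IndexCall for the index-based graph.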
913 | |
914 | void print(raw_ostream &OS) const { |
915 | PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this; |
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
920 | assert(CI); |
921 | OS << *CI; |
922 | } |
923 | } |
924 | }; |
925 | } // namespace |
926 | |
927 | namespace llvm { |
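// Allow the cast machinery (e.g. dyn_cast_if_present) to look through an
// IndexCall to its underlying PointerUnion<CallsiteInfo *, AllocInfo *>.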
928 | template <> struct simplify_type<IndexCall> { |
929 | using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>; |
930 | static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; } |
931 | }; |
932 | template <> struct simplify_type<const IndexCall> { |
933 | using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>; |
934 | static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; } |
935 | }; |
936 | } // namespace llvm |
937 | |
938 | namespace { |
939 | /// CRTP derived class for graphs built from summary index (ThinLTO). |
940 | class IndexCallsiteContextGraph |
941 | : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
942 | IndexCall> { |
943 | public: |
944 | IndexCallsiteContextGraph( |
945 | ModuleSummaryIndex &Index, |
946 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
947 | isPrevailing); |
948 | |
949 | ~IndexCallsiteContextGraph() { |
950 | // Now that we are done with the graph it is safe to add the new |
951 | // CallsiteInfo structs to the function summary vectors. The graph nodes |
952 | // point into locations within these vectors, so we don't want to add them |
953 | // any earlier. |
954 | for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) { |
955 | auto *FS = I.first; |
956 | for (auto &Callsite : I.second) |
        FS->addCallsite(*Callsite.second);
958 | } |
959 | } |
960 | |
961 | private: |
962 | friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
963 | IndexCall>; |
964 | |
965 | uint64_t getStackId(uint64_t IdOrIndex) const; |
966 | const FunctionSummary *getCalleeFunc(IndexCall &Call); |
967 | bool calleeMatchesFunc( |
968 | IndexCall &Call, const FunctionSummary *Func, |
969 | const FunctionSummary *CallerFunc, |
970 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain); |
971 | bool sameCallee(IndexCall &Call1, IndexCall &Call2); |
972 | bool findProfiledCalleeThroughTailCalls( |
973 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
974 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
975 | bool &FoundMultipleCalleeChains); |
976 | uint64_t getLastStackId(IndexCall &Call); |
977 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); |
978 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
979 | AllocationType getAllocationCallType(const CallInfo &Call) const; |
980 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
981 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
982 | IndexCall>::FuncInfo |
983 | cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
984 | std::map<CallInfo, CallInfo> &CallMap, |
985 | std::vector<CallInfo> &CallsWithMetadataInFunc, |
986 | unsigned CloneNo); |
987 | std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, |
988 | unsigned CloneNo) const; |
989 | |
  // Saves the mapping from function summaries containing memprof records back
  // to their VIs, for use in checking and debugging.
992 | std::map<const FunctionSummary *, ValueInfo> FSToVIMap; |
993 | |
994 | const ModuleSummaryIndex &Index; |
995 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
996 | isPrevailing; |
997 | |
998 | // Saves/owns the callsite info structures synthesized for missing tail call |
999 | // frames that we discover while building the graph. |
1000 | // It maps from the summary of the function making the tail call, to a map |
1001 | // of callee ValueInfo to corresponding synthesized callsite info. |
1002 | std::unordered_map<FunctionSummary *, |
1003 | std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>> |
1004 | FunctionCalleesToSynthesizedCallsiteInfos; |
1005 | }; |
1006 | } // namespace |
1007 | |
1008 | namespace llvm { |
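// Specialize DenseMapInfo so that CallInfo (for both graph types) and
// IndexCall can be used as DenseMap / MapVector keys, e.g. in the
// call-to-context-node maps in the graph class above.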
1009 | template <> |
1010 | struct DenseMapInfo<typename CallsiteContextGraph< |
1011 | ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> |
1012 | : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; |
1013 | template <> |
1014 | struct DenseMapInfo<typename CallsiteContextGraph< |
1015 | IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> |
1016 | : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; |
1017 | template <> |
1018 | struct DenseMapInfo<IndexCall> |
1019 | : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; |
1020 | } // end namespace llvm |
1021 | |
1022 | namespace { |
1023 | |
1024 | // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc |
1025 | // type we should actually use on the corresponding allocation. |
1026 | // If we can't clone a node that has NotCold+Cold alloc type, we will fall |
1027 | // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold |
1028 | // from NotCold. |
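// For example, allocTypeToUse(NotCold | Cold) and allocTypeToUse(NotCold) both
// return AllocationType::NotCold, while allocTypeToUse(Cold) returns
// AllocationType::Cold.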
1029 | AllocationType allocTypeToUse(uint8_t AllocTypes) { |
1030 | assert(AllocTypes != (uint8_t)AllocationType::None); |
1031 | if (AllocTypes == |
1032 | ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
1033 | return AllocationType::NotCold; |
1034 | else |
1035 | return (AllocationType)AllocTypes; |
1036 | } |
1037 | |
1038 | // Helper to check if the alloc types for all edges recorded in the |
1039 | // InAllocTypes vector match the alloc types for all edges in the Edges |
1040 | // vector. |
1041 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1042 | bool allocTypesMatch( |
1043 | const std::vector<uint8_t> &InAllocTypes, |
1044 | const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>> |
1045 | &Edges) { |
1046 | // This should be called only when the InAllocTypes vector was computed for |
1047 | // this set of Edges. Make sure the sizes are the same. |
1048 | assert(InAllocTypes.size() == Edges.size()); |
1049 | return std::equal( |
1050 | InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(), |
1051 | [](const uint8_t &l, |
1052 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) { |
1053 | // Can share if one of the edges is None type - don't |
1054 | // care about the type along that edge as it doesn't |
1055 | // exist for those context ids. |
1056 | if (l == (uint8_t)AllocationType::None || |
1057 | r->AllocTypes == (uint8_t)AllocationType::None) |
1058 | return true; |
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1060 | }); |
1061 | } |
1062 | |
1063 | // Helper to check if the alloc types for all edges recorded in the |
1064 | // InAllocTypes vector match the alloc types for callee edges in the given |
1065 | // clone. Because the InAllocTypes were computed from the original node's callee |
1066 | // edges, and other cloning could have happened after this clone was created, we |
1067 | // need to find the matching clone callee edge, which may or may not exist. |
1068 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1069 | bool allocTypesMatchClone( |
1070 | const std::vector<uint8_t> &InAllocTypes, |
1071 | const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) { |
1072 | const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf; |
1073 | assert(Node); |
1074 | // InAllocTypes should have been computed for the original node's callee |
1075 | // edges. |
1076 | assert(InAllocTypes.size() == Node->CalleeEdges.size()); |
1077 | // First create a map of the clone callee edge callees to the edge alloc type. |
1078 | DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t> |
1079 | EdgeCalleeMap; |
1080 | for (const auto &E : Clone->CalleeEdges) { |
1081 | assert(!EdgeCalleeMap.contains(E->Callee)); |
1082 | EdgeCalleeMap[E->Callee] = E->AllocTypes; |
1083 | } |
1084 | // Next, walk the original node's callees, and look for the corresponding |
1085 | // clone edge to that callee. |
1086 | for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) { |
1087 | auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee); |
1088 | // Not found is ok, we will simply add an edge if we use this clone. |
1089 | if (Iter == EdgeCalleeMap.end()) |
1090 | continue; |
1091 | // Can share if one of the edges is None type - don't |
1092 | // care about the type along that edge as it doesn't |
1093 | // exist for those context ids. |
1094 | if (InAllocTypes[I] == (uint8_t)AllocationType::None || |
1095 | Iter->second == (uint8_t)AllocationType::None) |
1096 | continue; |
    if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1098 | return false; |
1099 | } |
1100 | return true; |
1101 | } |
1102 | |
1103 | } // end anonymous namespace |
1104 | |
1105 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1106 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1107 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst( |
1108 | const CallInfo &C) { |
1109 | ContextNode *Node = getNodeForAlloc(C); |
1110 | if (Node) |
1111 | return Node; |
1112 | |
1113 | return NonAllocationCallToContextNodeMap.lookup(C); |
1114 | } |
1115 | |
1116 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1117 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1118 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc( |
1119 | const CallInfo &C) { |
1120 | return AllocationCallToContextNodeMap.lookup(C); |
1121 | } |
1122 | |
1123 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1124 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1125 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId( |
1126 | uint64_t StackId) { |
1127 | auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); |
1128 | if (StackEntryNode != StackEntryIdToContextNodeMap.end()) |
1129 | return StackEntryNode->second; |
1130 | return nullptr; |
1131 | } |
1132 | |
1133 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1134 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1135 | addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
1136 | unsigned int ContextId) { |
1137 | for (auto &Edge : CallerEdges) { |
1138 | if (Edge->Caller == Caller) { |
1139 | Edge->AllocTypes |= (uint8_t)AllocType; |
1140 | Edge->getContextIds().insert(ContextId); |
1141 | return; |
1142 | } |
1143 | } |
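  // No existing edge to this Caller: create a new one and register it with
  // both endpoints (the shared_ptr edge is held by both this node's
  // CallerEdges and the Caller's CalleeEdges).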
1144 | std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>( |
1145 | this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId})); |
1146 | CallerEdges.push_back(Edge); |
1147 | Caller->CalleeEdges.push_back(Edge); |
1148 | } |
1149 | |
1150 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1151 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph( |
1152 | ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) { |
1153 | assert(!EI || (*EI)->get() == Edge); |
1154 | assert(!Edge->isRemoved()); |
1155 | // Save the Caller and Callee pointers so we can erase Edge from their edge |
1156 | // lists after clearing Edge below. We do the clearing first in case it is |
1157 | // destructed after removing from the edge lists (if those were the last |
1158 | // shared_ptr references to Edge). |
1159 | auto *Callee = Edge->Callee; |
1160 | auto *Caller = Edge->Caller; |
1161 | |
1162 | // Make sure the edge fields are cleared out so we can properly detect |
1163 | // removed edges if Edge is not destructed because there is still a shared_ptr |
1164 | // reference. |
1165 | Edge->clear(); |
1166 | |
1167 | #ifndef NDEBUG |
1168 | auto CalleeCallerCount = Callee->CallerEdges.size(); |
1169 | auto CallerCalleeCount = Caller->CalleeEdges.size(); |
1170 | #endif |
1171 | if (!EI) { |
1172 | Callee->eraseCallerEdge(Edge); |
1173 | Caller->eraseCalleeEdge(Edge); |
1174 | } else if (CalleeIter) { |
1175 | Callee->eraseCallerEdge(Edge); |
1176 | *EI = Caller->CalleeEdges.erase(*EI); |
1177 | } else { |
1178 | Caller->eraseCalleeEdge(Edge); |
1179 | *EI = Callee->CallerEdges.erase(*EI); |
1180 | } |
1181 | assert(Callee->CallerEdges.size() < CalleeCallerCount); |
1182 | assert(Caller->CalleeEdges.size() < CallerCalleeCount); |
1183 | } |
1184 | |
1185 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1186 | void CallsiteContextGraph< |
1187 | DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) { |
1188 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { |
1189 | auto Edge = *EI; |
1190 | if (Edge->AllocTypes == (uint8_t)AllocationType::None) { |
1191 | assert(Edge->ContextIds.empty()); |
      removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1193 | } else |
1194 | ++EI; |
1195 | } |
1196 | } |
1197 | |
1198 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1199 | void CallsiteContextGraph< |
1200 | DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) { |
1201 | for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) { |
1202 | auto Edge = *EI; |
1203 | if (Edge->AllocTypes == (uint8_t)AllocationType::None) { |
1204 | assert(Edge->ContextIds.empty()); |
1205 | Edge->Caller->eraseCalleeEdge(Edge.get()); |
1206 | EI = Node->CallerEdges.erase(EI); |
1207 | } else |
1208 | ++EI; |
1209 | } |
1210 | } |
1211 | |
1212 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1213 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
1214 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1215 | findEdgeFromCallee(const ContextNode *Callee) { |
1216 | for (const auto &Edge : CalleeEdges) |
1217 | if (Edge->Callee == Callee) |
1218 | return Edge.get(); |
1219 | return nullptr; |
1220 | } |
1221 | |
1222 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1223 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
1224 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1225 | findEdgeFromCaller(const ContextNode *Caller) { |
1226 | for (const auto &Edge : CallerEdges) |
1227 | if (Edge->Caller == Caller) |
1228 | return Edge.get(); |
1229 | return nullptr; |
1230 | } |
1231 | |
1232 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1233 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1234 | eraseCalleeEdge(const ContextEdge *Edge) { |
1235 | auto EI = llvm::find_if( |
1236 | CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) { |
1237 | return CalleeEdge.get() == Edge; |
1238 | }); |
1239 | assert(EI != CalleeEdges.end()); |
1240 | CalleeEdges.erase(EI); |
1241 | } |
1242 | |
1243 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1244 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
1245 | eraseCallerEdge(const ContextEdge *Edge) { |
1246 | auto EI = llvm::find_if( |
1247 | CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) { |
1248 | return CallerEdge.get() == Edge; |
1249 | }); |
1250 | assert(EI != CallerEdges.end()); |
1251 | CallerEdges.erase(EI); |
1252 | } |
1253 | |
1254 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1255 | uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType( |
1256 | DenseSet<uint32_t> &ContextIds) const { |
1257 | uint8_t BothTypes = |
1258 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
1259 | uint8_t AllocType = (uint8_t)AllocationType::None; |
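  // Union the allocation types of all contexts in the given id set.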
1260 | for (auto Id : ContextIds) { |
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1262 | // Bail early if alloc type reached both, no further refinement. |
1263 | if (AllocType == BothTypes) |
1264 | return AllocType; |
1265 | } |
1266 | return AllocType; |
1267 | } |
1268 | |
1269 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1270 | uint8_t |
1271 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl( |
1272 | const DenseSet<uint32_t> &Node1Ids, |
1273 | const DenseSet<uint32_t> &Node2Ids) const { |
1274 | uint8_t BothTypes = |
1275 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
1276 | uint8_t AllocType = (uint8_t)AllocationType::None; |
1277 | for (auto Id : Node1Ids) { |
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1281 | // Bail early if alloc type reached both, no further refinement. |
1282 | if (AllocType == BothTypes) |
1283 | return AllocType; |
1284 | } |
1285 | return AllocType; |
1286 | } |
1287 | |
1288 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1289 | uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes( |
1290 | const DenseSet<uint32_t> &Node1Ids, |
1291 | const DenseSet<uint32_t> &Node2Ids) const { |
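  // Iterate over the smaller set when computing the intersection, for
  // efficiency; the result is symmetric in the two id sets.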
1292 | if (Node1Ids.size() < Node2Ids.size()) |
1293 | return intersectAllocTypesImpl(Node1Ids, Node2Ids); |
1294 | else |
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1296 | } |
1297 | |
1298 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1299 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
1300 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode( |
1301 | CallInfo Call, const FuncTy *F) { |
1302 | assert(!getNodeForAlloc(Call)); |
  ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
  AllocationCallToContextNodeMap[Call] = AllocNode;
  // Use LastContextId as a unique id for MIB allocation nodes.
1306 | AllocNode->OrigStackOrAllocId = LastContextId; |
1307 | // Alloc type should be updated as we add in the MIBs. We should assert |
1308 | // afterwards that it is not still None. |
1309 | AllocNode->AllocTypes = (uint8_t)AllocationType::None; |
1310 | |
1311 | return AllocNode; |
1312 | } |
1313 | |
1314 | static std::string getAllocTypeString(uint8_t AllocTypes) { |
1315 | if (!AllocTypes) |
1316 | return "None" ; |
1317 | std::string Str; |
1318 | if (AllocTypes & (uint8_t)AllocationType::NotCold) |
1319 | Str += "NotCold" ; |
1320 | if (AllocTypes & (uint8_t)AllocationType::Cold) |
1321 | Str += "Cold" ; |
1322 | return Str; |
1323 | } |
1324 | |
1325 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1326 | template <class NodeT, class IteratorT> |
1327 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( |
1328 | ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, |
1329 | CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, |
1330 | ArrayRef<ContextTotalSize> ContextSizeInfo) { |
  // Treat the hot alloc type as NotCold before the disambiguation for "hot"
  // is done.
1333 | if (AllocType == AllocationType::Hot) |
1334 | AllocType = AllocationType::NotCold; |
1335 | |
1336 | ContextIdToAllocationType[++LastContextId] = AllocType; |
1337 | |
1338 | if (!ContextSizeInfo.empty()) { |
1339 | auto &Entry = ContextIdToContextSizeInfos[LastContextId]; |
    Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1341 | } |
1342 | |
1343 | // Update alloc type and context ids for this MIB. |
1344 | AllocNode->AllocTypes |= (uint8_t)AllocType; |
1345 | |
1346 | // Now add or update nodes for each stack id in alloc's context. |
1347 | // Later when processing the stack ids on non-alloc callsites we will adjust |
1348 | // for any inlining in the context. |
1349 | ContextNode *PrevNode = AllocNode; |
1350 | // Look for recursion (direct recursion should have been collapsed by |
1351 | // module summary analysis, here we should just be detecting mutual |
1352 | // recursion). Mark these nodes so we don't try to clone. |
1353 | SmallSet<uint64_t, 8> StackIdSet; |
1354 | // Skip any on the allocation call (inlining). |
1355 | for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); |
1356 | ContextIter != StackContext.end(); ++ContextIter) { |
    auto StackId = getStackId(*ContextIter);
1358 | ContextNode *StackNode = getNodeForStackId(StackId); |
1359 | if (!StackNode) { |
1360 | StackNode = createNewNode(/*IsAllocation=*/false); |
1361 | StackEntryIdToContextNodeMap[StackId] = StackNode; |
1362 | StackNode->OrigStackOrAllocId = StackId; |
1363 | } |
1364 | // Marking a node recursive will prevent its cloning completely, even for |
1365 | // non-recursive contexts flowing through it. |
1366 | if (!AllowRecursiveCallsites) { |
1367 | auto Ins = StackIdSet.insert(StackId); |
1368 | if (!Ins.second) |
1369 | StackNode->Recursive = true; |
1370 | } |
1371 | StackNode->AllocTypes |= (uint8_t)AllocType; |
1372 | PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); |
1373 | PrevNode = StackNode; |
1374 | } |
1375 | } |
1376 | |
1377 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1378 | DenseSet<uint32_t> |
1379 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds( |
1380 | const DenseSet<uint32_t> &StackSequenceContextIds, |
1381 | DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
1382 | DenseSet<uint32_t> NewContextIds; |
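  // For each original id, assign a fresh context id with the same allocation
  // type and record the old-to-new mapping for later propagation.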
1383 | for (auto OldId : StackSequenceContextIds) { |
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
    if (DotAllocContextIds.contains(OldId))
      DotAllocContextIds.insert(LastContextId);
1391 | } |
1392 | return NewContextIds; |
1393 | } |
1394 | |
1395 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1396 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
1397 | propagateDuplicateContextIds( |
1398 | const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
1399 | // Build a set of duplicated context ids corresponding to the input id set. |
1400 | auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) { |
1401 | DenseSet<uint32_t> NewIds; |
1402 | for (auto Id : ContextIds) |
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert_range(NewId->second);
1406 | return NewIds; |
1407 | }; |
1408 | |
1409 | // Recursively update context ids sets along caller edges. |
1410 | auto UpdateCallers = [&](ContextNode *Node, |
1411 | DenseSet<const ContextEdge *> &Visited, |
1412 | auto &&UpdateCallers) -> void { |
1413 | for (const auto &Edge : Node->CallerEdges) { |
1414 | auto Inserted = Visited.insert(Edge.get()); |
1415 | if (!Inserted.second) |
1416 | continue; |
1417 | ContextNode *NextNode = Edge->Caller; |
1418 | DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds()); |
1419 | // Only need to recursively iterate to NextNode via this caller edge if |
1420 | // it resulted in any added ids to NextNode. |
1421 | if (!NewIdsToAdd.empty()) { |
1422 | Edge->getContextIds().insert_range(NewIdsToAdd); |
1423 | UpdateCallers(NextNode, Visited, UpdateCallers); |
1424 | } |
1425 | } |
1426 | }; |
1427 | |
1428 | DenseSet<const ContextEdge *> Visited; |
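  // Kick off the propagation from each allocation (leaf) node.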
1429 | for (auto &Entry : AllocationCallToContextNodeMap) { |
1430 | auto *Node = Entry.second; |
1431 | UpdateCallers(Node, Visited, UpdateCallers); |
1432 | } |
1433 | } |
1434 | |
1435 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1436 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( |
1437 | ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee, |
1438 | // This must be passed by value to make a copy since it will be adjusted |
1439 | // as ids are moved. |
1440 | DenseSet<uint32_t> RemainingContextIds) { |
1441 | auto &OrigEdges = |
1442 | TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; |
1443 | DenseSet<uint32_t> RecursiveContextIds; |
1444 | DenseSet<uint32_t> AllCallerContextIds; |
1445 | if (AllowRecursiveCallsites) { |
1446 | // Identify which context ids are recursive which is needed to properly |
1447 | // update the RemainingContextIds set. The relevant recursive context ids |
1448 | // are those that are in multiple edges. |
1449 | for (auto &CE : OrigEdges) { |
      AllCallerContextIds.reserve(CE->getContextIds().size());
1451 | for (auto Id : CE->getContextIds()) |
1452 | if (!AllCallerContextIds.insert(Id).second) |
1453 | RecursiveContextIds.insert(Id); |
1454 | } |
1455 | } |
1456 | // Increment iterator in loop so that we can remove edges as needed. |
1457 | for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { |
1458 | auto Edge = *EI; |
1459 | DenseSet<uint32_t> NewEdgeContextIds; |
1460 | DenseSet<uint32_t> NotFoundContextIds; |
    // Remove any matching context ids from Edge; those that were found and
    // removed become the new edge's context ids. Also update the remaining
    // (not found) ids.
1464 | set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, |
1465 | NotFoundContextIds); |
1466 | // Update the remaining context ids set for the later edges. This is a |
1467 | // compile time optimization. |
1468 | if (RecursiveContextIds.empty()) { |
1469 | // No recursive ids, so all of the previously remaining context ids that |
1470 | // were not seen on this edge are the new remaining set. |
      RemainingContextIds.swap(NotFoundContextIds);
1472 | } else { |
1473 | // Keep the recursive ids in the remaining set as we expect to see those |
1474 | // on another edge. We can remove the non-recursive remaining ids that |
1475 | // were seen on this edge, however. We already have the set of remaining |
1476 | // ids that were on this edge (in NewEdgeContextIds). Figure out which are |
1477 | // non-recursive and only remove those. Note that despite the higher |
1478 | // overhead of updating the remaining context ids set when recursion |
1479 | // handling is enabled, it was found to be at worst performance neutral |
1480 | // and in one case a clear win. |
      DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
          set_difference(NewEdgeContextIds, RecursiveContextIds);
      set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1484 | } |
1485 | // If no matching context ids for this edge, skip it. |
1486 | if (NewEdgeContextIds.empty()) { |
1487 | ++EI; |
1488 | continue; |
1489 | } |
1490 | if (TowardsCallee) { |
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1492 | auto NewEdge = std::make_shared<ContextEdge>( |
1493 | Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds)); |
1494 | NewNode->CalleeEdges.push_back(NewEdge); |
1495 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
1496 | } else { |
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1498 | auto NewEdge = std::make_shared<ContextEdge>( |
1499 | NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds)); |
1500 | NewNode->CallerEdges.push_back(NewEdge); |
1501 | NewEdge->Caller->CalleeEdges.push_back(NewEdge); |
1502 | } |
1503 | // Remove old edge if context ids empty. |
1504 | if (Edge->getContextIds().empty()) { |
      removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1506 | continue; |
1507 | } |
1508 | ++EI; |
1509 | } |
1510 | } |
1511 | |
1512 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1513 | static void checkEdge( |
1514 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { |
1515 | // Confirm that alloc type is not None and that we have at least one context |
1516 | // id. |
1517 | assert(Edge->AllocTypes != (uint8_t)AllocationType::None); |
1518 | assert(!Edge->ContextIds.empty()); |
1519 | } |
1520 | |
1521 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1522 | static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, |
1523 | bool CheckEdges = true) { |
1524 | if (Node->isRemoved()) |
1525 | return; |
1526 | #ifndef NDEBUG |
1527 | // Compute node's context ids once for use in asserts. |
1528 | auto NodeContextIds = Node->getContextIds(); |
1529 | #endif |
1530 | // Node's context ids should be the union of both its callee and caller edge |
1531 | // context ids. |
1532 | if (Node->CallerEdges.size()) { |
1533 | DenseSet<uint32_t> CallerEdgeContextIds( |
1534 | Node->CallerEdges.front()->ContextIds); |
1535 | for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) { |
1536 | if (CheckEdges) |
1537 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
1538 | set_union(CallerEdgeContextIds, Edge->ContextIds); |
1539 | } |
1540 | // Node can have more context ids than callers if some contexts terminate at |
1541 | // node and some are longer. If we are allowing recursive callsites and |
1542 | // contexts this will be violated for incompletely cloned recursive cycles, |
1543 | // so skip the checking in that case. |
1544 | assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
1545 | NodeContextIds == CallerEdgeContextIds || |
1546 | set_is_subset(CallerEdgeContextIds, NodeContextIds)); |
1547 | } |
1548 | if (Node->CalleeEdges.size()) { |
1549 | DenseSet<uint32_t> CalleeEdgeContextIds( |
1550 | Node->CalleeEdges.front()->ContextIds); |
1551 | for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) { |
1552 | if (CheckEdges) |
1553 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
1554 | set_union(CalleeEdgeContextIds, Edge->getContextIds()); |
1555 | } |
1556 | // If we are allowing recursive callsites and contexts this will be violated |
1557 | // for incompletely cloned recursive cycles, so skip the checking in that |
1558 | // case. |
1559 | assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
1560 | NodeContextIds == CalleeEdgeContextIds); |
1561 | } |
1562 | // FIXME: Since this checking is only invoked under an option, we should |
1563 | // change the error checking from using assert to something that will trigger |
1564 | // an error on a release build. |
1565 | #ifndef NDEBUG |
1566 | // Make sure we don't end up with duplicate edges between the same caller and |
1567 | // callee. |
1568 | DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet; |
1569 | for (const auto &E : Node->CalleeEdges) |
1570 | NodeSet.insert(E->Callee); |
1571 | assert(NodeSet.size() == Node->CalleeEdges.size()); |
1572 | #endif |
1573 | } |
1574 | |
1575 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1576 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
1577 | assignStackNodesPostOrder( |
1578 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
1579 | DenseMap<uint64_t, std::vector<CallContextInfo>> |
1580 | &StackIdToMatchingCalls, |
1581 | DenseMap<CallInfo, CallInfo> &CallToMatchingCall) { |
1582 | auto Inserted = Visited.insert(Node); |
1583 | if (!Inserted.second) |
1584 | return; |
1585 | // Post order traversal. Iterate over a copy since we may add nodes and |
1586 | // therefore new callers during the recursive call, invalidating any |
1587 | // iterator over the original edge vector. We don't need to process these |
1588 | // new nodes as they were already processed on creation. |
1589 | auto CallerEdges = Node->CallerEdges; |
1590 | for (auto &Edge : CallerEdges) { |
1591 | // Skip any that have been removed during the recursion. |
1592 | if (Edge->isRemoved()) { |
1593 | assert(!is_contained(Node->CallerEdges, Edge)); |
1594 | continue; |
1595 | } |
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1597 | CallToMatchingCall); |
1598 | } |
1599 | |
1600 | // If this node's stack id is in the map, update the graph to contain new |
1601 | // nodes representing any inlining at interior callsites. Note we move the |
1602 | // associated context ids over to the new nodes. |
1603 | |
1604 | // Ignore this node if it is for an allocation or we didn't record any |
1605 | // stack id lists ending at it. |
1606 | if (Node->IsAllocation || |
1607 | !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) |
1608 | return; |
1609 | |
1610 | auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; |
1611 | // Handle the simple case first. A single call with a single stack id. |
1612 | // In this case there is no need to create any new context nodes, simply |
1613 | // assign the context node for stack id to this Call. |
1614 | if (Calls.size() == 1) { |
1615 | auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; |
1616 | if (Ids.size() == 1) { |
1617 | assert(SavedContextIds.empty()); |
1618 | // It should be this Node |
1619 | assert(Node == getNodeForStackId(Ids[0])); |
1620 | if (Node->Recursive) |
1621 | return; |
1622 | Node->setCall(Call); |
1623 | NonAllocationCallToContextNodeMap[Call] = Node; |
1624 | NodeToCallingFunc[Node] = Func; |
1625 | return; |
1626 | } |
1627 | } |
1628 | |
1629 | #ifndef NDEBUG |
1630 | // Find the node for the last stack id, which should be the same |
1631 | // across all calls recorded for this id, and is this node's id. |
1632 | uint64_t LastId = Node->OrigStackOrAllocId; |
1633 | ContextNode *LastNode = getNodeForStackId(LastId); |
1634 | // We should only have kept stack ids that had nodes. |
1635 | assert(LastNode); |
1636 | assert(LastNode == Node); |
1637 | #else |
1638 | ContextNode *LastNode = Node; |
1639 | #endif |
1640 | |
1641 | // Compute the last node's context ids once, as it is shared by all calls in |
1642 | // this entry. |
1643 | DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); |
1644 | |
1645 | [[maybe_unused]] bool PrevIterCreatedNode = false; |
1646 | bool CreatedNode = false; |
1647 | for (unsigned I = 0; I < Calls.size(); |
1648 | I++, PrevIterCreatedNode = CreatedNode) { |
1649 | CreatedNode = false; |
1650 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1651 | // Skip any for which we didn't assign any ids, these don't get a node in |
1652 | // the graph. |
1653 | if (SavedContextIds.empty()) { |
1654 | // If this call has a matching call (located in the same function and |
1655 | // having the same stack ids), simply add it to the context node created |
1656 | // for its matching call earlier. These can be treated the same through |
1657 | // cloning and get updated at the same time. |
1658 | if (!CallToMatchingCall.contains(Call)) |
1659 | continue; |
1660 | auto MatchingCall = CallToMatchingCall[Call]; |
1661 | if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) { |
1662 | // This should only happen if we had a prior iteration, and it didn't |
1663 | // create a node because of the below recomputation of context ids |
1664 | // finding none remaining and continuing early. |
1665 | assert(I > 0 && !PrevIterCreatedNode); |
1666 | continue; |
1667 | } |
1668 | NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back( |
1669 | Call); |
1670 | continue; |
1671 | } |
1672 | |
1673 | assert(LastId == Ids.back()); |
1674 | |
1675 | // Recompute the context ids for this stack id sequence (the |
1676 | // intersection of the context ids of the corresponding nodes). |
1677 | // Start with the ids we saved in the map for this call, which could be |
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and removed
    // them already during the post order traversal.
1681 | set_intersect(SavedContextIds, LastNodeContextIds); |
1682 | ContextNode *PrevNode = LastNode; |
1683 | bool Skip = false; |
1684 | // Iterate backwards through the stack Ids, starting after the last Id |
1685 | // in the list, which was handled once outside for all Calls. |
1686 | for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
1687 | auto Id = *IdIter; |
      ContextNode *CurNode = getNodeForStackId(Id);
1689 | // We should only have kept stack ids that had nodes and weren't |
1690 | // recursive. |
1691 | assert(CurNode); |
1692 | assert(!CurNode->Recursive); |
1693 | |
1694 | auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
1695 | if (!Edge) { |
1696 | Skip = true; |
1697 | break; |
1698 | } |
1699 | PrevNode = CurNode; |
1700 | |
1701 | // Update the context ids, which is the intersection of the ids along |
1702 | // all edges in the sequence. |
1703 | set_intersect(SavedContextIds, Edge->getContextIds()); |
1704 | |
1705 | // If we now have no context ids for clone, skip this call. |
1706 | if (SavedContextIds.empty()) { |
1707 | Skip = true; |
1708 | break; |
1709 | } |
1710 | } |
1711 | if (Skip) |
1712 | continue; |
1713 | |
1714 | // Create new context node. |
    ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1716 | NonAllocationCallToContextNodeMap[Call] = NewNode; |
1717 | CreatedNode = true; |
    NewNode->AllocTypes = computeAllocType(SavedContextIds);
1719 | |
    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1721 | assert(FirstNode); |
1722 | |
1723 | // Connect to callees of innermost stack frame in inlined call chain. |
    // This updates context ids for FirstNode's callees to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1727 | |
1728 | // Connect to callers of outermost stack frame in inlined call chain. |
    // This updates context ids for LastNode's callers to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1732 | |
1733 | // Now we need to remove context ids from edges/nodes between First and |
1734 | // Last Node. |
1735 | PrevNode = nullptr; |
1736 | for (auto Id : Ids) { |
      ContextNode *CurNode = getNodeForStackId(Id);
1738 | // We should only have kept stack ids that had nodes. |
1739 | assert(CurNode); |
1740 | |
1741 | // Remove the context ids moved to NewNode from CurNode, and the |
1742 | // edge from the prior node. |
1743 | if (PrevNode) { |
1744 | auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); |
1745 | // If the sequence contained recursion, we might have already removed |
1746 | // some edges during the connectNewNode calls above. |
1747 | if (!PrevEdge) { |
1748 | PrevNode = CurNode; |
1749 | continue; |
1750 | } |
1751 | set_subtract(PrevEdge->getContextIds(), SavedContextIds); |
1752 | if (PrevEdge->getContextIds().empty()) |
          removeEdgeFromGraph(PrevEdge);
1754 | } |
1755 | // Since we update the edges from leaf to tail, only look at the callee |
1756 | // edges. This isn't an alloc node, so if there are no callee edges, the |
1757 | // alloc type is None. |
1758 | CurNode->AllocTypes = CurNode->CalleeEdges.empty() |
1759 | ? (uint8_t)AllocationType::None |
1760 | : CurNode->computeAllocType(); |
1761 | PrevNode = CurNode; |
1762 | } |
1763 | if (VerifyNodes) { |
1764 | checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); |
1765 | for (auto Id : Ids) { |
        ContextNode *CurNode = getNodeForStackId(Id);
1767 | // We should only have kept stack ids that had nodes. |
1768 | assert(CurNode); |
1769 | checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true); |
1770 | } |
1771 | } |
1772 | } |
1773 | } |
1774 | |
1775 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1776 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { |
1777 | // Map of stack id to all calls with that as the last (outermost caller) |
1778 | // callsite id that has a context node (some might not due to pruning |
1779 | // performed during matching of the allocation profile contexts). |
1780 | // The CallContextInfo contains the Call and a list of its stack ids with |
1781 | // ContextNodes, the function containing Call, and the set of context ids |
1782 | // the analysis will eventually identify for use in any new node created |
1783 | // for that callsite. |
1784 | DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; |
1785 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
1786 | for (auto &Call : CallsWithMetadata) { |
1787 | // Ignore allocations, already handled. |
1788 | if (AllocationCallToContextNodeMap.count(Call)) |
1789 | continue; |
1790 | auto StackIdsWithContextNodes = |
          getStackIdsWithContextNodesForCall(Call.call());
1792 | // If there were no nodes created for MIBs on allocs (maybe this was in |
1793 | // the unambiguous part of the MIB stack that was pruned), ignore. |
1794 | if (StackIdsWithContextNodes.empty()) |
1795 | continue; |
1796 | // Otherwise, record this Call along with the list of ids for the last |
1797 | // (outermost caller) stack id with a node. |
1798 | StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( |
1799 | {Call.call(), StackIdsWithContextNodes, Func, {}}); |
1800 | } |
1801 | } |
1802 | |
1803 | // First make a pass through all stack ids that correspond to a call, |
1804 | // as identified in the above loop. Compute the context ids corresponding to |
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
1807 | // there is more than one call with the same stack ids. Their (possibly newly |
1808 | // duplicated) context ids are saved in the StackIdToMatchingCalls map. |
1809 | DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; |
1810 | // Save a map from each call to any that are found to match it. I.e. located |
1811 | // in the same function and have the same (possibly pruned) stack ids. We use |
1812 | // this to avoid creating extra graph nodes as they can be treated the same. |
1813 | DenseMap<CallInfo, CallInfo> CallToMatchingCall; |
1814 | for (auto &It : StackIdToMatchingCalls) { |
1815 | auto &Calls = It.getSecond(); |
1816 | // Skip single calls with a single stack id. These don't need a new node. |
1817 | if (Calls.size() == 1) { |
1818 | auto &Ids = Calls[0].StackIds; |
1819 | if (Ids.size() == 1) |
1820 | continue; |
1821 | } |
1822 | // In order to do the best and maximal matching of inlined calls to context |
1823 | // node sequences we will sort the vectors of stack ids in descending order |
1824 | // of length, and within each length, lexicographically by stack id. The |
1825 | // latter is so that we can specially handle calls that have identical stack |
1826 | // id sequences (either due to cloning or artificially because of the MIB |
1827 | // context pruning). Those with the same Ids are then sorted by function to |
1828 | // facilitate efficiently mapping them to the same context node. |
1829 | // Because the functions are pointers, to ensure a stable sort first assign |
1830 | // each function pointer to its first index in the Calls array, and then use |
1831 | // that to sort by. |
1832 | DenseMap<const FuncTy *, unsigned> FuncToIndex; |
1833 | for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) |
1834 | FuncToIndex.insert({CallCtxInfo.Func, Idx}); |
1835 | llvm::stable_sort( |
1836 | Calls, |
1837 | [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { |
1838 | return A.StackIds.size() > B.StackIds.size() || |
1839 | (A.StackIds.size() == B.StackIds.size() && |
1840 | (A.StackIds < B.StackIds || |
1841 | (A.StackIds == B.StackIds && |
1842 | FuncToIndex[A.Func] < FuncToIndex[B.Func]))); |
1843 | }); |
1844 | |
1845 | // Find the node for the last stack id, which should be the same |
1846 | // across all calls recorded for this id, and is the id for this |
1847 | // entry in the StackIdToMatchingCalls map. |
1848 | uint64_t LastId = It.getFirst(); |
    ContextNode *LastNode = getNodeForStackId(LastId);
1850 | // We should only have kept stack ids that had nodes. |
1851 | assert(LastNode); |
1852 | |
1853 | if (LastNode->Recursive) |
1854 | continue; |
1855 | |
1856 | // Initialize the context ids with the last node's. We will subsequently |
1857 | // refine the context ids by computing the intersection along all edges. |
1858 | DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); |
1859 | assert(!LastNodeContextIds.empty()); |
1860 | |
1861 | #ifndef NDEBUG |
1862 | // Save the set of functions seen for a particular set of the same stack |
1863 | // ids. This is used to ensure that they have been correctly sorted to be |
1864 | // adjacent in the Calls list, since we rely on that to efficiently place |
1865 | // all such matching calls onto the same context node. |
1866 | DenseSet<const FuncTy *> MatchingIdsFuncSet; |
1867 | #endif |
1868 | |
1869 | for (unsigned I = 0; I < Calls.size(); I++) { |
1870 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1871 | assert(SavedContextIds.empty()); |
1872 | assert(LastId == Ids.back()); |
1873 | |
1874 | #ifndef NDEBUG |
1875 | // If this call has a different set of ids than the last one, clear the |
1876 | // set used to ensure they are sorted properly. |
1877 | if (I > 0 && Ids != Calls[I - 1].StackIds) |
1878 | MatchingIdsFuncSet.clear(); |
1879 | #endif |
1880 | |
1881 | // First compute the context ids for this stack id sequence (the |
1882 | // intersection of the context ids of the corresponding nodes). |
1883 | // Start with the remaining saved ids for the last node. |
1884 | assert(!LastNodeContextIds.empty()); |
1885 | DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; |
1886 | |
1887 | ContextNode *PrevNode = LastNode; |
1888 | ContextNode *CurNode = LastNode; |
1889 | bool Skip = false; |
1890 | |
1891 | // Iterate backwards through the stack Ids, starting after the last Id |
1892 | // in the list, which was handled once outside for all Calls. |
1893 | for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
1894 | auto Id = *IdIter; |
        CurNode = getNodeForStackId(Id);
1896 | // We should only have kept stack ids that had nodes. |
1897 | assert(CurNode); |
1898 | |
1899 | if (CurNode->Recursive) { |
1900 | Skip = true; |
1901 | break; |
1902 | } |
1903 | |
1904 | auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
1905 | // If there is no edge then the nodes belong to different MIB contexts, |
1906 | // and we should skip this inlined context sequence. For example, this |
1907 | // particular inlined context may include stack ids A->B, and we may |
1908 | // indeed have nodes for both A and B, but it is possible that they were |
1909 | // never profiled in sequence in a single MIB for any allocation (i.e. |
1910 | // we might have profiled an allocation that involves the callsite A, |
1911 | // but through a different one of its callee callsites, and we might |
1912 | // have profiled an allocation that involves callsite B, but reached |
1913 | // from a different caller callsite). |
1914 | if (!Edge) { |
1915 | Skip = true; |
1916 | break; |
1917 | } |
1918 | PrevNode = CurNode; |
1919 | |
1920 | // Update the context ids, which is the intersection of the ids along |
1921 | // all edges in the sequence. |
1922 | set_intersect(StackSequenceContextIds, Edge->getContextIds()); |
1923 | |
1924 | // If we now have no context ids for clone, skip this call. |
1925 | if (StackSequenceContextIds.empty()) { |
1926 | Skip = true; |
1927 | break; |
1928 | } |
1929 | } |
1930 | if (Skip) |
1931 | continue; |
1932 | |
1933 | // If some of this call's stack ids did not have corresponding nodes (due |
1934 | // to pruning), don't include any context ids for contexts that extend |
1935 | // beyond these nodes. Otherwise we would be matching part of unrelated / |
1936 | // not fully matching stack contexts. To do this, subtract any context ids |
1937 | // found in caller nodes of the last node found above. |
1938 | if (Ids.back() != getLastStackId(Call)) { |
1939 | for (const auto &PE : LastNode->CallerEdges) { |
1940 | set_subtract(StackSequenceContextIds, PE->getContextIds()); |
1941 | if (StackSequenceContextIds.empty()) |
1942 | break; |
1943 | } |
1944 | // If we now have no context ids for clone, skip this call. |
1945 | if (StackSequenceContextIds.empty()) |
1946 | continue; |
1947 | } |
1948 | |
1949 | #ifndef NDEBUG |
1950 | // If the prior call had the same stack ids this set would not be empty. |
1951 | // Check if we already have a call that "matches" because it is located |
1952 | // in the same function. If the Calls list was sorted properly we should |
1953 | // not encounter this situation as all such entries should be adjacent |
1954 | // and processed in bulk further below. |
1955 | assert(!MatchingIdsFuncSet.contains(Func)); |
1956 | |
1957 | MatchingIdsFuncSet.insert(Func); |
1958 | #endif |
1959 | |
1960 | // Check if the next set of stack ids is the same (since the Calls vector |
1961 | // of tuples is sorted by the stack ids we can just look at the next one). |
1962 | // If so, save them in the CallToMatchingCall map so that they get |
1963 | // assigned to the same context node, and skip them. |
1964 | bool DuplicateContextIds = false; |
1965 | for (unsigned J = I + 1; J < Calls.size(); J++) { |
1966 | auto &CallCtxInfo = Calls[J]; |
1967 | auto &NextIds = CallCtxInfo.StackIds; |
1968 | if (NextIds != Ids) |
1969 | break; |
1970 | auto *NextFunc = CallCtxInfo.Func; |
1971 | if (NextFunc != Func) { |
1972 | // We have another Call with the same ids but that cannot share this |
1973 | // node, must duplicate ids for it. |
1974 | DuplicateContextIds = true; |
1975 | break; |
1976 | } |
1977 | auto &NextCall = CallCtxInfo.Call; |
1978 | CallToMatchingCall[NextCall] = Call; |
1979 | // Update I so that it gets incremented correctly to skip this call. |
1980 | I = J; |
1981 | } |
1982 | |
1983 | // If we don't have duplicate context ids, then we can assign all the |
1984 | // context ids computed for the original node sequence to this call. |
1985 | // If there are duplicate calls with the same stack ids then we synthesize |
1986 | // new context ids that are duplicates of the originals. These are |
1987 | // assigned to SavedContextIds, which is a reference into the map entry |
1988 | // for this call, allowing us to access these ids later on. |
      OldToNewContextIds.reserve(OldToNewContextIds.size() +
1990 | StackSequenceContextIds.size()); |
1991 | SavedContextIds = |
1992 | DuplicateContextIds |
1993 | ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) |
1994 | : StackSequenceContextIds; |
1995 | assert(!SavedContextIds.empty()); |
1996 | |
1997 | if (!DuplicateContextIds) { |
1998 | // Update saved last node's context ids to remove those that are |
1999 | // assigned to other calls, so that it is ready for the next call at |
2000 | // this stack id. |
        set_subtract(LastNodeContextIds, StackSequenceContextIds);
2002 | if (LastNodeContextIds.empty()) |
2003 | break; |
2004 | } |
2005 | } |
2006 | } |
2007 | |
2008 | // Propagate the duplicate context ids over the graph. |
2009 | propagateDuplicateContextIds(OldToNewContextIds); |
2010 | |
2011 | if (VerifyCCG) |
2012 | check(); |
2013 | |
2014 | // Now perform a post-order traversal over the graph, starting with the |
2015 | // allocation nodes, essentially processing nodes from callers to callees. |
2016 | // For any that contains an id in the map, update the graph to contain new |
2017 | // nodes representing any inlining at interior callsites. Note we move the |
2018 | // associated context ids over to the new nodes. |
2019 | DenseSet<const ContextNode *> Visited; |
2020 | for (auto &Entry : AllocationCallToContextNodeMap) |
    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2022 | CallToMatchingCall); |
2023 | if (VerifyCCG) |
2024 | check(); |
2025 | } |
2026 | |
2027 | uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { |
2028 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
      Call->getMetadata(LLVMContext::MD_callsite));
2030 | return CallsiteContext.back(); |
2031 | } |
2032 | |
2033 | uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { |
2034 | assert(isa<CallsiteInfo *>(Call)); |
2035 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
  // Need to convert index into stack id.
  return Index.getStackIdAtIndex(CallsiteContext.back());
2039 | } |
2040 | |
static const std::string MemProfCloneSuffix = ".memprof.";
2042 | |
2043 | static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { |
2044 | // We use CloneNo == 0 to refer to the original version, which doesn't get |
2045 | // renamed with a suffix. |
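  // Clone N > 0 is named Base + ".memprof." + N, e.g. "foo.memprof.2".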
2046 | if (!CloneNo) |
2047 | return Base.str(); |
2048 | return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); |
2049 | } |
2050 | |
2051 | static bool isMemProfClone(const Function &F) { |
  return F.getName().contains(MemProfCloneSuffix);
2053 | } |
2054 | |
2055 | std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, |
2056 | const Instruction *Call, |
2057 | unsigned CloneNo) const { |
2058 | return (Twine(Call->getFunction()->getName()) + " -> " + |
          cast<CallBase>(Call)->getCalledFunction()->getName())
2060 | .str(); |
2061 | } |
2062 | |
2063 | std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, |
2064 | const IndexCall &Call, |
2065 | unsigned CloneNo) const { |
  auto VI = FSToVIMap.find(Func);
  assert(VI != FSToVIMap.end());
  if (isa<AllocInfo *>(Call))
    return (VI->second.name() + " -> alloc").str();
  else {
    auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
    return (VI->second.name() + " -> " +
            getMemProfFuncName(Callsite->Callee.name(),
                               Callsite->Clones[CloneNo]))
2075 | .str(); |
2076 | } |
2077 | } |
2078 | |
2079 | std::vector<uint64_t> |
2080 | ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( |
2081 | Instruction *Call) { |
2082 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
      Call->getMetadata(LLVMContext::MD_callsite));
2084 | return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( |
2085 | CallsiteContext); |
2086 | } |
2087 | |
2088 | std::vector<uint64_t> |
2089 | IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { |
2090 | assert(isa<CallsiteInfo *>(Call)); |
2091 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2093 | return getStackIdsWithContextNodes<CallsiteInfo, |
2094 | SmallVector<unsigned>::const_iterator>( |
2095 | CallsiteContext); |
2096 | } |
2097 | |
2098 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2099 | template <class NodeT, class IteratorT> |
2100 | std::vector<uint64_t> |
2101 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( |
2102 | CallStack<NodeT, IteratorT> &CallsiteContext) { |
2103 | std::vector<uint64_t> StackIds; |
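  // Walk the callsite context, stopping at the first stack id that has no
  // context node (contexts beyond that point were pruned).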
2104 | for (auto IdOrIndex : CallsiteContext) { |
2105 | auto StackId = getStackId(IdOrIndex); |
2106 | ContextNode *Node = getNodeForStackId(StackId); |
2107 | if (!Node) |
2108 | break; |
2109 | StackIds.push_back(StackId); |
2110 | } |
2111 | return StackIds; |
2112 | } |
2113 | |
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2115 | Module &M, |
2116 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) |
2117 | : Mod(M), OREGetter(OREGetter) { |
2118 | for (auto &F : M) { |
2119 | std::vector<CallInfo> CallsWithMetadata; |
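    // Collect the calls in this function carrying memprof and/or callsite
    // metadata.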
2120 | for (auto &BB : F) { |
2121 | for (auto &I : BB) { |
        if (!isa<CallBase>(I))
          continue;
        if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
          CallsWithMetadata.push_back(&I);
          auto *AllocNode = addAllocNode(&I, &F);
          auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2128 | assert(CallsiteMD); |
2129 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); |
2130 | // Add all of the MIBs and their stack nodes. |
2131 | for (auto &MDOp : MemProfMD->operands()) { |
            auto *MIBMD = cast<const MDNode>(MDOp);
2133 | std::vector<ContextTotalSize> ContextSizeInfo; |
2134 | // Collect the context size information if it exists. |
2135 | if (MIBMD->getNumOperands() > 2) { |
2136 | for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) { |
2137 | MDNode *ContextSizePair = |
                    dyn_cast<MDNode>(MIBMD->getOperand(I));
                assert(ContextSizePair->getNumOperands() == 2);
                uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
                                           ContextSizePair->getOperand(0))
                                           ->getZExtValue();
                uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
                                         ContextSizePair->getOperand(1))
                                         ->getZExtValue();
                ContextSizeInfo.push_back({FullStackId, TotalSize});
2147 | } |
2148 | } |
            MDNode *StackNode = getMIBStackNode(MIBMD);
2150 | assert(StackNode); |
2151 | CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); |
2152 | addStackNodesForMIB<MDNode, MDNode::op_iterator>( |
2153 | AllocNode, StackContext, CallsiteContext, |
                getMIBAllocType(MIBMD), ContextSizeInfo);
2155 | } |
2156 | // If exporting the graph to dot and an allocation id of interest was |
2157 | // specified, record all the context ids for this allocation node. |
2158 | if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot) |
2159 | DotAllocContextIds = AllocNode->getContextIds(); |
2160 | assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
2161 | // Memprof and callsite metadata on memory allocations no longer |
2162 | // needed. |
          I.setMetadata(LLVMContext::MD_memprof, nullptr);
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
2165 | } |
2166 | // For callsite metadata, add to list for this function for later use. |
        else if (I.getMetadata(LLVMContext::MD_callsite)) {
          CallsWithMetadata.push_back(&I);
2169 | } |
2170 | } |
2171 | } |
2172 | if (!CallsWithMetadata.empty()) |
2173 | FuncToCallsWithMetadata[&F] = CallsWithMetadata; |
2174 | } |
2175 | |
2176 | if (DumpCCG) { |
2177 | dbgs() << "CCG before updating call stack chains:\n" ; |
2178 | dbgs() << *this; |
2179 | } |
2180 | |
2181 | if (ExportToDot) |
2182 | exportToDot(Label: "prestackupdate" ); |
2183 | |
2184 | updateStackNodes(); |
2185 | |
2186 | if (ExportToDot) |
2187 | exportToDot(Label: "poststackupdate" ); |
2188 | |
2189 | handleCallsitesWithMultipleTargets(); |
2190 | |
2191 | markBackedges(); |
2192 | |
2193 | // Strip off remaining callsite metadata, no longer needed. |
2194 | for (auto &FuncEntry : FuncToCallsWithMetadata) |
2195 | for (auto &Call : FuncEntry.second) |
      Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2197 | } |
2198 | |
2199 | IndexCallsiteContextGraph::IndexCallsiteContextGraph( |
2200 | ModuleSummaryIndex &Index, |
2201 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
2202 | isPrevailing) |
2203 | : Index(Index), isPrevailing(isPrevailing) { |
2204 | for (auto &I : Index) { |
    auto VI = Index.getValueInfo(I);
2206 | for (auto &S : VI.getSummaryList()) { |
2207 | // We should only add the prevailing nodes. Otherwise we may try to clone |
2208 | // in a weak copy that won't be linked (and may be different than the |
2209 | // prevailing version). |
2210 | // We only keep the memprof summary on the prevailing copy now when |
2211 | // building the combined index, as a space optimization, however don't |
2212 | // rely on this optimization. The linker doesn't resolve local linkage |
2213 | // values so don't check whether those are prevailing. |
      if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2215 | !isPrevailing(VI.getGUID(), S.get())) |
2216 | continue; |
      auto *FS = dyn_cast<FunctionSummary>(S.get());
2218 | if (!FS) |
2219 | continue; |
2220 | std::vector<CallInfo> CallsWithMetadata; |
2221 | if (!FS->allocs().empty()) { |
2222 | for (auto &AN : FS->mutableAllocs()) { |
2223 | // This can happen because of recursion elimination handling that |
2224 | // currently exists in ModuleSummaryAnalysis. Skip these for now. |
2225 | // We still added them to the summary because we need to be able to |
2226 | // correlate properly in applyImport in the backends. |
2227 | if (AN.MIBs.empty()) |
2228 | continue; |
2229 | IndexCall AllocCall(&AN); |
          CallsWithMetadata.push_back(AllocCall);
          auto *AllocNode = addAllocNode(AllocCall, FS);
2232 | // Pass an empty CallStack to the CallsiteContext (second) |
2233 | // parameter, since for ThinLTO we already collapsed out the inlined |
2234 | // stack ids on the allocation call during ModuleSummaryAnalysis. |
2235 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
2236 | EmptyContext; |
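          // I indexes the per-MIB context size info, which parallels AN.MIBs.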
2237 | unsigned I = 0; |
2238 | assert(!metadataMayIncludeContextSizeInfo() || |
2239 | AN.ContextSizeInfos.size() == AN.MIBs.size()); |
2240 | // Now add all of the MIBs and their stack nodes. |
2241 | for (auto &MIB : AN.MIBs) { |
2242 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
2243 | StackContext(&MIB); |
2244 | std::vector<ContextTotalSize> ContextSizeInfo; |
2245 | if (!AN.ContextSizeInfos.empty()) { |
2246 | for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I]) |
                ContextSizeInfo.push_back({FullStackId, TotalSize});
2248 | } |
2249 | addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( |
              AllocNode, StackContext, EmptyContext, MIB.AllocType,
2251 | ContextSizeInfo); |
2252 | I++; |
2253 | } |
2254 | // If exporting the graph to dot and an allocation id of interest was |
2255 | // specified, record all the context ids for this allocation node. |
2256 | if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot) |
2257 | DotAllocContextIds = AllocNode->getContextIds(); |
2258 | assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
          // Initialize version 0 on the summary alloc node to the current
          // alloc type, unless it has both types, in which case make it
          // default, so that if we are unable to clone, the original version
          // always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2264 | } |
2265 | } |
2266 | // For callsite metadata, add to list for this function for later use. |
2267 | if (!FS->callsites().empty()) |
2268 | for (auto &SN : FS->mutableCallsites()) { |
2269 | IndexCall StackNodeCall(&SN); |
            CallsWithMetadata.push_back(StackNodeCall);
2271 | } |
2272 | |
2273 | if (!CallsWithMetadata.empty()) |
2274 | FuncToCallsWithMetadata[FS] = CallsWithMetadata; |
2275 | |
2276 | if (!FS->allocs().empty() || !FS->callsites().empty()) |
2277 | FSToVIMap[FS] = VI; |
2278 | } |
2279 | } |
2280 | |
2281 | if (DumpCCG) { |
2282 | dbgs() << "CCG before updating call stack chains:\n" ; |
2283 | dbgs() << *this; |
2284 | } |
2285 | |
2286 | if (ExportToDot) |
2287 | exportToDot(Label: "prestackupdate" ); |
2288 | |
2289 | updateStackNodes(); |
2290 | |
2291 | if (ExportToDot) |
2292 | exportToDot(Label: "poststackupdate" ); |
2293 | |
2294 | handleCallsitesWithMultipleTargets(); |
2295 | |
2296 | markBackedges(); |
2297 | } |
2298 | |
2299 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2300 | void CallsiteContextGraph<DerivedCCG, FuncTy, |
2301 | CallTy>::handleCallsitesWithMultipleTargets() { |
2302 | // Look for and workaround callsites that call multiple functions. |
  // This can happen for indirect calls, which need better handling, and in
2304 | // more rare cases (e.g. macro expansion). |
2305 | // TODO: To fix this for indirect calls we will want to perform speculative |
2306 | // devirtualization using either the normal PGO info with ICP, or using the |
2307 | // information in the profiled MemProf contexts. We can do this prior to |
2308 | // this transformation for regular LTO, and for ThinLTO we can simulate that |
2309 | // effect in the summary and perform the actual speculative devirtualization |
2310 | // while cloning in the ThinLTO backend. |
2311 | |
2312 | // Keep track of the new nodes synthesized for discovered tail calls missing |
2313 | // from the profiled contexts. |
2314 | MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; |
2315 | |
2316 | std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode; |
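  // NewCallToNode records calls that become a node's new primary call; the
  // node map entries are added after the loop to avoid invalidating its
  // iteration.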
2317 | for (auto &Entry : NonAllocationCallToContextNodeMap) { |
2318 | auto *Node = Entry.second; |
2319 | assert(Node->Clones.empty()); |
2320 | // Check all node callees and see if in the same function. |
2321 | // We need to check all of the calls recorded in this Node, because in some |
2322 | // cases we may have had multiple calls with the same debug info calling |
2323 | // different callees. This can happen, for example, when an object is |
    // constructed in the parameter list - the destructor call of the object has
2325 | // the same debug info (line/col) as the call the object was passed to. |
2326 | // Here we will prune any that don't match all callee nodes. |
2327 | std::vector<CallInfo> AllCalls; |
2328 | AllCalls.reserve(Node->MatchingCalls.size() + 1); |
2329 | AllCalls.push_back(Node->Call); |
2330 | llvm::append_range(AllCalls, Node->MatchingCalls); |
2331 | |
2332 | // First see if we can partition the calls by callee function, creating new |
2333 | // nodes to host each set of calls calling the same callees. This is |
    // necessary to support indirect calls with ThinLTO, for which we
2335 | // synthesized CallsiteInfo records for each target. They will all have the |
2336 | // same callsite stack ids and would be sharing a context node at this |
2337 | // point. We need to perform separate cloning for each, which will be |
2338 | // applied along with speculative devirtualization in the ThinLTO backends |
2339 | // as needed. Note this does not currently support looking through tail |
2340 | // calls, it is unclear if we need that for indirect call targets. |
2341 | // First partition calls by callee func. Map indexed by func, value is |
2342 | // struct with list of matching calls, assigned node. |
2343 | if (partitionCallsByCallee(Node, AllCalls, NewCallToNode)) |
2344 | continue; |
2345 | |
2346 | auto It = AllCalls.begin(); |
2347 | // Iterate through the calls until we find the first that matches. |
2348 | for (; It != AllCalls.end(); ++It) { |
2349 | auto ThisCall = *It; |
2350 | bool Match = true; |
2351 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); |
2352 | ++EI) { |
2353 | auto Edge = *EI; |
2354 | if (!Edge->Callee->hasCall()) |
2355 | continue; |
2356 | assert(NodeToCallingFunc.count(Edge->Callee)); |
2357 | // Check if the called function matches that of the callee node. |
        if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2359 | Match = false; |
2360 | break; |
2361 | } |
2362 | } |
2363 | // Found a call that matches the callee nodes, we can quit now. |
2364 | if (Match) { |
2365 | // If the first match is not the primary call on the Node, update it |
2366 | // now. We will update the list of matching calls further below. |
2367 | if (Node->Call != ThisCall) { |
2368 | Node->setCall(ThisCall); |
2369 | // We need to update the NonAllocationCallToContextNodeMap, but don't |
2370 | // want to do this during iteration over that map, so save the calls |
2371 | // that need updated entries. |
2372 | NewCallToNode.push_back({ThisCall, Node}); |
2373 | } |
2374 | break; |
2375 | } |
2376 | } |
2377 | // We will update this list below (or leave it cleared if there was no |
2378 | // match found above). |
2379 | Node->MatchingCalls.clear(); |
2380 | // If we hit the end of the AllCalls vector, no call matching the callee |
2381 | // nodes was found, clear the call information in the node. |
2382 | if (It == AllCalls.end()) { |
2383 | RemovedEdgesWithMismatchedCallees++; |
2384 | // Work around by setting Node to have a null call, so it gets |
2385 | // skipped during cloning. Otherwise assignFunctions will assert |
2386 | // because its data structures are not designed to handle this case. |
2387 | Node->setCall(CallInfo()); |
2388 | continue; |
2389 | } |
2390 | // Now add back any matching calls that call the same function as the |
2391 | // matching primary call on Node. |
2392 | for (++It; It != AllCalls.end(); ++It) { |
2393 | auto ThisCall = *It; |
2394 | if (!sameCallee(Node->Call.call(), ThisCall.call()))
2395 | continue; |
2396 | Node->MatchingCalls.push_back(ThisCall); |
2397 | } |
2398 | } |
2399 | |
2400 | // Remove all mismatched nodes identified in the above loop from the node map |
2401 | // (checking whether they have a null call which is set above). For a |
2402 | // MapVector like NonAllocationCallToContextNodeMap it is much more efficient |
2403 | // to do the removal via remove_if than by individually erasing entries above. |
2404 | // Also remove any entries if we updated the node's primary call above. |
2405 | NonAllocationCallToContextNodeMap.remove_if([](const auto &it) { |
2406 | return !it.second->hasCall() || it.second->Call != it.first; |
2407 | }); |
2408 | |
2409 | // Add entries for any new primary calls recorded above. |
2410 | for (auto &[Call, Node] : NewCallToNode) |
2411 | NonAllocationCallToContextNodeMap[Call] = Node; |
2412 | |
2413 | // Add the new nodes after the above loop so that the iteration is not |
2414 | // invalidated. |
2415 | for (auto &[Call, Node] : TailCallToContextNodeMap) |
2416 | NonAllocationCallToContextNodeMap[Call] = Node; |
2417 | } |
2418 | |
2419 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2420 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee( |
2421 | ContextNode *Node, ArrayRef<CallInfo> AllCalls, |
2422 | std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) { |
2423 | // Struct to keep track of all the calls having the same callee function, |
2424 | // and the context node we eventually assign to this group of calls (recorded
2425 | // once it is created or reused below).
2426 | struct CallsWithSameCallee { |
2427 | std::vector<CallInfo> Calls; |
2428 | ContextNode *Node = nullptr; |
2429 | }; |
2430 | |
2431 | // First partition calls by callee function. Build map from each function |
2432 | // to the list of matching calls. |
2433 | DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo; |
2434 | for (auto ThisCall : AllCalls) { |
2435 | auto *F = getCalleeFunc(ThisCall.call());
2436 | if (F) |
2437 | CalleeFuncToCallInfo[F].Calls.push_back(ThisCall); |
2438 | } |
2439 | |
2440 | // Next, walk through all callee edges. For each callee node, get its |
2441 | // containing function and see if it was recorded in the above map (meaning we |
2442 | // have at least one matching call). Build another map from each callee node |
2443 | // with a matching call to the structure instance created above containing all |
2444 | // the calls. |
2445 | DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo; |
2446 | for (const auto &Edge : Node->CalleeEdges) { |
2447 | if (!Edge->Callee->hasCall()) |
2448 | continue; |
2449 | const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
2450 | if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc)) |
2451 | CalleeNodeToCallInfo[Edge->Callee] = |
2452 | &CalleeFuncToCallInfo[ProfiledCalleeFunc]; |
2453 | } |
2454 | |
2455 | // If there are no entries in the second map, then there were no matching
2456 | // calls/callees and nothing to do here. Return so we can go on to the handling
2457 | // that looks through tail calls.
2458 | if (CalleeNodeToCallInfo.empty()) |
2459 | return false; |
2460 | |
2461 | // Walk through all callee edges again. Any and all callee edges that didn't |
2462 | // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a |
2463 | // new caller node (UnmatchedCalleesNode) which gets a null call so that it is |
2464 | // ignored during cloning. If it is in the map, then we use the node recorded |
2465 | // in that entry (creating it if needed), and move the callee edge to it. |
2466 | // The first callee will use the original node instead of creating a new one. |
2467 | // Note that any of the original calls on this node (in AllCalls) that didn't |
2468 | // have a callee function automatically get dropped from the node as part of |
2469 | // this process. |
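// For example (illustrative): if AllCalls contains two calls whose callee is A
// and one whose callee is B, the calls to A can remain on the original node
// (along with the callee edges into A), while the call to B is moved to a new
// node together with the callee edges into B.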
2470 | ContextNode *UnmatchedCalleesNode = nullptr; |
2471 | // Track whether we already assigned original node to a callee. |
2472 | bool UsedOrigNode = false; |
2473 | assert(NodeToCallingFunc[Node]); |
2474 | // Iterate over a copy of Node's callee edges, since we may need to remove |
2475 | // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and |
2476 | // makes it less error-prone. |
2477 | auto CalleeEdges = Node->CalleeEdges; |
2478 | for (auto &Edge : CalleeEdges) { |
2479 | if (!Edge->Callee->hasCall()) |
2480 | continue; |
2481 | |
2482 | // Will be updated below to point to whatever (caller) node this callee edge |
2483 | // should be moved to. |
2484 | ContextNode *CallerNodeToUse = nullptr; |
2485 | |
2486 | // Handle the case where there were no matching calls first. Move this |
2487 | // callee edge to the UnmatchedCalleesNode, creating it if needed. |
2488 | if (!CalleeNodeToCallInfo.contains(Edge->Callee)) { |
2489 | if (!UnmatchedCalleesNode) |
2490 | UnmatchedCalleesNode = |
2491 | createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2492 | CallerNodeToUse = UnmatchedCalleesNode; |
2493 | } else { |
2494 | // Look up the information recorded for this callee node, and use the |
2495 | // recorded caller node (creating it if needed). |
2496 | auto *Info = CalleeNodeToCallInfo[Edge->Callee]; |
2497 | if (!Info->Node) { |
2498 | // If we haven't yet assigned any callees to the original node, use it.
2499 | if (!UsedOrigNode) { |
2500 | Info->Node = Node; |
2501 | // Clear the set of matching calls which will be updated below. |
2502 | Node->MatchingCalls.clear(); |
2503 | UsedOrigNode = true; |
2504 | } else |
2505 | Info->Node = |
2506 | createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2507 | assert(!Info->Calls.empty()); |
2508 | // The first call becomes the primary call for this caller node, and the |
2509 | // rest go in the matching calls list. |
2510 | Info->Node->setCall(Info->Calls.front()); |
2511 | llvm::append_range(Info->Node->MatchingCalls, |
2512 | llvm::drop_begin(Info->Calls)); |
2513 | // Save the primary call to node correspondence so that we can update |
2514 | // the NonAllocationCallToContextNodeMap, which is being iterated in the |
2515 | // caller of this function. |
2516 | NewCallToNode.push_back({Info->Node->Call, Info->Node}); |
2517 | } |
2518 | CallerNodeToUse = Info->Node; |
2519 | } |
2520 | |
2521 | // Don't need to move the edge if we are using the original node.
2522 | if (CallerNodeToUse == Node) |
2523 | continue; |
2524 | |
2525 | moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2526 | } |
2527 | // Now that we are done moving edges, clean up any caller edges that ended |
2528 | // up with no type or context ids. During moveCalleeEdgeToNewCaller all |
2529 | // caller edges from Node are replicated onto the new callers, and it |
2530 | // simplifies the handling to leave them until we have moved all |
2531 | // edges/context ids. |
2532 | for (auto &I : CalleeNodeToCallInfo) |
2533 | removeNoneTypeCallerEdges(I.second->Node);
2534 | if (UnmatchedCalleesNode) |
2535 | removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2536 | removeNoneTypeCallerEdges(Node); |
2537 | |
2538 | return true; |
2539 | } |
2540 | |
2541 | uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
2542 | // In the Module (IR) case this is already the Id. |
2543 | return IdOrIndex; |
2544 | } |
2545 | |
2546 | uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
2547 | // In the Index case this is an index into the stack id list in the summary |
2548 | // index, convert it to an Id. |
2549 | return Index.getStackIdAtIndex(IdOrIndex);
2550 | } |
2551 | |
2552 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2553 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( |
2554 | CallTy Call, EdgeIter &EI, |
2555 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { |
2556 | auto Edge = *EI; |
2557 | const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
2558 | const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; |
2559 | // Will be populated in order of callee to caller if we find a chain of tail |
2560 | // calls between the profiled caller and callee. |
2561 | std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; |
2562 | if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2563 | FoundCalleeChain)) |
2564 | return false; |
2565 | |
2566 | // The usual case where the profiled callee matches that of the IR/summary. |
2567 | if (FoundCalleeChain.empty()) |
2568 | return true; |
2569 | |
2570 | auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { |
2571 | auto *CurEdge = Callee->findEdgeFromCaller(Caller); |
2572 | // If there is already an edge between these nodes, simply update it and |
2573 | // return. |
2574 | if (CurEdge) { |
2575 | CurEdge->ContextIds.insert_range(Edge->ContextIds); |
2576 | CurEdge->AllocTypes |= Edge->AllocTypes; |
2577 | return; |
2578 | } |
2579 | // Otherwise, create a new edge and insert it into the caller and callee |
2580 | // lists. |
2581 | auto NewEdge = std::make_shared<ContextEdge>( |
2582 | Callee, Caller, Edge->AllocTypes, Edge->ContextIds); |
2583 | Callee->CallerEdges.push_back(NewEdge); |
2584 | if (Caller == Edge->Caller) { |
2585 | // If we are inserting the new edge into the current edge's caller, insert |
2586 | // the new edge before the current iterator position, and then increment |
2587 | // back to the current edge. |
2588 | EI = Caller->CalleeEdges.insert(EI, NewEdge); |
2589 | ++EI; |
2590 | assert(*EI == Edge && |
2591 | "Iterator position not restored after insert and increment" ); |
2592 | } else |
2593 | Caller->CalleeEdges.push_back(NewEdge); |
2594 | }; |
2595 | |
2596 | // Create new nodes for each found callee and connect in between the profiled |
2597 | // caller and callee. |
2598 | auto *CurCalleeNode = Edge->Callee; |
2599 | for (auto &[NewCall, Func] : FoundCalleeChain) { |
2600 | ContextNode *NewNode = nullptr; |
2601 | // First check if we have already synthesized a node for this tail call. |
2602 | if (TailCallToContextNodeMap.count(NewCall)) { |
2603 | NewNode = TailCallToContextNodeMap[NewCall]; |
2604 | NewNode->AllocTypes |= Edge->AllocTypes; |
2605 | } else { |
2606 | FuncToCallsWithMetadata[Func].push_back({NewCall}); |
2607 | // Create Node and record node info. |
2608 | NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2609 | TailCallToContextNodeMap[NewCall] = NewNode; |
2610 | NewNode->AllocTypes = Edge->AllocTypes; |
2611 | } |
2612 | |
2613 | // Hook up node to its callee node |
2614 | AddEdge(NewNode, CurCalleeNode); |
2615 | |
2616 | CurCalleeNode = NewNode; |
2617 | } |
2618 | |
2619 | // Hook up edge's original caller to new callee node. |
2620 | AddEdge(Edge->Caller, CurCalleeNode); |
2621 | |
2622 | #ifndef NDEBUG |
2623 | // Save this because Edge's fields get cleared below when removed. |
2624 | auto *Caller = Edge->Caller; |
2625 | #endif |
2626 | |
2627 | // Remove old edge |
2628 | removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2629 | |
2630 | // To simplify the increment of EI in the caller, subtract one from EI. |
2631 | // In the final AddEdge call we would have either added a new callee edge
2632 | // to Edge->Caller, or found an existing one. Either way we are guaranteed
2633 | // that there is at least one callee edge. |
2634 | assert(!Caller->CalleeEdges.empty()); |
2635 | --EI; |
2636 | |
2637 | return true; |
2638 | } |
2639 | |
2640 | bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
2641 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
2642 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
2643 | bool &FoundMultipleCalleeChains) { |
2644 | // Stop recursive search if we have already explored the maximum specified |
2645 | // depth. |
2646 | if (Depth > TailCallSearchDepth) |
2647 | return false; |
2648 | |
2649 | auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { |
2650 | FoundCalleeChain.push_back({Callsite, F});
2651 | }; |
2652 | |
2653 | auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2654 | if (!CalleeFunc) {
2655 | auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2656 | assert(Alias);
2657 | CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2658 | assert(CalleeFunc); |
2659 | } |
2660 | |
2661 | // Look for tail calls in this function, and check if they either call the |
2662 | // profiled callee directly, or indirectly (via a recursive search). |
2663 | // Only succeed if there is a single unique tail call chain found between the |
2664 | // profiled caller and callee, otherwise we could perform incorrect cloning. |
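// For example (illustrative): if the profile recorded caller A calling the
// profiled callee C, but in the IR A actually calls B, which in turn tail
// calls C, then B's tail call to C is recorded in FoundCalleeChain so that the
// caller of this routine can synthesize a context node for it between A's and
// C's nodes.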
2665 | bool FoundSingleCalleeChain = false; |
2666 | for (auto &BB : *CalleeFunc) { |
2667 | for (auto &I : BB) { |
2668 | auto *CB = dyn_cast<CallBase>(&I);
2669 | if (!CB || !CB->isTailCall()) |
2670 | continue; |
2671 | auto *CalledValue = CB->getCalledOperand(); |
2672 | auto *CalledFunction = CB->getCalledFunction(); |
2673 | if (CalledValue && !CalledFunction) { |
2674 | CalledValue = CalledValue->stripPointerCasts(); |
2675 | // Stripping pointer casts can reveal a called function. |
2676 | CalledFunction = dyn_cast<Function>(CalledValue);
2677 | } |
2678 | // Check if this is an alias to a function. If so, get the |
2679 | // called aliasee for the checks below. |
2680 | if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2681 | assert(!CalledFunction && |
2682 | "Expected null called function in callsite for alias" ); |
2683 | CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2684 | } |
2685 | if (!CalledFunction) |
2686 | continue; |
2687 | if (CalledFunction == ProfiledCallee) { |
2688 | if (FoundSingleCalleeChain) { |
2689 | FoundMultipleCalleeChains = true; |
2690 | return false; |
2691 | } |
2692 | FoundSingleCalleeChain = true; |
2693 | FoundProfiledCalleeCount++; |
2694 | FoundProfiledCalleeDepth += Depth; |
2695 | if (Depth > FoundProfiledCalleeMaxDepth) |
2696 | FoundProfiledCalleeMaxDepth = Depth; |
2697 | SaveCallsiteInfo(&I, CalleeFunc); |
2698 | } else if (findProfiledCalleeThroughTailCalls( |
2699 | ProfiledCallee, CalledFunction, Depth + 1,
2700 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2701 | // findProfiledCalleeThroughTailCalls should not have returned |
2702 | // true if FoundMultipleCalleeChains. |
2703 | assert(!FoundMultipleCalleeChains); |
2704 | if (FoundSingleCalleeChain) { |
2705 | FoundMultipleCalleeChains = true; |
2706 | return false; |
2707 | } |
2708 | FoundSingleCalleeChain = true; |
2709 | SaveCallsiteInfo(&I, CalleeFunc); |
2710 | } else if (FoundMultipleCalleeChains) |
2711 | return false; |
2712 | } |
2713 | } |
2714 | |
2715 | return FoundSingleCalleeChain; |
2716 | } |
2717 | |
2718 | const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) { |
2719 | auto *CB = dyn_cast<CallBase>(Call);
2720 | if (!CB->getCalledOperand() || CB->isIndirectCall()) |
2721 | return nullptr; |
2722 | auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
2723 | auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2724 | if (Alias)
2725 | return dyn_cast<Function>(Alias->getAliasee());
2726 | return dyn_cast<Function>(CalleeVal);
2727 | } |
2728 | |
2729 | bool ModuleCallsiteContextGraph::calleeMatchesFunc( |
2730 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
2731 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { |
2732 | auto *CB = dyn_cast<CallBase>(Call);
2733 | if (!CB->getCalledOperand() || CB->isIndirectCall()) |
2734 | return false; |
2735 | auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
2736 | auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2737 | if (CalleeFunc == Func) |
2738 | return true; |
2739 | auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2740 | if (Alias && Alias->getAliasee() == Func) |
2741 | return true; |
2742 | |
2743 | // Recursively search for the profiled callee through tail calls starting with |
2744 | // the actual Callee. The discovered tail call chain is saved in |
2745 | // FoundCalleeChain, and we will fix up the graph to include these callsites
2746 | // after returning. |
2747 | // FIXME: We will currently redo the same recursive walk if we find the same |
2748 | // mismatched callee from another callsite. We can improve this with more |
2749 | // bookkeeping of the created chain of new nodes for each mismatch. |
2750 | unsigned Depth = 1; |
2751 | bool FoundMultipleCalleeChains = false; |
2752 | if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2753 | FoundCalleeChain, |
2754 | FoundMultipleCalleeChains)) { |
2755 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " |
2756 | << Func->getName() << " from " << CallerFunc->getName() |
2757 | << " that actually called " << CalleeVal->getName() |
2758 | << (FoundMultipleCalleeChains |
2759 | ? " (found multiple possible chains)" |
2760 | : "" ) |
2761 | << "\n" ); |
2762 | if (FoundMultipleCalleeChains) |
2763 | FoundProfiledCalleeNonUniquelyCount++; |
2764 | return false; |
2765 | } |
2766 | |
2767 | return true; |
2768 | } |
2769 | |
2770 | bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1, |
2771 | Instruction *Call2) { |
2772 | auto *CB1 = cast<CallBase>(Call1);
2773 | if (!CB1->getCalledOperand() || CB1->isIndirectCall()) |
2774 | return false; |
2775 | auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts(); |
2776 | auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2777 | auto *CB2 = cast<CallBase>(Call2);
2778 | if (!CB2->getCalledOperand() || CB2->isIndirectCall()) |
2779 | return false; |
2780 | auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts(); |
2781 | auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2782 | return CalleeFunc1 == CalleeFunc2; |
2783 | } |
2784 | |
2785 | bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
2786 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
2787 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
2788 | bool &FoundMultipleCalleeChains) { |
2789 | // Stop recursive search if we have already explored the maximum specified |
2790 | // depth. |
2791 | if (Depth > TailCallSearchDepth) |
2792 | return false; |
2793 | |
2794 | auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { |
2795 | // Make a CallsiteInfo for each discovered callee, if one hasn't already |
2796 | // been synthesized. |
2797 | if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2798 | !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2799 | // StackIds is empty (we don't have debug info available in the index for |
2800 | // these callsites) |
2801 | FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] = |
2802 | std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2803 | CallsiteInfo *NewCallsiteInfo = |
2804 | FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get(); |
2805 | FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2806 | }; |
2807 | |
2808 | // Look for tail calls in this function, and check if they either call the |
2809 | // profiled callee directly, or indirectly (via a recursive search). |
2810 | // Only succeed if there is a single unique tail call chain found between the |
2811 | // profiled caller and callee, otherwise we could perform incorrect cloning. |
2812 | bool FoundSingleCalleeChain = false; |
2813 | for (auto &S : CurCallee.getSummaryList()) { |
2814 | if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2815 | !isPrevailing(CurCallee.getGUID(), S.get())) |
2816 | continue; |
2817 | auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2818 | if (!FS) |
2819 | continue; |
2820 | auto FSVI = CurCallee; |
2821 | auto *AS = dyn_cast<AliasSummary>(S.get());
2822 | if (AS) |
2823 | FSVI = AS->getAliaseeVI(); |
2824 | for (auto &CallEdge : FS->calls()) { |
2825 | if (!CallEdge.second.hasTailCall()) |
2826 | continue; |
2827 | if (CallEdge.first == ProfiledCallee) { |
2828 | if (FoundSingleCalleeChain) { |
2829 | FoundMultipleCalleeChains = true; |
2830 | return false; |
2831 | } |
2832 | FoundSingleCalleeChain = true; |
2833 | FoundProfiledCalleeCount++; |
2834 | FoundProfiledCalleeDepth += Depth; |
2835 | if (Depth > FoundProfiledCalleeMaxDepth) |
2836 | FoundProfiledCalleeMaxDepth = Depth; |
2837 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2838 | // Add FS to FSToVIMap in case it isn't already there. |
2839 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2840 | FSToVIMap[FS] = FSVI; |
2841 | } else if (findProfiledCalleeThroughTailCalls( |
2842 | ProfiledCallee, CallEdge.first, Depth + 1,
2843 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2844 | // findProfiledCalleeThroughTailCalls should not have returned |
2845 | // true if FoundMultipleCalleeChains. |
2846 | assert(!FoundMultipleCalleeChains); |
2847 | if (FoundSingleCalleeChain) { |
2848 | FoundMultipleCalleeChains = true; |
2849 | return false; |
2850 | } |
2851 | FoundSingleCalleeChain = true; |
2852 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2853 | // Add FS to FSToVIMap in case it isn't already there. |
2854 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2855 | FSToVIMap[FS] = FSVI; |
2856 | } else if (FoundMultipleCalleeChains) |
2857 | return false; |
2858 | } |
2859 | } |
2860 | |
2861 | return FoundSingleCalleeChain; |
2862 | } |
2863 | |
2864 | const FunctionSummary * |
2865 | IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) { |
2866 | ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2867 | if (Callee.getSummaryList().empty()) |
2868 | return nullptr; |
2869 | return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2870 | } |
2871 | |
2872 | bool IndexCallsiteContextGraph::calleeMatchesFunc( |
2873 | IndexCall &Call, const FunctionSummary *Func, |
2874 | const FunctionSummary *CallerFunc, |
2875 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { |
2876 | ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2877 | // If there is no summary list then this is a call to an externally defined |
2878 | // symbol. |
2879 | AliasSummary *Alias = |
2880 | Callee.getSummaryList().empty() |
2881 | ? nullptr |
2882 | : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2883 | assert(FSToVIMap.count(Func)); |
2884 | auto FuncVI = FSToVIMap[Func]; |
2885 | if (Callee == FuncVI || |
2886 | // If callee is an alias, check the aliasee, since only function |
2887 | // summary base objects will contain the stack node summaries and thus |
2888 | // get a context node. |
2889 | (Alias && Alias->getAliaseeVI() == FuncVI)) |
2890 | return true; |
2891 | |
2892 | // Recursively search for the profiled callee through tail calls starting with |
2893 | // the actual Callee. The discovered tail call chain is saved in |
2894 | // FoundCalleeChain, and we will fix up the graph to include these callsites
2895 | // after returning. |
2896 | // FIXME: We will currently redo the same recursive walk if we find the same |
2897 | // mismatched callee from another callsite. We can improve this with more |
2898 | // bookkeeping of the created chain of new nodes for each mismatch. |
2899 | unsigned Depth = 1; |
2900 | bool FoundMultipleCalleeChains = false; |
2901 | if (!findProfiledCalleeThroughTailCalls( |
2902 | FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2903 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI |
2904 | << " from " << FSToVIMap[CallerFunc] |
2905 | << " that actually called " << Callee |
2906 | << (FoundMultipleCalleeChains |
2907 | ? " (found multiple possible chains)" |
2908 | : "" ) |
2909 | << "\n" ); |
2910 | if (FoundMultipleCalleeChains) |
2911 | FoundProfiledCalleeNonUniquelyCount++; |
2912 | return false; |
2913 | } |
2914 | |
2915 | return true; |
2916 | } |
2917 | |
2918 | bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { |
2919 | ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2920 | ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2921 | return Callee1 == Callee2; |
2922 | } |
2923 | |
2924 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2925 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() |
2926 | const { |
2927 | print(dbgs());
2928 | dbgs() << "\n" ; |
2929 | } |
2930 | |
2931 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2932 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( |
2933 | raw_ostream &OS) const { |
2934 | OS << "Node " << this << "\n" ; |
2935 | OS << "\t" ; |
2936 | printCall(OS); |
2937 | if (Recursive) |
2938 | OS << " (recursive)" ; |
2939 | OS << "\n" ; |
2940 | if (!MatchingCalls.empty()) { |
2941 | OS << "\tMatchingCalls:\n" ; |
2942 | for (auto &MatchingCall : MatchingCalls) { |
2943 | OS << "\t" ; |
2944 | MatchingCall.print(OS); |
2945 | OS << "\n" ; |
2946 | } |
2947 | } |
2948 | OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n" ; |
2949 | OS << "\tContextIds:" ; |
2950 | // Make a copy of the computed context ids that we can sort for stability. |
2951 | auto ContextIds = getContextIds(); |
2952 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2953 | std::sort(SortedIds.begin(), SortedIds.end());
2954 | for (auto Id : SortedIds) |
2955 | OS << " " << Id; |
2956 | OS << "\n" ; |
2957 | OS << "\tCalleeEdges:\n" ; |
2958 | for (auto &Edge : CalleeEdges) |
2959 | OS << "\t\t" << *Edge << "\n" ; |
2960 | OS << "\tCallerEdges:\n" ; |
2961 | for (auto &Edge : CallerEdges) |
2962 | OS << "\t\t" << *Edge << "\n" ; |
2963 | if (!Clones.empty()) { |
2964 | OS << "\tClones: " << llvm::interleaved(Clones) << "\n" ; |
2965 | } else if (CloneOf) { |
2966 | OS << "\tClone of " << CloneOf << "\n" ; |
2967 | } |
2968 | } |
2969 | |
2970 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2971 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() |
2972 | const { |
2973 | print(dbgs());
2974 | dbgs() << "\n" ; |
2975 | } |
2976 | |
2977 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2978 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( |
2979 | raw_ostream &OS) const { |
2980 | OS << "Edge from Callee " << Callee << " to Caller: " << Caller |
2981 | << (IsBackedge ? " (BE)" : "" ) |
2982 | << " AllocTypes: " << getAllocTypeString(AllocTypes); |
2983 | OS << " ContextIds:" ; |
2984 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2985 | std::sort(SortedIds.begin(), SortedIds.end());
2986 | for (auto Id : SortedIds) |
2987 | OS << " " << Id; |
2988 | } |
2989 | |
2990 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2991 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { |
2992 | print(dbgs());
2993 | } |
2994 | |
2995 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2996 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( |
2997 | raw_ostream &OS) const { |
2998 | OS << "Callsite Context Graph:\n" ; |
2999 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3000 | for (const auto Node : nodes<GraphType>(this)) { |
3001 | if (Node->isRemoved()) |
3002 | continue; |
3003 | Node->print(OS); |
3004 | OS << "\n" ; |
3005 | } |
3006 | } |
3007 | |
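// Print, for each context reaching an allocation node, its total profiled size
// and the allocation type it ended up with after cloning. Each entry is
// emitted as a line of the form (illustrative, field values are examples):
//   MemProf hinting: cold full allocation context <full stack id> with total
//   size <bytes> is notcold after cloning (context id <id>)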
3008 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3009 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes( |
3010 | raw_ostream &OS) const { |
3011 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3012 | for (const auto Node : nodes<GraphType>(this)) { |
3013 | if (Node->isRemoved()) |
3014 | continue; |
3015 | if (!Node->IsAllocation) |
3016 | continue; |
3017 | DenseSet<uint32_t> ContextIds = Node->getContextIds(); |
3018 | auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3019 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
3020 | std::sort(SortedIds.begin(), SortedIds.end());
3021 | for (auto Id : SortedIds) { |
3022 | auto TypeI = ContextIdToAllocationType.find(Id);
3023 | assert(TypeI != ContextIdToAllocationType.end()); |
3024 | auto CSI = ContextIdToContextSizeInfos.find(Id);
3025 | if (CSI != ContextIdToContextSizeInfos.end()) { |
3026 | for (auto &Info : CSI->second) { |
3027 | OS << "MemProf hinting: " |
3028 | << getAllocTypeString((uint8_t)TypeI->second)
3029 | << " full allocation context " << Info.FullStackId |
3030 | << " with total size " << Info.TotalSize << " is " |
3031 | << getAllocTypeString(Node->AllocTypes) << " after cloning" ; |
3032 | if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall) |
3033 | OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3034 | << " due to cold byte percent" ; |
3035 | // Print the internal context id to aid debugging and visualization. |
3036 | OS << " (context id " << Id << ")" ; |
3037 | OS << "\n" ; |
3038 | } |
3039 | } |
3040 | } |
3041 | } |
3042 | } |
3043 | |
3044 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3045 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { |
3046 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3047 | for (const auto Node : nodes<GraphType>(this)) { |
3048 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
3049 | for (auto &Edge : Node->CallerEdges) |
3050 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
3051 | } |
3052 | } |
3053 | |
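// GraphTraits specialization so that generic graph algorithms and the DOT
// writer below can traverse the CallsiteContextGraph: the node list is the set
// of context nodes owned by NodeOwner, and a node's children are the callee
// nodes reached via its callee edges.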
3054 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3055 | struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { |
3056 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3057 | using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; |
3058 | |
3059 | using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; |
3060 | static NodeRef getNode(const NodePtrTy &P) { return P.get(); } |
3061 | |
3062 | using nodes_iterator = |
3063 | mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, |
3064 | decltype(&getNode)>; |
3065 | |
3066 | static nodes_iterator nodes_begin(GraphType G) { |
3067 | return nodes_iterator(G->NodeOwner.begin(), &getNode); |
3068 | } |
3069 | |
3070 | static nodes_iterator nodes_end(GraphType G) { |
3071 | return nodes_iterator(G->NodeOwner.end(), &getNode); |
3072 | } |
3073 | |
3074 | static NodeRef getEntryNode(GraphType G) { |
3075 | return G->NodeOwner.begin()->get(); |
3076 | } |
3077 | |
3078 | using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; |
3079 | static const ContextNode<DerivedCCG, FuncTy, CallTy> * |
3080 | GetCallee(const EdgePtrTy &P) { |
3081 | return P->Callee; |
3082 | } |
3083 | |
3084 | using ChildIteratorType = |
3085 | mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< |
3086 | DerivedCCG, FuncTy, CallTy>>>::const_iterator, |
3087 | decltype(&GetCallee)>; |
3088 | |
3089 | static ChildIteratorType child_begin(NodeRef N) { |
3090 | return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); |
3091 | } |
3092 | |
3093 | static ChildIteratorType child_end(NodeRef N) { |
3094 | return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); |
3095 | } |
3096 | }; |
3097 | |
3098 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3099 | struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> |
3100 | : public DefaultDOTGraphTraits { |
3101 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) { |
3102 | // If the user requested the full graph to be exported, but provided an |
3103 | // allocation id, or if the user gave a context id and requested more than |
3104 | // just a specific context to be exported, note that highlighting is |
3105 | // enabled. |
3106 | DoHighlight = |
3107 | (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) || |
3108 | (ContextIdForDot.getNumOccurrences() && |
3109 | DotGraphScope != DotScope::Context); |
3110 | } |
3111 | |
3112 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
3113 | using GTraits = GraphTraits<GraphType>; |
3114 | using NodeRef = typename GTraits::NodeRef; |
3115 | using ChildIteratorType = typename GTraits::ChildIteratorType; |
3116 | |
3117 | static std::string getNodeLabel(NodeRef Node, GraphType G) { |
3118 | std::string LabelString = |
3119 | (Twine("OrigId: " ) + (Node->IsAllocation ? "Alloc" : "" ) + |
3120 | Twine(Node->OrigStackOrAllocId)) |
3121 | .str(); |
3122 | LabelString += "\n" ; |
3123 | if (Node->hasCall()) { |
3124 | auto Func = G->NodeToCallingFunc.find(Node); |
3125 | assert(Func != G->NodeToCallingFunc.end()); |
3126 | LabelString += |
3127 | G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); |
3128 | } else { |
3129 | LabelString += "null call" ; |
3130 | if (Node->Recursive) |
3131 | LabelString += " (recursive)" ; |
3132 | else |
3133 | LabelString += " (external)" ; |
3134 | } |
3135 | return LabelString; |
3136 | } |
3137 | |
3138 | static std::string getNodeAttributes(NodeRef Node, GraphType G) { |
3139 | auto ContextIds = Node->getContextIds(); |
3140 | // If highlighting enabled, see if this node contains any of the context ids |
3141 | // of interest. If so, it will use a different color and a larger fontsize |
3142 | // (which makes the node larger as well). |
3143 | bool Highlight = false; |
3144 | if (DoHighlight) { |
3145 | assert(ContextIdForDot.getNumOccurrences() || |
3146 | AllocIdForDot.getNumOccurrences()); |
3147 | if (ContextIdForDot.getNumOccurrences()) |
3148 | Highlight = ContextIds.contains(ContextIdForDot); |
3149 | else |
3150 | Highlight = set_intersects(ContextIds, G->DotAllocContextIds); |
3151 | } |
3152 | std::string AttributeString = (Twine("tooltip=\"" ) + getNodeId(Node) + " " + |
3153 | getContextIds(ContextIds) + "\"" ) |
3154 | .str(); |
3155 | // Default fontsize is 14 |
3156 | if (Highlight) |
3157 | AttributeString += ",fontsize=\"30\"" ; |
3158 | AttributeString += |
3159 | (Twine(",fillcolor=\"" ) + getColor(Node->AllocTypes, Highlight) + "\"" )
3160 | .str(); |
3161 | if (Node->CloneOf) { |
3162 | AttributeString += ",color=\"blue\"" ; |
3163 | AttributeString += ",style=\"filled,bold,dashed\"" ; |
3164 | } else |
3165 | AttributeString += ",style=\"filled\"" ; |
3166 | return AttributeString; |
3167 | } |
3168 | |
3169 | static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, |
3170 | GraphType G) { |
3171 | auto &Edge = *(ChildIter.getCurrent()); |
3172 | // If highlighting enabled, see if this edge contains any of the context ids |
3173 | // of interest. If so, it will use a different color and a heavier arrow |
3174 | // size and weight (the larger weight makes the highlighted path |
3175 | // straighter). |
3176 | bool Highlight = false; |
3177 | if (DoHighlight) { |
3178 | assert(ContextIdForDot.getNumOccurrences() || |
3179 | AllocIdForDot.getNumOccurrences()); |
3180 | if (ContextIdForDot.getNumOccurrences()) |
3181 | Highlight = Edge->ContextIds.contains(ContextIdForDot); |
3182 | else |
3183 | Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds); |
3184 | } |
3185 | auto Color = getColor(Edge->AllocTypes, Highlight);
3186 | std::string AttributeString = |
3187 | (Twine("tooltip=\"" ) + getContextIds(Edge->ContextIds) + "\"" +
3188 | // fillcolor is the arrow head and color is the line |
3189 | Twine(",fillcolor=\"" ) + Color + "\"" + Twine(",color=\"" ) + Color + |
3190 | "\"" ) |
3191 | .str(); |
3192 | if (Edge->IsBackedge) |
3193 | AttributeString += ",style=\"dotted\"" ; |
3194 | // Default penwidth and weight are both 1. |
3195 | if (Highlight) |
3196 | AttributeString += ",penwidth=\"2.0\",weight=\"2\"" ; |
3197 | return AttributeString; |
3198 | } |
3199 | |
3200 | // Since the NodeOwner list includes nodes that are no longer connected to
3201 | // the graph, skip them here. |
3202 | static bool isNodeHidden(NodeRef Node, GraphType G) { |
3203 | if (Node->isRemoved()) |
3204 | return true; |
3205 | // If a scope smaller than the full graph was requested, see if this node |
3206 | // contains any of the context ids of interest. |
3207 | if (DotGraphScope == DotScope::Alloc) |
3208 | return !set_intersects(Node->getContextIds(), G->DotAllocContextIds); |
3209 | if (DotGraphScope == DotScope::Context) |
3210 | return !Node->getContextIds().contains(ContextIdForDot); |
3211 | return false; |
3212 | } |
3213 | |
3214 | private: |
3215 | static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { |
3216 | std::string IdString = "ContextIds:" ; |
3217 | if (ContextIds.size() < 100) { |
3218 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
3219 | std::sort(SortedIds.begin(), SortedIds.end());
3220 | for (auto Id : SortedIds) |
3221 | IdString += (" " + Twine(Id)).str(); |
3222 | } else { |
3223 | IdString += (" (" + Twine(ContextIds.size()) + " ids)" ).str(); |
3224 | } |
3225 | return IdString; |
3226 | } |
3227 | |
3228 | static std::string getColor(uint8_t AllocTypes, bool Highlight) { |
3229 | // If DoHighlight is not enabled, we want to use the highlight colors for |
3230 | // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is |
3231 | // both compatible with the color scheme before highlighting was supported, |
3232 | // and for the NotCold+Cold color the non-highlight color is a bit more |
3233 | // readable. |
3234 | if (AllocTypes == (uint8_t)AllocationType::NotCold) |
3235 | // Color "brown1" actually looks like a lighter red. |
3236 | return !DoHighlight || Highlight ? "brown1" : "lightpink" ; |
3237 | if (AllocTypes == (uint8_t)AllocationType::Cold) |
3238 | return !DoHighlight || Highlight ? "cyan" : "lightskyblue" ; |
3239 | if (AllocTypes == |
3240 | ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
3241 | return Highlight ? "magenta" : "mediumorchid1" ; |
3242 | return "gray" ; |
3243 | } |
3244 | |
3245 | static std::string getNodeId(NodeRef Node) { |
3246 | std::stringstream SStream; |
3247 | SStream << std::hex << "N0x" << (unsigned long long)Node; |
3248 | std::string Result = SStream.str(); |
3249 | return Result; |
3250 | } |
3251 | |
3252 | // True if we should highlight a specific context or allocation's contexts in |
3253 | // the emitted graph. |
3254 | static bool DoHighlight; |
3255 | }; |
3256 | |
3257 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3258 | bool DOTGraphTraits< |
3259 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight = |
3260 | false; |
3261 | |
3262 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3263 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( |
3264 | std::string Label) const { |
3265 | WriteGraph(this, "" , false, Label, |
3266 | DotFilePathPrefix + "ccg." + Label + ".dot" ); |
3267 | } |
3268 | |
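// Create a new clone of Edge's callee node and move the given context ids
// (or all of Edge's ids, if ContextIdsToMove is empty) from Edge over to the
// new clone, which is returned.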
3269 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3270 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
3271 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( |
3272 | const std::shared_ptr<ContextEdge> &Edge, |
3273 | DenseSet<uint32_t> ContextIdsToMove) { |
3274 | ContextNode *Node = Edge->Callee; |
3275 | assert(NodeToCallingFunc.count(Node)); |
3276 | ContextNode *Clone = |
3277 | createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3278 | Node->addClone(Clone); |
3279 | Clone->MatchingCalls = Node->MatchingCalls; |
3280 | moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3281 | ContextIdsToMove); |
3282 | return Clone; |
3283 | } |
3284 | |
3285 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3286 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
3287 | moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
3288 | ContextNode *NewCallee, bool NewClone, |
3289 | DenseSet<uint32_t> ContextIdsToMove) { |
3290 | // NewCallee and Edge's current callee must be clones of the same original |
3291 | // node (Edge's current callee may be the original node too). |
3292 | assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); |
3293 | |
3294 | bool EdgeIsRecursive = Edge->Callee == Edge->Caller; |
3295 | |
3296 | ContextNode *OldCallee = Edge->Callee; |
3297 | |
3298 | // We might already have an edge to the new callee from earlier cloning for a |
3299 | // different allocation. If one exists we will reuse it. |
3300 | auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); |
3301 | |
3302 | // Callers will pass an empty ContextIdsToMove set when they want to move the |
3303 | // edge. Copy in Edge's ids for simplicity. |
3304 | if (ContextIdsToMove.empty()) |
3305 | ContextIdsToMove = Edge->getContextIds(); |
3306 | |
3307 | // If we are moving all of Edge's ids, then just move the whole Edge. |
3308 | // Otherwise only move the specified subset, to a new edge if needed. |
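// For example (illustrative): if Edge carries context ids {1,2,3} and
// ContextIdsToMove is {1,2,3}, the edge itself is redirected to NewCallee (or
// merged into an existing edge from the same caller); if ContextIdsToMove is
// just {1}, id 1 is split onto a (possibly new) edge to NewCallee and removed
// from the original Edge.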
3309 | if (Edge->getContextIds().size() == ContextIdsToMove.size()) { |
3310 | // First, update the alloc types on New Callee from Edge. |
3311 | // Do this before we potentially clear Edge's fields below! |
3312 | NewCallee->AllocTypes |= Edge->AllocTypes; |
3313 | // Moving the whole Edge. |
3314 | if (ExistingEdgeToNewCallee) { |
3315 | // Since we already have an edge to NewCallee, simply move the ids |
3316 | // onto it, and remove the existing Edge. |
3317 | ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove); |
3318 | ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; |
3319 | assert(Edge->ContextIds == ContextIdsToMove); |
3320 | removeEdgeFromGraph(Edge.get());
3321 | } else { |
3322 | // Otherwise just reconnect Edge to NewCallee. |
3323 | Edge->Callee = NewCallee; |
3324 | NewCallee->CallerEdges.push_back(Edge); |
3325 | // Remove it from callee where it was previously connected. |
3326 | OldCallee->eraseCallerEdge(Edge.get()); |
3327 | // Don't need to update Edge's context ids since we are simply |
3328 | // reconnecting it. |
3329 | } |
3330 | } else { |
3331 | // Only moving a subset of Edge's ids. |
3332 | // Compute the alloc type of the subset of ids being moved. |
3333 | auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3334 | if (ExistingEdgeToNewCallee) { |
3335 | // Since we already have an edge to NewCallee, simply move the ids |
3336 | // onto it. |
3337 | ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove); |
3338 | ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; |
3339 | } else { |
3340 | // Otherwise, create a new edge to NewCallee for the ids being moved. |
3341 | auto NewEdge = std::make_shared<ContextEdge>( |
3342 | NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); |
3343 | Edge->Caller->CalleeEdges.push_back(NewEdge); |
3344 | NewCallee->CallerEdges.push_back(NewEdge); |
3345 | } |
3346 | // In either case, need to update the alloc types on NewCallee, and remove |
3347 | // those ids and update the alloc type on the original Edge. |
3348 | NewCallee->AllocTypes |= CallerEdgeAllocType; |
3349 | set_subtract(Edge->ContextIds, ContextIdsToMove); |
3350 | Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3351 | } |
3352 | // Now walk the old callee node's callee edges and move Edge's context ids |
3353 | // over to the corresponding edge into the clone (which is created here if |
3354 | // this is a newly created clone). |
3355 | for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { |
3356 | ContextNode *CalleeToUse = OldCalleeEdge->Callee; |
3357 | // If this is a direct recursion edge, use NewCallee (the clone) as the |
3358 | // callee as well, so that any edge updated/created here is also direct |
3359 | // recursive. |
3360 | if (CalleeToUse == OldCallee) { |
3361 | // If this is a recursive edge, see if we already moved a recursive edge |
3362 | // (which would have to have been this one) - if we were only moving a |
3363 | // subset of context ids it would still be on OldCallee. |
3364 | if (EdgeIsRecursive) { |
3365 | assert(OldCalleeEdge == Edge); |
3366 | continue; |
3367 | } |
3368 | CalleeToUse = NewCallee; |
3369 | } |
3370 | // The context ids moving to the new callee are the subset of this edge's |
3371 | // context ids and the context ids on the caller edge being moved. |
3372 | DenseSet<uint32_t> EdgeContextIdsToMove = |
3373 | set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); |
3374 | set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); |
3375 | OldCalleeEdge->AllocTypes = |
3376 | computeAllocType(OldCalleeEdge->getContextIds());
3377 | if (!NewClone) { |
3378 | // Update context ids / alloc type on corresponding edge to NewCallee. |
3379 | // There is a chance this may not exist if we are reusing an existing |
3380 | // clone, specifically during function assignment, where we would have |
3381 | // removed none type edges after creating the clone. If we can't find |
3382 | // a corresponding edge there, fall through to the cloning below. |
3383 | if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) { |
3384 | NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove); |
3385 | NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3386 | continue; |
3387 | } |
3388 | } |
3389 | auto NewEdge = std::make_shared<ContextEdge>( |
3390 | CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3391 | EdgeContextIdsToMove); |
3392 | NewCallee->CalleeEdges.push_back(NewEdge); |
3393 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
3394 | } |
3395 | // Recompute the node alloc type now that its callee edges have been |
3396 | // updated (since we will compute from those edges). |
3397 | OldCallee->AllocTypes = OldCallee->computeAllocType(); |
3398 | // OldCallee alloc type should be None iff its context id set is now empty. |
3399 | assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == |
3400 | OldCallee->emptyContextIds()); |
3401 | if (VerifyCCG) { |
3402 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); |
3403 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); |
3404 | for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) |
3405 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee, |
3406 | /*CheckEdges=*/false); |
3407 | for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) |
3408 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee, |
3409 | /*CheckEdges=*/false); |
3410 | } |
3411 | } |
3412 | |
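// Move the given callee edge so that NewCaller becomes its caller, copying the
// relevant context ids from the old caller's caller edges onto NewCaller. If
// the edge was direct recursive, it is made direct recursive on NewCaller.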
3413 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3414 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
3415 | moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge, |
3416 | ContextNode *NewCaller) { |
3417 | auto *OldCallee = Edge->Callee; |
3418 | auto *NewCallee = OldCallee; |
3419 | // If this edge was direct recursive, make any new/updated edge also direct |
3420 | // recursive to NewCaller. |
3421 | bool Recursive = Edge->Caller == Edge->Callee; |
3422 | if (Recursive) |
3423 | NewCallee = NewCaller; |
3424 | |
3425 | ContextNode *OldCaller = Edge->Caller; |
3426 | OldCaller->eraseCalleeEdge(Edge.get()); |
3427 | |
3428 | // We might already have an edge to the new caller. If one exists we will |
3429 | // reuse it. |
3430 | auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee); |
3431 | |
3432 | if (ExistingEdgeToNewCaller) { |
3433 | // Since we already have an edge to NewCaller, simply move the ids |
3434 | // onto it, and remove the existing Edge. |
3435 | ExistingEdgeToNewCaller->getContextIds().insert_range( |
3436 | Edge->getContextIds()); |
3437 | ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes; |
3438 | Edge->ContextIds.clear(); |
3439 | Edge->AllocTypes = (uint8_t)AllocationType::None; |
3440 | OldCallee->eraseCallerEdge(Edge.get()); |
3441 | } else { |
3442 | // Otherwise just reconnect Edge to NewCaller. |
3443 | Edge->Caller = NewCaller; |
3444 | NewCaller->CalleeEdges.push_back(Edge); |
3445 | if (Recursive) { |
3446 | assert(NewCallee == NewCaller); |
3447 | // In the case of (direct) recursive edges, we update the callee as well |
3448 | // so that it becomes recursive on the new caller. |
3449 | Edge->Callee = NewCallee; |
3450 | NewCallee->CallerEdges.push_back(Edge); |
3451 | OldCallee->eraseCallerEdge(Edge.get()); |
3452 | } |
3453 | // Don't need to update Edge's context ids since we are simply |
3454 | // reconnecting it. |
3455 | } |
3456 | // In either case, need to update the alloc types on New Caller. |
3457 | NewCaller->AllocTypes |= Edge->AllocTypes; |
3458 | |
3459 | // Now walk the old caller node's caller edges and move Edge's context ids |
3460 | // over to the corresponding edge into the node (which is created here if |
3461 | // this is a newly created node). We can tell whether this is a newly created |
3462 | // node by seeing if it has any caller edges yet. |
3463 | #ifndef NDEBUG |
3464 | bool IsNewNode = NewCaller->CallerEdges.empty(); |
3465 | #endif |
3466 | // If we just moved a direct recursive edge, presumably its context ids should |
3467 | // also flow out of OldCaller via some other non-recursive callee edge. We |
3468 | // don't want to remove the recursive context ids from other caller edges yet, |
3469 | // otherwise the context ids get into an inconsistent state on OldCaller. |
3470 | // We will update these context ids on the non-recursive caller edge when and |
3471 | // if they are updated on the non-recursive callee. |
3472 | if (!Recursive) { |
3473 | for (auto &OldCallerEdge : OldCaller->CallerEdges) { |
3474 | auto OldCallerCaller = OldCallerEdge->Caller; |
3475 | // The context ids moving to the new caller are the subset of this edge's |
3476 | // context ids and the context ids on the callee edge being moved. |
3477 | DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection( |
3478 | OldCallerEdge->getContextIds(), Edge->getContextIds()); |
3479 | if (OldCaller == OldCallerCaller) { |
3480 | OldCallerCaller = NewCaller; |
3481 | // Don't actually move this one. The caller will move it directly via a |
3482 | // call to this function with this as the Edge if it is appropriate to |
3483 | // move to a different node that has a matching callee (itself).
3484 | continue; |
3485 | } |
3486 | set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove); |
3487 | OldCallerEdge->AllocTypes = |
3488 | computeAllocType(OldCallerEdge->getContextIds());
3489 | // In this function we expect that any pre-existing node already has edges |
3490 | // from the same callers as the old node. That should be true in the |
3491 | // current use case, where we will remove None-type edges after copying |
3492 | // over all caller edges from the callee. |
3493 | auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller); |
3494 | // Since we would have skipped caller edges when moving a direct recursive |
3495 | // edge, this may not hold true when recursive handling is enabled.
3496 | assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites); |
3497 | if (ExistingCallerEdge) { |
3498 | ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove); |
3499 | ExistingCallerEdge->AllocTypes |= |
3500 | computeAllocType(EdgeContextIdsToMove);
3501 | continue; |
3502 | } |
3503 | auto NewEdge = std::make_shared<ContextEdge>( |
3504 | NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3505 | EdgeContextIdsToMove); |
3506 | NewCaller->CallerEdges.push_back(NewEdge); |
3507 | NewEdge->Caller->CalleeEdges.push_back(NewEdge); |
3508 | } |
3509 | } |
3510 | // Recompute the node alloc type now that its caller edges have been |
3511 | // updated (since we will compute from those edges). |
3512 | OldCaller->AllocTypes = OldCaller->computeAllocType(); |
3513 | // OldCaller alloc type should be None iff its context id set is now empty. |
3514 | assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) == |
3515 | OldCaller->emptyContextIds()); |
3516 | if (VerifyCCG) { |
3517 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false); |
3518 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false); |
3519 | for (const auto &OldCallerEdge : OldCaller->CallerEdges) |
3520 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller, |
3521 | /*CheckEdges=*/false); |
3522 | for (const auto &NewCallerEdge : NewCaller->CallerEdges) |
3523 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller, |
3524 | /*CheckEdges=*/false); |
3525 | } |
3526 | } |
3527 | |
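// Remove callee edges with a None alloc type (i.e. empty context ids) from
// Node, and recursively from its clones and from the caller nodes reached via
// its caller edges.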
3528 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3529 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
3530 | recursivelyRemoveNoneTypeCalleeEdges( |
3531 | ContextNode *Node, DenseSet<const ContextNode *> &Visited) { |
3532 | auto Inserted = Visited.insert(Node); |
3533 | if (!Inserted.second) |
3534 | return; |
3535 | |
3536 | removeNoneTypeCalleeEdges(Node); |
3537 | |
3538 | for (auto *Clone : Node->Clones) |
3539 | recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3540 | |
3541 | // The recursive call may remove some of this Node's caller edges. |
3542 | // Iterate over a copy and skip any that were removed. |
3543 | auto CallerEdges = Node->CallerEdges; |
3544 | for (auto &Edge : CallerEdges) { |
3545 | // Skip any that have been removed by an earlier recursive call. |
3546 | if (Edge->isRemoved()) { |
3547 | assert(!is_contained(Node->CallerEdges, Edge)); |
3548 | continue; |
3549 | } |
3550 | recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3551 | } |
3552 | } |
3553 | |
3554 | // This is the standard DFS based backedge discovery algorithm. |
3555 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3556 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() { |
3557 | // If we are cloning recursive contexts, find and mark backedges from all root |
3558 | // callers, using the typical DFS based backedge analysis. |
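// For example (illustrative): for mutually recursive contexts A -> B -> A, the
// DFS from the root will visit A and then B, and the callee edge from B back
// to A, which is still on the DFS stack at that point, is marked as a
// backedge.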
3559 | if (!CloneRecursiveContexts) |
3560 | return; |
3561 | DenseSet<const ContextNode *> Visited; |
3562 | DenseSet<const ContextNode *> CurrentStack; |
3563 | for (auto &Entry : NonAllocationCallToContextNodeMap) { |
3564 | auto *Node = Entry.second; |
3565 | if (Node->isRemoved()) |
3566 | continue; |
3567 | // It is a root if it doesn't have callers. |
3568 | if (!Node->CallerEdges.empty()) |
3569 | continue; |
3570 | markBackedges(Node, Visited, CurrentStack); |
3571 | assert(CurrentStack.empty()); |
3572 | } |
3573 | } |
3574 | |
3575 | // Recursive helper for above markBackedges method. |
3576 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3577 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges( |
3578 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
3579 | DenseSet<const ContextNode *> &CurrentStack) { |
3580 | auto I = Visited.insert(Node); |
3581 | // We should only call this for unvisited nodes. |
3582 | assert(I.second); |
3583 | (void)I; |
3584 | for (auto &CalleeEdge : Node->CalleeEdges) { |
3585 | auto *Callee = CalleeEdge->Callee; |
3586 | if (Visited.count(Callee)) { |
3587 | // Since this was already visited we need to check if it is currently on |
3588 | // the recursive stack in which case it is a backedge. |
3589 | if (CurrentStack.count(Callee)) |
3590 | CalleeEdge->IsBackedge = true; |
3591 | continue; |
3592 | } |
3593 | CurrentStack.insert(Callee); |
3594 | markBackedges(Callee, Visited, CurrentStack); |
3595 | CurrentStack.erase(Callee); |
3596 | } |
3597 | } |
3598 | |
3599 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3600 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { |
3601 | DenseSet<const ContextNode *> Visited; |
3602 | for (auto &Entry : AllocationCallToContextNodeMap) { |
3603 | Visited.clear(); |
3604 | identifyClones(Entry.second, Visited, Entry.second->getContextIds()); |
3605 | } |
3606 | Visited.clear(); |
3607 | for (auto &Entry : AllocationCallToContextNodeMap) |
    recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3609 | if (VerifyCCG) |
3610 | check(); |
3611 | } |
3612 | |
// Helper function to check whether an AllocType is cold, notcold, or both.
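// The AllocTypes fields used throughout hold a bitwise OR of AllocationType
// values, so the third alternative below corresponds to a node or edge
// reached by both cold and not-cold contexts.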
3614 | bool checkColdOrNotCold(uint8_t AllocType) { |
3615 | return (AllocType == (uint8_t)AllocationType::Cold) || |
3616 | (AllocType == (uint8_t)AllocationType::NotCold) || |
3617 | (AllocType == |
3618 | ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold)); |
3619 | } |
3620 | |
3621 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
3622 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( |
3623 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
3624 | const DenseSet<uint32_t> &AllocContextIds) { |
3625 | if (VerifyNodes) |
3626 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
3627 | assert(!Node->CloneOf); |
3628 | |
  // If Node has a null call, then either it wasn't found in the module (regular
3630 | // LTO) or summary index (ThinLTO), or there were other conditions blocking |
3631 | // cloning (e.g. recursion, calls multiple targets, etc). |
3632 | // Do this here so that we don't try to recursively clone callers below, which |
3633 | // isn't useful at least for this node. |
3634 | if (!Node->hasCall()) |
3635 | return; |
3636 | |
3637 | // No need to look at any callers if allocation type already unambiguous. |
3638 | if (hasSingleAllocType(Node->AllocTypes)) |
3639 | return; |
3640 | |
3641 | #ifndef NDEBUG |
3642 | auto Insert = |
3643 | #endif |
3644 | Visited.insert(Node); |
3645 | // We should not have visited this node yet. |
3646 | assert(Insert.second); |
3647 | // The recursive call to identifyClones may delete the current edge from the |
3648 | // CallerEdges vector. Make a copy and iterate on that, simpler than passing |
3649 | // in an iterator and having recursive call erase from it. Other edges may |
3650 | // also get removed during the recursion, which will have null Callee and |
3651 | // Caller pointers (and are deleted later), so we skip those below. |
3652 | { |
3653 | auto CallerEdges = Node->CallerEdges; |
3654 | for (auto &Edge : CallerEdges) { |
3655 | // Skip any that have been removed by an earlier recursive call. |
3656 | if (Edge->isRemoved()) { |
3657 | assert(!is_contained(Node->CallerEdges, Edge)); |
3658 | continue; |
3659 | } |
3660 | // Defer backedges. See comments further below where these edges are |
3661 | // handled during the cloning of this Node. |
3662 | if (Edge->IsBackedge) { |
3663 | // We should only mark these if cloning recursive contexts, where we |
3664 | // need to do this deferral. |
3665 | assert(CloneRecursiveContexts); |
3666 | continue; |
3667 | } |
3668 | // Ignore any caller we previously visited via another edge. |
3669 | if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { |
3670 | identifyClones(Edge->Caller, Visited, AllocContextIds); |
3671 | } |
3672 | } |
3673 | } |
3674 | |
  // Check if we reached an unambiguous call or have only a single caller.
3676 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
3677 | return; |
3678 | |
3679 | // We need to clone. |
3680 | |
3681 | // Try to keep the original version as alloc type NotCold. This will make |
3682 | // cases with indirect calls or any other situation with an unknown call to |
3683 | // the original function get the default behavior. We do this by sorting the |
3684 | // CallerEdges of the Node we will clone by alloc type. |
3685 | // |
3686 | // Give NotCold edge the lowest sort priority so those edges are at the end of |
3687 | // the caller edges vector, and stay on the original version (since the below |
3688 | // code clones greedily until it finds all remaining edges have the same type |
3689 | // and leaves the remaining ones on the original Node). |
3690 | // |
3691 | // We shouldn't actually have any None type edges, so the sorting priority for |
3692 | // that is arbitrary, and we assert in that case below. |
3693 | const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, |
3694 | /*Cold*/ 1, |
3695 | /*NotColdCold*/ 2}; |
3696 | llvm::stable_sort(Node->CallerEdges, |
3697 | [&](const std::shared_ptr<ContextEdge> &A, |
3698 | const std::shared_ptr<ContextEdge> &B) { |
3699 | // Nodes with non-empty context ids should be sorted |
3700 | // before those with empty context ids. |
3701 | if (A->ContextIds.empty()) |
3702 | // Either B ContextIds are non-empty (in which case we |
3703 | // should return false because B < A), or B ContextIds |
3704 | // are empty, in which case they are equal, and we |
3705 | // should maintain the original relative ordering. |
3706 | return false; |
3707 | if (B->ContextIds.empty()) |
3708 | return true; |
3709 | |
3710 | if (A->AllocTypes == B->AllocTypes) |
3711 | // Use the first context id for each edge as a |
3712 | // tie-breaker. |
3713 | return *A->ContextIds.begin() < *B->ContextIds.begin(); |
3714 | return AllocTypeCloningPriority[A->AllocTypes] < |
3715 | AllocTypeCloningPriority[B->AllocTypes]; |
3716 | }); |
3717 | |
3718 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
3719 | |
3720 | DenseSet<uint32_t> RecursiveContextIds; |
3721 | assert(AllowRecursiveContexts || !CloneRecursiveContexts); |
3722 | // If we are allowing recursive callsites, but have also disabled recursive |
3723 | // contexts, look for context ids that show up in multiple caller edges. |
3724 | if (AllowRecursiveCallsites && !AllowRecursiveContexts) { |
3725 | DenseSet<uint32_t> AllCallerContextIds; |
3726 | for (auto &CE : Node->CallerEdges) { |
      // Reserve enough space for the largest set of caller context ids, since
      // we know the final set will be at least that large.
      AllCallerContextIds.reserve(CE->getContextIds().size());
3730 | for (auto Id : CE->getContextIds()) |
3731 | if (!AllCallerContextIds.insert(Id).second) |
3732 | RecursiveContextIds.insert(Id); |
3733 | } |
3734 | } |
3735 | |
3736 | // Iterate until we find no more opportunities for disambiguating the alloc |
3737 | // types via cloning. In most cases this loop will terminate once the Node |
3738 | // has a single allocation type, in which case no more cloning is needed. |
3739 | // Iterate over a copy of Node's caller edges, since we may need to remove |
3740 | // edges in the moveEdgeTo* methods, and this simplifies the handling and |
3741 | // makes it less error-prone. |
3742 | auto CallerEdges = Node->CallerEdges; |
3743 | for (auto &CallerEdge : CallerEdges) { |
3744 | // Skip any that have been removed by an earlier recursive call. |
3745 | if (CallerEdge->isRemoved()) { |
3746 | assert(!is_contained(Node->CallerEdges, CallerEdge)); |
3747 | continue; |
3748 | } |
3749 | assert(CallerEdge->Callee == Node); |
3750 | |
3751 | // See if cloning the prior caller edge left this node with a single alloc |
3752 | // type or a single caller. In that case no more cloning of Node is needed. |
3753 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
3754 | break; |
3755 | |
3756 | // If the caller was not successfully matched to a call in the IR/summary, |
3757 | // there is no point in trying to clone for it as we can't update that call. |
3758 | if (!CallerEdge->Caller->hasCall()) |
3759 | continue; |
3760 | |
3761 | // Only need to process the ids along this edge pertaining to the given |
3762 | // allocation. |
3763 | auto CallerEdgeContextsForAlloc = |
3764 | set_intersection(CallerEdge->getContextIds(), AllocContextIds); |
3765 | if (!RecursiveContextIds.empty()) |
3766 | CallerEdgeContextsForAlloc = |
3767 | set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); |
3768 | if (CallerEdgeContextsForAlloc.empty()) |
3769 | continue; |
3770 | |
    auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3772 | |
3773 | // Compute the node callee edge alloc types corresponding to the context ids |
3774 | // for this caller edge. |
3775 | std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; |
    CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
    for (auto &CalleeEdge : Node->CalleeEdges)
      CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
          CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3780 | |
3781 | // Don't clone if doing so will not disambiguate any alloc types amongst |
3782 | // caller edges (including the callee edges that would be cloned). |
3783 | // Otherwise we will simply move all edges to the clone. |
3784 | // |
3785 | // First check if by cloning we will disambiguate the caller allocation |
3786 | // type from node's allocation type. Query allocTypeToUse so that we don't |
3787 | // bother cloning to distinguish NotCold+Cold from NotCold. Note that |
3788 | // neither of these should be None type. |
3789 | // |
3790 | // Then check if by cloning node at least one of the callee edges will be |
3791 | // disambiguated by splitting out different context ids. |
3792 | // |
3793 | // However, always do the cloning if this is a backedge, in which case we |
3794 | // have not yet cloned along this caller edge. |
3795 | assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); |
3796 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
3797 | if (!CallerEdge->IsBackedge && |
3798 | allocTypeToUse(CallerAllocTypeForAlloc) == |
3799 | allocTypeToUse(Node->AllocTypes) && |
3800 | allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
3801 | CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { |
3802 | continue; |
3803 | } |
3804 | |
3805 | if (CallerEdge->IsBackedge) { |
3806 | // We should only mark these if cloning recursive contexts, where we |
3807 | // need to do this deferral. |
3808 | assert(CloneRecursiveContexts); |
3809 | DeferredBackedges++; |
3810 | } |
3811 | |
3812 | // If this is a backedge, we now do recursive cloning starting from its |
3813 | // caller since we may have moved unambiguous caller contexts to a clone |
3814 | // of this Node in a previous iteration of the current loop, giving more |
3815 | // opportunity for cloning through the backedge. Because we sorted the |
3816 | // caller edges earlier so that cold caller edges are first, we would have |
    // visited and cloned this node for any unambiguously cold non-recursive
3818 | // callers before any ambiguous backedge callers. Note that we don't do this |
3819 | // if the caller is already cloned or visited during cloning (e.g. via a |
3820 | // different context path from the allocation). |
3821 | // TODO: Can we do better in the case where the caller was already visited? |
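    // For example, if Node had both an unambiguously cold non-recursive
    // caller and this ambiguous backedge caller, the cold caller has already
    // been peeled off onto a clone of Node, so recursing into the backedge's
    // caller now may create caller clones whose contexts can likewise be
    // disambiguated when we clone Node below.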
3822 | if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf && |
3823 | !Visited.count(CallerEdge->Caller)) { |
3824 | const auto OrigIdCount = CallerEdge->getContextIds().size(); |
3825 | // Now do the recursive cloning of this backedge's caller, which was |
3826 | // deferred earlier. |
3827 | identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc); |
      removeNoneTypeCalleeEdges(CallerEdge->Caller);
3829 | // See if the recursive call to identifyClones moved the context ids to a |
3830 | // new edge from this node to a clone of caller, and switch to looking at |
3831 | // that new edge so that we clone Node for the new caller clone. |
3832 | bool UpdatedEdge = false; |
3833 | if (OrigIdCount > CallerEdge->getContextIds().size()) { |
3834 | for (auto E : Node->CallerEdges) { |
          // Only interested in clones of the current edge's caller.
3836 | if (E->Caller->CloneOf != CallerEdge->Caller) |
3837 | continue; |
3838 | // See if this edge contains any of the context ids originally on the |
3839 | // current caller edge. |
3840 | auto CallerEdgeContextsForAllocNew = |
3841 | set_intersection(CallerEdgeContextsForAlloc, E->getContextIds()); |
3842 | if (CallerEdgeContextsForAllocNew.empty()) |
3843 | continue; |
3844 | // Make sure we don't pick a previously existing caller edge of this |
3845 | // Node, which would be processed on a different iteration of the |
3846 | // outer loop over the saved CallerEdges. |
3847 | if (llvm::is_contained(CallerEdges, E)) |
3848 | continue; |
3849 | // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge |
3850 | // are updated further below for all cases where we just invoked |
3851 | // identifyClones recursively. |
3852 | CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew); |
3853 | CallerEdge = E; |
3854 | UpdatedEdge = true; |
3855 | break; |
3856 | } |
3857 | } |
3858 | // If cloning removed this edge (and we didn't update it to a new edge |
3859 | // above), we're done with this edge. It's possible we moved all of the |
3860 | // context ids to an existing clone, in which case there's no need to do |
3861 | // further processing for them. |
3862 | if (CallerEdge->isRemoved()) |
3863 | continue; |
3864 | |
3865 | // Now we need to update the information used for the cloning decisions |
3866 | // further below, as we may have modified edges and their context ids. |
3867 | |
3868 | // Note if we changed the CallerEdge above we would have already updated |
3869 | // the context ids. |
3870 | if (!UpdatedEdge) { |
3871 | CallerEdgeContextsForAlloc = set_intersection( |
3872 | CallerEdgeContextsForAlloc, CallerEdge->getContextIds()); |
3873 | if (CallerEdgeContextsForAlloc.empty()) |
3874 | continue; |
3875 | } |
3876 | // Update the other information that depends on the edges and on the now |
3877 | // updated CallerEdgeContextsForAlloc. |
      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3879 | CalleeEdgeAllocTypesForCallerEdge.clear(); |
3880 | for (auto &CalleeEdge : Node->CalleeEdges) { |
        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3883 | } |
3884 | } |
3885 | |
3886 | // First see if we can use an existing clone. Check each clone and its |
3887 | // callee edges for matching alloc types. |
3888 | ContextNode *Clone = nullptr; |
3889 | for (auto *CurClone : Node->Clones) { |
3890 | if (allocTypeToUse(CurClone->AllocTypes) != |
3891 | allocTypeToUse(CallerAllocTypeForAlloc)) |
3892 | continue; |
3893 | |
3894 | bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) && |
3895 | hasSingleAllocType(CallerAllocTypeForAlloc); |
3896 | // The above check should mean that if both have single alloc types that |
3897 | // they should be equal. |
3898 | assert(!BothSingleAlloc || |
3899 | CurClone->AllocTypes == CallerAllocTypeForAlloc); |
3900 | |
3901 | // If either both have a single alloc type (which are the same), or if the |
3902 | // clone's callee edges have the same alloc types as those for the current |
3903 | // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge), |
3904 | // then we can reuse this clone. |
3905 | if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>( |
3906 | CalleeEdgeAllocTypesForCallerEdge, CurClone)) { |
3907 | Clone = CurClone; |
3908 | break; |
3909 | } |
3910 | } |
3911 | |
3912 | // The edge iterator is adjusted when we move the CallerEdge to the clone. |
3913 | if (Clone) |
      moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
                                    CallerEdgeContextsForAlloc);
    else
      Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3918 | |
3919 | // Sanity check that no alloc types on clone or its edges are None. |
3920 | assert(Clone->AllocTypes != (uint8_t)AllocationType::None); |
3921 | } |
3922 | |
3923 | // We should still have some context ids on the original Node. |
3924 | assert(!Node->emptyContextIds()); |
3925 | |
3926 | // Sanity check that no alloc types on node or edges are None. |
3927 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
3928 | |
3929 | if (VerifyNodes) |
3930 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
3931 | } |
3932 | |
3933 | void ModuleCallsiteContextGraph::updateAllocationCall( |
3934 | CallInfo &Call, AllocationType AllocType) { |
  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                "memprof", AllocTypeString);
  cast<CallBase>(Call.call())->addFnAttr(A);
  OREGetter(Call.call()->getFunction())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
            << ore::NV("AllocationCall", Call.call()) << " in clone "
            << ore::NV("Caller", Call.call()->getFunction())
            << " marked with memprof allocation attribute "
            << ore::NV("Attribute", AllocTypeString));
3945 | } |
3946 | |
3947 | void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, |
3948 | AllocationType AllocType) { |
  auto *AI = cast<AllocInfo *>(Call.call());
3950 | assert(AI); |
3951 | assert(AI->Versions.size() > Call.cloneNo()); |
3952 | AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; |
3953 | } |
3954 | |
3955 | AllocationType |
3956 | ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { |
  const auto *CB = cast<CallBase>(Call.call());
  if (!CB->getAttributes().hasFnAttr("memprof"))
    return AllocationType::None;
  return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3961 | ? AllocationType::Cold |
3962 | : AllocationType::NotCold; |
3963 | } |
3964 | |
3965 | AllocationType |
3966 | IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { |
  const auto *AI = cast<AllocInfo *>(Call.call());
3968 | assert(AI->Versions.size() > Call.cloneNo()); |
3969 | return (AllocationType)AI->Versions[Call.cloneNo()]; |
3970 | } |
3971 | |
3972 | void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
3973 | FuncInfo CalleeFunc) { |
3974 | if (CalleeFunc.cloneNo() > 0) |
    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
  OREGetter(CallerCall.call()->getFunction())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
            << ore::NV("Call", CallerCall.call()) << " in clone "
            << ore::NV("Caller", CallerCall.call()->getFunction())
            << " assigned to call function clone "
            << ore::NV("Callee", CalleeFunc.func()));
3982 | } |
3983 | |
3984 | void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
3985 | FuncInfo CalleeFunc) { |
  auto *CI = cast<CallsiteInfo *>(CallerCall.call());
  assert(CI &&
         "Caller cannot be an allocation which should not have profiled calls");
3989 | assert(CI->Clones.size() > CallerCall.cloneNo()); |
3990 | CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); |
3991 | } |
3992 | |
3993 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
3994 | Instruction *>::FuncInfo |
3995 | ModuleCallsiteContextGraph::cloneFunctionForCallsite( |
3996 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
3997 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
3998 | // Use existing LLVM facilities for cloning and obtaining Call in clone |
3999 | ValueToValueMapTy VMap; |
  auto *NewFunc = CloneFunction(Func.func(), VMap);
  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4002 | assert(!Func.func()->getParent()->getFunction(Name)); |
4003 | NewFunc->setName(Name); |
4004 | if (auto *SP = NewFunc->getSubprogram()) |
    SP->replaceLinkageName(
        MDString::get(NewFunc->getParent()->getContext(), Name));
4007 | for (auto &Inst : CallsWithMetadataInFunc) { |
4008 | // This map always has the initial version in it. |
4009 | assert(Inst.cloneNo() == 0); |
    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4011 | } |
4012 | OREGetter(Func.func()) |
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
            << "created clone " << ore::NV("NewFunction", NewFunc));
4015 | return {NewFunc, CloneNo}; |
4016 | } |
4017 | |
4018 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
4019 | IndexCall>::FuncInfo |
4020 | IndexCallsiteContextGraph::cloneFunctionForCallsite( |
4021 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
4022 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
4023 | // Check how many clones we have of Call (and therefore function). |
4024 | // The next clone number is the current size of versions array. |
4025 | // Confirm this matches the CloneNo provided by the caller, which is based on |
4026 | // the number of function clones we have. |
4027 | assert(CloneNo == (isa<AllocInfo *>(Call.call()) |
4028 | ? cast<AllocInfo *>(Call.call())->Versions.size() |
4029 | : cast<CallsiteInfo *>(Call.call())->Clones.size())); |
4030 | // Walk all the instructions in this function. Create a new version for |
4031 | // each (by adding an entry to the Versions/Clones summary array), and copy |
4032 | // over the version being called for the function clone being cloned here. |
4033 | // Additionally, add an entry to the CallMap for the new function clone, |
4034 | // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) |
4035 | // to the new call clone. |
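  // For example, when creating function clone N for this summary, an
  // allocation whose Versions array currently has N entries gets one more
  // appended (a placeholder 0, overwritten later in updateAllocationCall),
  // and each callsite's Clones array similarly grows by one placeholder entry
  // that updateCall fills in with the callee function clone number.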
4036 | for (auto &Inst : CallsWithMetadataInFunc) { |
4037 | // This map always has the initial version in it. |
4038 | assert(Inst.cloneNo() == 0); |
    if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4040 | assert(AI->Versions.size() == CloneNo); |
4041 | // We assign the allocation type later (in updateAllocationCall), just add |
4042 | // an entry for it here. |
      AI->Versions.push_back(0);
4044 | } else { |
      auto *CI = cast<CallsiteInfo *>(Inst.call());
4046 | assert(CI && CI->Clones.size() == CloneNo); |
4047 | // We assign the clone number later (in updateCall), just add an entry for |
4048 | // it here. |
      CI->Clones.push_back(0);
4050 | } |
4051 | CallMap[Inst] = {Inst.call(), CloneNo}; |
4052 | } |
4053 | return {Func.func(), CloneNo}; |
4054 | } |
4055 | |
4056 | // We perform cloning for each allocation node separately. However, this |
4057 | // sometimes results in a situation where the same node calls multiple |
4058 | // clones of the same callee, created for different allocations. This |
4059 | // causes issues when assigning functions to these clones, as each node can |
4060 | // in reality only call a single callee clone. |
4061 | // |
4062 | // To address this, before assigning functions, merge callee clone nodes as |
4063 | // needed using a post order traversal from the allocations. We attempt to |
4064 | // use existing clones as the merge node when legal, and to share them |
4065 | // among callers with the same properties (callers calling the same set of |
4066 | // callee clone nodes for the same allocations). |
4067 | // |
4068 | // Without this fix, in some cases incorrect function assignment will lead |
4069 | // to calling the wrong allocation clone. |
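//
// For example, suppose callsite node C ends up with callee edges to B' and
// B'', two clones of the same callsite B created while processing two
// different allocations. Function assignment operates on whole function
// clones, so the call recorded for C can only target one copy of B's
// function; merging B' and B'' (for the contexts reaching them from C)
// restores that invariant before assignFunctions runs.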
4070 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4071 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() { |
4072 | if (!MergeClones) |
4073 | return; |
4074 | |
4075 | // Generate a map from context id to the associated allocation node for use |
4076 | // when merging clones. |
4077 | DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode; |
4078 | for (auto &Entry : AllocationCallToContextNodeMap) { |
4079 | auto *Node = Entry.second; |
4080 | for (auto Id : Node->getContextIds()) |
4081 | ContextIdToAllocationNode[Id] = Node->getOrigNode(); |
4082 | for (auto *Clone : Node->Clones) { |
4083 | for (auto Id : Clone->getContextIds()) |
4084 | ContextIdToAllocationNode[Id] = Clone->getOrigNode(); |
4085 | } |
4086 | } |
4087 | |
4088 | // Post order traversal starting from allocations to ensure each callsite |
4089 | // calls a single clone of its callee. Callee nodes that are clones of each |
4090 | // other are merged (via new merge nodes if needed) to achieve this. |
4091 | DenseSet<const ContextNode *> Visited; |
4092 | for (auto &Entry : AllocationCallToContextNodeMap) { |
4093 | auto *Node = Entry.second; |
4094 | |
4095 | mergeClones(Node, Visited, ContextIdToAllocationNode); |
4096 | |
4097 | // Make a copy so the recursive post order traversal that may create new |
4098 | // clones doesn't mess up iteration. Note that the recursive traversal |
4099 | // itself does not call mergeClones on any of these nodes, which are all |
4100 | // (clones of) allocations. |
4101 | auto Clones = Node->Clones; |
4102 | for (auto *Clone : Clones) |
4103 | mergeClones(Clone, Visited, ContextIdToAllocationNode); |
4104 | } |
4105 | |
4106 | if (DumpCCG) { |
    dbgs() << "CCG after merging:\n";
4108 | dbgs() << *this; |
4109 | } |
4110 | if (ExportToDot) |
    exportToDot("aftermerge");
4112 | |
4113 | if (VerifyCCG) { |
4114 | check(); |
4115 | } |
4116 | } |
4117 | |
4118 | // Recursive helper for above mergeClones method. |
4119 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4120 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones( |
4121 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
4122 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) { |
4123 | auto Inserted = Visited.insert(Node); |
4124 | if (!Inserted.second) |
4125 | return; |
4126 | |
4127 | // Make a copy since the recursive call may move a caller edge to a new |
4128 | // callee, messing up the iterator. |
4129 | auto CallerEdges = Node->CallerEdges; |
4130 | for (auto CallerEdge : CallerEdges) { |
4131 | // Skip any caller edge moved onto a different callee during recursion. |
4132 | if (CallerEdge->Callee != Node) |
4133 | continue; |
4134 | mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode); |
4135 | } |
4136 | |
4137 | // Merge for this node after we handle its callers. |
4138 | mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode); |
4139 | } |
4140 | |
4141 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4142 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones( |
4143 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
4144 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) { |
4145 | // Ignore Node if we moved all of its contexts to clones. |
4146 | if (Node->emptyContextIds()) |
4147 | return; |
4148 | |
4149 | // First identify groups of clones among Node's callee edges, by building |
4150 | // a map from each callee base node to the associated callee edges from Node. |
4151 | MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>> |
4152 | OrigNodeToCloneEdges; |
4153 | for (const auto &E : Node->CalleeEdges) { |
4154 | auto *Callee = E->Callee; |
4155 | if (!Callee->CloneOf && Callee->Clones.empty()) |
4156 | continue; |
4157 | ContextNode *Base = Callee->getOrigNode(); |
4158 | OrigNodeToCloneEdges[Base].push_back(E); |
4159 | } |
4160 | |
4161 | // Helper for callee edge sorting below. Return true if A's callee has fewer |
4162 | // caller edges than B, or if A is a clone and B is not, or if A's first |
4163 | // context id is smaller than B's. |
4164 | auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A, |
4165 | const std::shared_ptr<ContextEdge> &B) { |
4166 | if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size()) |
4167 | return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size(); |
4168 | if (A->Callee->CloneOf && !B->Callee->CloneOf) |
4169 | return true; |
4170 | else if (!A->Callee->CloneOf && B->Callee->CloneOf) |
4171 | return false; |
4172 | // Use the first context id for each edge as a |
4173 | // tie-breaker. |
4174 | return *A->ContextIds.begin() < *B->ContextIds.begin(); |
4175 | }; |
4176 | |
4177 | // Process each set of callee clones called by Node, performing the needed |
4178 | // merging. |
4179 | for (auto Entry : OrigNodeToCloneEdges) { |
4180 | // CalleeEdges is the set of edges from Node reaching callees that are |
4181 | // mutual clones of each other. |
4182 | auto &CalleeEdges = Entry.second; |
4183 | auto NumCalleeClones = CalleeEdges.size(); |
4184 | // A single edge means there is no merging needed. |
4185 | if (NumCalleeClones == 1) |
4186 | continue; |
4187 | // Sort the CalleeEdges calling this group of clones in ascending order of |
4188 | // their caller edge counts, putting the original non-clone node first in |
4189 | // cases of a tie. This simplifies finding an existing node to use as the |
4190 | // merge node. |
4191 | llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan); |
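    // For example, a callee clone in this group that is only reached from
    // Node sorts first and can simply be reused as the merge node itself,
    // avoiding creation of a new node (the NonNewMergedNodes case below).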
4192 | |
4193 | /// Find other callers of the given set of callee edges that can |
4194 | /// share the same callee merge node. See the comments at this method |
4195 | /// definition for details. |
4196 | DenseSet<ContextNode *> OtherCallersToShareMerge; |
4197 | findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode, |
4198 | OtherCallersToShareMerge); |
4199 | |
4200 | // Now do the actual merging. Identify existing or create a new MergeNode |
4201 | // during the first iteration. Move each callee over, along with edges from |
4202 | // other callers we've determined above can share the same merge node. |
4203 | ContextNode *MergeNode = nullptr; |
4204 | DenseMap<ContextNode *, unsigned> CallerToMoveCount; |
4205 | for (auto CalleeEdge : CalleeEdges) { |
4206 | auto *OrigCallee = CalleeEdge->Callee; |
4207 | // If we don't have a MergeNode yet (only happens on the first iteration, |
4208 | // as a new one will be created when we go to move the first callee edge |
4209 | // over as needed), see if we can use this callee. |
4210 | if (!MergeNode) { |
4211 | // If there are no other callers, simply use this callee. |
4212 | if (CalleeEdge->Callee->CallerEdges.size() == 1) { |
4213 | MergeNode = OrigCallee; |
4214 | NonNewMergedNodes++; |
4215 | continue; |
4216 | } |
4217 | // Otherwise, if we have identified other caller nodes that can share |
4218 | // the merge node with Node, see if all of OrigCallee's callers are |
4219 | // going to share the same merge node. In that case we can use callee |
4220 | // (since all of its callers would move to the new merge node). |
4221 | if (!OtherCallersToShareMerge.empty()) { |
4222 | bool MoveAllCallerEdges = true; |
4223 | for (auto CalleeCallerE : OrigCallee->CallerEdges) { |
4224 | if (CalleeCallerE == CalleeEdge) |
4225 | continue; |
4226 | if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) { |
4227 | MoveAllCallerEdges = false; |
4228 | break; |
4229 | } |
4230 | } |
4231 | // If we are going to move all callers over, we can use this callee as |
4232 | // the MergeNode. |
4233 | if (MoveAllCallerEdges) { |
4234 | MergeNode = OrigCallee; |
4235 | NonNewMergedNodes++; |
4236 | continue; |
4237 | } |
4238 | } |
4239 | } |
4240 | // Move this callee edge, creating a new merge node if necessary. |
4241 | if (MergeNode) { |
4242 | assert(MergeNode != OrigCallee); |
        moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
                                      /*NewClone*/ false);
4245 | } else { |
        MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4247 | NewMergedNodes++; |
4248 | } |
4249 | // Now move all identified edges from other callers over to the merge node |
4250 | // as well. |
4251 | if (!OtherCallersToShareMerge.empty()) { |
4252 | // Make and iterate over a copy of OrigCallee's caller edges because |
4253 | // some of these will be moved off of the OrigCallee and that would mess |
4254 | // up the iteration from OrigCallee. |
4255 | auto OrigCalleeCallerEdges = OrigCallee->CallerEdges; |
4256 | for (auto &CalleeCallerE : OrigCalleeCallerEdges) { |
4257 | if (CalleeCallerE == CalleeEdge) |
4258 | continue; |
4259 | if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) |
4260 | continue; |
4261 | CallerToMoveCount[CalleeCallerE->Caller]++; |
          moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
                                        /*NewClone*/ false);
4264 | } |
4265 | } |
      removeNoneTypeCalleeEdges(OrigCallee);
      removeNoneTypeCalleeEdges(MergeNode);
4268 | } |
4269 | } |
4270 | } |
4271 | |
4272 | // Look for other nodes that have edges to the same set of callee |
4273 | // clones as the current Node. Those can share the eventual merge node |
4274 | // (reducing cloning and binary size overhead) iff: |
4275 | // - they have edges to the same set of callee clones |
4276 | // - each callee edge reaches a subset of the same allocations as Node's |
4277 | // corresponding edge to the same callee clone. |
4278 | // The second requirement is to ensure that we don't undo any of the |
4279 | // necessary cloning to distinguish contexts with different allocation |
4280 | // behavior. |
4281 | // FIXME: This is somewhat conservative, as we really just need to ensure |
4282 | // that they don't reach the same allocations as contexts on edges from Node |
4283 | // going to any of the *other* callee clones being merged. However, that |
4284 | // requires more tracking and checking to get right. |
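//
// For example, if another caller node N has edges to the same pair of callee
// clones B' and B'' as Node, and the contexts on each of N's edges reach only
// allocations also reached by Node's corresponding edge to the same callee
// clone, then N's edges can be moved to the merge node created for Node
// rather than requiring a separate one.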
4285 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4286 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
4287 | findOtherCallersToShareMerge( |
4288 | ContextNode *Node, |
4289 | std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges, |
4290 | DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode, |
4291 | DenseSet<ContextNode *> &OtherCallersToShareMerge) { |
4292 | auto NumCalleeClones = CalleeEdges.size(); |
4293 | // This map counts how many edges to the same callee clone exist for other |
4294 | // caller nodes of each callee clone. |
4295 | DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount; |
4296 | // Counts the number of other caller nodes that have edges to all callee |
4297 | // clones that don't violate the allocation context checking. |
4298 | unsigned PossibleOtherCallerNodes = 0; |
4299 | |
4300 | // We only need to look at other Caller nodes if the first callee edge has |
4301 | // multiple callers (recall they are sorted in ascending order above). |
4302 | if (CalleeEdges[0]->Callee->CallerEdges.size() < 2) |
4303 | return; |
4304 | |
4305 | // For each callee edge: |
4306 | // - Collect the count of other caller nodes calling the same callees. |
4307 | // - Collect the alloc nodes reached by contexts on each callee edge. |
4308 | DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes; |
4309 | for (auto CalleeEdge : CalleeEdges) { |
4310 | assert(CalleeEdge->Callee->CallerEdges.size() > 1); |
4311 | // For each other caller of the same callee, increment the count of |
4312 | // edges reaching the same callee clone. |
4313 | for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) { |
4314 | if (CalleeCallerEdges->Caller == Node) { |
4315 | assert(CalleeCallerEdges == CalleeEdge); |
4316 | continue; |
4317 | } |
4318 | OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++; |
4319 | // If this caller edge now reaches all of the same callee clones, |
4320 | // increment the count of candidate other caller nodes. |
4321 | if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] == |
4322 | NumCalleeClones) |
4323 | PossibleOtherCallerNodes++; |
4324 | } |
4325 | // Collect the alloc nodes reached by contexts on each callee edge, for |
4326 | // later analysis. |
4327 | for (auto Id : CalleeEdge->getContextIds()) { |
4328 | auto *Alloc = ContextIdToAllocationNode.lookup(Id); |
4329 | if (!Alloc) { |
4330 | // FIXME: unclear why this happens occasionally, presumably |
4331 | // imperfect graph updates possibly with recursion. |
4332 | MissingAllocForContextId++; |
4333 | continue; |
4334 | } |
4335 | CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc); |
4336 | } |
4337 | } |
4338 | |
4339 | // Now walk the callee edges again, and make sure that for each candidate |
4340 | // caller node all of its edges to the callees reach the same allocs (or |
4341 | // a subset) as those along the corresponding callee edge from Node. |
4342 | for (auto CalleeEdge : CalleeEdges) { |
4343 | assert(CalleeEdge->Callee->CallerEdges.size() > 1); |
4344 | // Stop if we do not have any (more) candidate other caller nodes. |
4345 | if (!PossibleOtherCallerNodes) |
4346 | break; |
4347 | auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()]; |
4348 | // Check each other caller of this callee clone. |
4349 | for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) { |
4350 | // Not interested in the callee edge from Node itself. |
4351 | if (CalleeCallerE == CalleeEdge) |
4352 | continue; |
4353 | // Skip any callers that didn't have callee edges to all the same |
4354 | // callee clones. |
4355 | if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] != |
4356 | NumCalleeClones) |
4357 | continue; |
4358 | // Make sure that each context along edge from candidate caller node |
4359 | // reaches an allocation also reached by this callee edge from Node. |
4360 | for (auto Id : CalleeCallerE->getContextIds()) { |
4361 | auto *Alloc = ContextIdToAllocationNode.lookup(Id); |
4362 | if (!Alloc) |
4363 | continue; |
4364 | // If not, simply reset the map entry to 0 so caller is ignored, and |
4365 | // reduce the count of candidate other caller nodes. |
4366 | if (!CurCalleeAllocNodes.contains(Alloc)) { |
4367 | OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0; |
4368 | PossibleOtherCallerNodes--; |
4369 | break; |
4370 | } |
4371 | } |
4372 | } |
4373 | } |
4374 | |
4375 | if (!PossibleOtherCallerNodes) |
4376 | return; |
4377 | |
4378 | // Build the set of other caller nodes that can use the same callee merge |
4379 | // node. |
4380 | for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) { |
4381 | if (Count != NumCalleeClones) |
4382 | continue; |
4383 | OtherCallersToShareMerge.insert(OtherCaller); |
4384 | } |
4385 | } |
4386 | |
4387 | // This method assigns cloned callsites to functions, cloning the functions as |
4388 | // needed. The assignment is greedy and proceeds roughly as follows: |
4389 | // |
4390 | // For each function Func: |
4391 | // For each call with graph Node having clones: |
4392 | // Initialize ClonesWorklist to Node and its clones |
4393 | // Initialize NodeCloneCount to 0 |
4394 | // While ClonesWorklist is not empty: |
4395 | // Clone = pop front ClonesWorklist |
4396 | // NodeCloneCount++ |
4397 | // If Func has been cloned less than NodeCloneCount times: |
4398 | // If NodeCloneCount is 1: |
4399 | // Assign Clone to original Func |
4400 | // Continue |
4401 | // Create a new function clone |
4402 | // If other callers not assigned to call a function clone yet: |
4403 | // Assign them to call new function clone |
4404 | // Continue |
4405 | // Assign any other caller calling the cloned version to new clone |
4406 | // |
4407 | // For each caller of Clone: |
4408 | // If caller is assigned to call a specific function clone: |
4409 | // If we cannot assign Clone to that function clone: |
4410 | // Create new callsite Clone NewClone |
4411 | // Add NewClone to ClonesWorklist |
4412 | // Continue |
4413 | // Assign Clone to existing caller's called function clone |
4414 | // Else: |
4415 | // If Clone not already assigned to a function clone: |
4416 | // Assign to first function clone without assignment |
4417 | // Assign caller to selected function clone |
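//
// For example, the first callsite clone processed in a function is assigned
// to the original copy of that function, the second typically triggers
// creation of function clone 1, and so on; extra callsite (and function)
// clones are created below only when callers have conflicting assignments
// that cannot be satisfied by the existing clones.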
4418 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
4419 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { |
4420 | bool Changed = false; |
4421 | |
4422 | mergeClones(); |
4423 | |
4424 | // Keep track of the assignment of nodes (callsites) to function clones they |
4425 | // call. |
4426 | DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; |
4427 | |
4428 | // Update caller node to call function version CalleeFunc, by recording the |
4429 | // assignment in CallsiteToCalleeFuncCloneMap. |
4430 | auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, |
4431 | const FuncInfo &CalleeFunc) { |
4432 | assert(Caller->hasCall()); |
4433 | CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; |
4434 | }; |
4435 | |
4436 | // Walk all functions for which we saw calls with memprof metadata, and handle |
4437 | // cloning for each of its calls. |
4438 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
4439 | FuncInfo OrigFunc(Func); |
4440 | // Map from each clone of OrigFunc to a map of remappings of each call of |
4441 | // interest (from original uncloned call to the corresponding cloned call in |
4442 | // that function clone). |
4443 | std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; |
4444 | for (auto &Call : CallsWithMetadata) { |
      ContextNode *Node = getNodeForInst(Call);
4446 | // Skip call if we do not have a node for it (all uses of its stack ids |
4447 | // were either on inlined chains or pruned from the MIBs), or if we did |
4448 | // not create any clones for it. |
4449 | if (!Node || Node->Clones.empty()) |
4450 | continue; |
      assert(Node->hasCall() &&
             "Not having a call should have prevented cloning");
4453 | |
4454 | // Track the assignment of function clones to clones of the current |
4455 | // callsite Node being handled. |
4456 | std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; |
4457 | |
4458 | // Assign callsite version CallsiteClone to function version FuncClone, |
4459 | // and also assign (possibly cloned) Call to CallsiteClone. |
4460 | auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, |
4461 | CallInfo &Call, |
4462 | ContextNode *CallsiteClone, |
4463 | bool IsAlloc) { |
4464 | // Record the clone of callsite node assigned to this function clone. |
4465 | FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; |
4466 | |
4467 | assert(FuncClonesToCallMap.count(FuncClone)); |
4468 | std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; |
4469 | CallInfo CallClone(Call); |
4470 | if (auto It = CallMap.find(Call); It != CallMap.end()) |
4471 | CallClone = It->second; |
4472 | CallsiteClone->setCall(CallClone); |
4473 | // Need to do the same for all matching calls. |
4474 | for (auto &MatchingCall : Node->MatchingCalls) { |
4475 | CallInfo CallClone(MatchingCall); |
4476 | if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) |
4477 | CallClone = It->second; |
4478 | // Updates the call in the list. |
4479 | MatchingCall = CallClone; |
4480 | } |
4481 | }; |
4482 | |
4483 | // Keep track of the clones of callsite Node that need to be assigned to |
4484 | // function clones. This list may be expanded in the loop body below if we |
4485 | // find additional cloning is required. |
4486 | std::deque<ContextNode *> ClonesWorklist; |
4487 | // Ignore original Node if we moved all of its contexts to clones. |
4488 | if (!Node->emptyContextIds()) |
4489 | ClonesWorklist.push_back(Node); |
4490 | llvm::append_range(ClonesWorklist, Node->Clones); |
4491 | |
4492 | // Now walk through all of the clones of this callsite Node that we need, |
4493 | // and determine the assignment to a corresponding clone of the current |
4494 | // function (creating new function clones as needed). |
4495 | unsigned NodeCloneCount = 0; |
4496 | while (!ClonesWorklist.empty()) { |
4497 | ContextNode *Clone = ClonesWorklist.front(); |
4498 | ClonesWorklist.pop_front(); |
4499 | NodeCloneCount++; |
4500 | if (VerifyNodes) |
4501 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
4502 | |
4503 | // Need to create a new function clone if we have more callsite clones |
4504 | // than existing function clones, which would have been assigned to an |
4505 | // earlier clone in the list (we assign callsite clones to function |
4506 | // clones greedily). |
4507 | if (FuncClonesToCallMap.size() < NodeCloneCount) { |
4508 | // If this is the first callsite copy, assign to original function. |
4509 | if (NodeCloneCount == 1) { |
4510 | // Since FuncClonesToCallMap is empty in this case, no clones have |
4511 | // been created for this function yet, and no callers should have |
4512 | // been assigned a function clone for this callee node yet. |
4513 | assert(llvm::none_of( |
4514 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
4515 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
4516 | })); |
4517 | // Initialize with empty call map, assign Clone to original function |
4518 | // and its callers, and skip to the next clone. |
4519 | FuncClonesToCallMap[OrigFunc] = {}; |
4520 | AssignCallsiteCloneToFuncClone( |
4521 | OrigFunc, Call, Clone, |
4522 | AllocationCallToContextNodeMap.count(Call)); |
4523 | for (auto &CE : Clone->CallerEdges) { |
4524 | // Ignore any caller that does not have a recorded callsite Call. |
4525 | if (!CE->Caller->hasCall()) |
4526 | continue; |
4527 | RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); |
4528 | } |
4529 | continue; |
4530 | } |
4531 | |
4532 | // First locate which copy of OrigFunc to clone again. If a caller |
4533 | // of this callsite clone was already assigned to call a particular |
4534 | // function clone, we need to redirect all of those callers to the |
4535 | // new function clone, and update their other callees within this |
4536 | // function. |
4537 | FuncInfo PreviousAssignedFuncClone; |
4538 | auto EI = llvm::find_if( |
4539 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
4540 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
4541 | }); |
4542 | bool CallerAssignedToCloneOfFunc = false; |
4543 | if (EI != Clone->CallerEdges.end()) { |
4544 | const std::shared_ptr<ContextEdge> &Edge = *EI; |
4545 | PreviousAssignedFuncClone = |
4546 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
4547 | CallerAssignedToCloneOfFunc = true; |
4548 | } |
4549 | |
4550 | // Clone function and save it along with the CallInfo map created |
4551 | // during cloning in the FuncClonesToCallMap. |
4552 | std::map<CallInfo, CallInfo> NewCallMap; |
4553 | unsigned CloneNo = FuncClonesToCallMap.size(); |
          assert(CloneNo > 0 && "Clone 0 is the original function, which "
                                "should already exist in the map");
4556 | FuncInfo NewFuncClone = cloneFunctionForCallsite( |
              OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4558 | FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); |
4559 | FunctionClonesAnalysis++; |
4560 | Changed = true; |
4561 | |
4562 | // If no caller callsites were already assigned to a clone of this |
4563 | // function, we can simply assign this clone to the new func clone |
4564 | // and update all callers to it, then skip to the next clone. |
4565 | if (!CallerAssignedToCloneOfFunc) { |
4566 | AssignCallsiteCloneToFuncClone( |
4567 | NewFuncClone, Call, Clone, |
4568 | AllocationCallToContextNodeMap.count(Call)); |
4569 | for (auto &CE : Clone->CallerEdges) { |
4570 | // Ignore any caller that does not have a recorded callsite Call. |
4571 | if (!CE->Caller->hasCall()) |
4572 | continue; |
4573 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
4574 | } |
4575 | continue; |
4576 | } |
4577 | |
4578 | // We may need to do additional node cloning in this case. |
4579 | // Reset the CallsiteToCalleeFuncCloneMap entry for any callers |
4580 | // that were previously assigned to call PreviousAssignedFuncClone, |
4581 | // to record that they now call NewFuncClone. |
4582 | // The none type edge removal may remove some of this Clone's caller |
4583 | // edges, if it is reached via another of its caller's callees. |
4584 | // Iterate over a copy and skip any that were removed. |
4585 | auto CallerEdges = Clone->CallerEdges; |
4586 | for (auto CE : CallerEdges) { |
4587 | // Skip any that have been removed on an earlier iteration. |
4588 | if (CE->isRemoved()) { |
4589 | assert(!is_contained(Clone->CallerEdges, CE)); |
4590 | continue; |
4591 | } |
4592 | assert(CE); |
4593 | // Ignore any caller that does not have a recorded callsite Call. |
4594 | if (!CE->Caller->hasCall()) |
4595 | continue; |
4596 | |
4597 | if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || |
4598 | // We subsequently fall through to later handling that |
4599 | // will perform any additional cloning required for |
4600 | // callers that were calling other function clones. |
4601 | CallsiteToCalleeFuncCloneMap[CE->Caller] != |
4602 | PreviousAssignedFuncClone) |
4603 | continue; |
4604 | |
4605 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
4606 | |
4607 | // If we are cloning a function that was already assigned to some |
4608 | // callers, then essentially we are creating new callsite clones |
4609 | // of the other callsites in that function that are reached by those |
4610 | // callers. Clone the other callees of the current callsite's caller |
4611 | // that were already assigned to PreviousAssignedFuncClone |
4612 | // accordingly. This is important since we subsequently update the |
4613 | // calls from the nodes in the graph and their assignments to callee |
4614 | // functions recorded in CallsiteToCalleeFuncCloneMap. |
4615 | // The none type edge removal may remove some of this caller's |
4616 | // callee edges, if it is reached via another of its callees. |
4617 | // Iterate over a copy and skip any that were removed. |
4618 | auto CalleeEdges = CE->Caller->CalleeEdges; |
4619 | for (auto CalleeEdge : CalleeEdges) { |
4620 | // Skip any that have been removed on an earlier iteration when |
4621 | // cleaning up newly None type callee edges. |
4622 | if (CalleeEdge->isRemoved()) { |
4623 | assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge)); |
4624 | continue; |
4625 | } |
4626 | assert(CalleeEdge); |
4627 | ContextNode *Callee = CalleeEdge->Callee; |
4628 | // Skip the current callsite, we are looking for other |
4629 | // callsites Caller calls, as well as any that does not have a |
4630 | // recorded callsite Call. |
4631 | if (Callee == Clone || !Callee->hasCall()) |
4632 | continue; |
4633 | // Skip direct recursive calls. We don't need/want to clone the |
4634 | // caller node again, and this loop will not behave as expected if |
4635 | // we tried. |
4636 | if (Callee == CalleeEdge->Caller) |
4637 | continue; |
              ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
              removeNoneTypeCalleeEdges(NewClone);
              // Moving the edge may have resulted in some none type
              // callee edges on the original Callee.
              removeNoneTypeCalleeEdges(Callee);
4643 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
4644 | // If the Callee node was already assigned to call a specific |
4645 | // function version, make sure its new clone is assigned to call |
4646 | // that same function clone. |
4647 | if (CallsiteToCalleeFuncCloneMap.count(Callee)) |
4648 | RecordCalleeFuncOfCallsite( |
4649 | NewClone, CallsiteToCalleeFuncCloneMap[Callee]); |
4650 | // Update NewClone with the new Call clone of this callsite's Call |
4651 | // created for the new function clone created earlier. |
4652 | // Recall that we have already ensured when building the graph |
4653 | // that each caller can only call callsites within the same |
4654 | // function, so we are guaranteed that Callee Call is in the |
4655 | // current OrigFunc. |
4656 | // CallMap is set up as indexed by original Call at clone 0. |
4657 | CallInfo OrigCall(Callee->getOrigNode()->Call); |
4658 | OrigCall.setCloneNo(0); |
4659 | std::map<CallInfo, CallInfo> &CallMap = |
4660 | FuncClonesToCallMap[NewFuncClone]; |
4661 | assert(CallMap.count(OrigCall)); |
4662 | CallInfo NewCall(CallMap[OrigCall]); |
4663 | assert(NewCall); |
4664 | NewClone->setCall(NewCall); |
4665 | // Need to do the same for all matching calls. |
4666 | for (auto &MatchingCall : NewClone->MatchingCalls) { |
4667 | CallInfo OrigMatchingCall(MatchingCall); |
4668 | OrigMatchingCall.setCloneNo(0); |
4669 | assert(CallMap.count(OrigMatchingCall)); |
4670 | CallInfo NewCall(CallMap[OrigMatchingCall]); |
4671 | assert(NewCall); |
4672 | // Updates the call in the list. |
4673 | MatchingCall = NewCall; |
4674 | } |
4675 | } |
4676 | } |
4677 | // Fall through to handling below to perform the recording of the |
4678 | // function for this callsite clone. This enables handling of cases |
4679 | // where the callers were assigned to different clones of a function. |
4680 | } |
4681 | |
4682 | // See if we can use existing function clone. Walk through |
4683 | // all caller edges to see if any have already been assigned to |
4684 | // a clone of this callsite's function. If we can use it, do so. If not, |
4685 | // because that function clone is already assigned to a different clone |
4686 | // of this callsite, then we need to clone again. |
4687 | // Basically, this checking is needed to handle the case where different |
4688 | // caller functions/callsites may need versions of this function |
4689 | // containing different mixes of callsite clones across the different |
4690 | // callsites within the function. If that happens, we need to create |
4691 | // additional function clones to handle the various combinations. |
4692 | // |
4693 | // Keep track of any new clones of this callsite created by the |
4694 | // following loop, as well as any existing clone that we decided to |
4695 | // assign this clone to. |
4696 | std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; |
4697 | FuncInfo FuncCloneAssignedToCurCallsiteClone; |
4698 | // Iterate over a copy of Clone's caller edges, since we may need to |
4699 | // remove edges in the moveEdgeTo* methods, and this simplifies the |
4700 | // handling and makes it less error-prone. |
4701 | auto CloneCallerEdges = Clone->CallerEdges; |
4702 | for (auto &Edge : CloneCallerEdges) { |
4703 | // Skip removed edges (due to direct recursive edges updated when |
4704 | // updating callee edges when moving an edge and subsequently |
4705 | // removed by call to removeNoneTypeCalleeEdges on the Clone). |
4706 | if (Edge->isRemoved()) |
4707 | continue; |
4708 | // Ignore any caller that does not have a recorded callsite Call. |
4709 | if (!Edge->Caller->hasCall()) |
4710 | continue; |
4711 | // If this caller already assigned to call a version of OrigFunc, need |
4712 | // to ensure we can assign this callsite clone to that function clone. |
4713 | if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { |
4714 | FuncInfo FuncCloneCalledByCaller = |
4715 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
4716 | // First we need to confirm that this function clone is available |
4717 | // for use by this callsite node clone. |
4718 | // |
4719 | // While FuncCloneToCurNodeCloneMap is built only for this Node and |
4720 | // its callsite clones, one of those callsite clones X could have |
4721 | // been assigned to the same function clone called by Edge's caller |
4722 | // - if Edge's caller calls another callsite within Node's original |
4723 | // function, and that callsite has another caller reaching clone X. |
4724 | // We need to clone Node again in this case. |
4725 | if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && |
4726 | FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != |
4727 | Clone) || |
4728 | // Detect when we have multiple callers of this callsite that |
4729 | // have already been assigned to specific, and different, clones |
4730 | // of OrigFunc (due to other unrelated callsites in Func they |
4731 | // reach via call contexts). Is this Clone of callsite Node |
4732 | // assigned to a different clone of OrigFunc? If so, clone Node |
4733 | // again. |
4734 | (FuncCloneAssignedToCurCallsiteClone && |
4735 | FuncCloneAssignedToCurCallsiteClone != |
4736 | FuncCloneCalledByCaller)) { |
4737 | // We need to use a different newly created callsite clone, in |
4738 | // order to assign it to another new function clone on a |
4739 | // subsequent iteration over the Clones array (adjusted below). |
4740 | // Note we specifically do not reset the |
4741 | // CallsiteToCalleeFuncCloneMap entry for this caller, so that |
4742 | // when this new clone is processed later we know which version of |
4743 | // the function to copy (so that other callsite clones we have |
4744 | // assigned to that function clone are properly cloned over). See |
4745 | // comments in the function cloning handling earlier. |
4746 | |
4747 | // Check if we already have cloned this callsite again while |
4748 | // walking through caller edges, for a caller calling the same |
4749 | // function clone. If so, we can move this edge to that new clone |
4750 | // rather than creating yet another new clone. |
4751 | if (FuncCloneToNewCallsiteCloneMap.count( |
4752 | FuncCloneCalledByCaller)) { |
4753 | ContextNode *NewClone = |
4754 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; |
moveEdgeToExistingCalleeClone(Edge, NewClone);
// Cleanup any none type edges cloned over.
removeNoneTypeCalleeEdges(NewClone);
} else {
// Create a new callsite clone.
ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
removeNoneTypeCalleeEdges(NewClone);
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
NewClone;
// Add to list of clones and process later.
ClonesWorklist.push_back(NewClone);
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
}
// Moving the caller edge may have resulted in some none type
// callee edges.
removeNoneTypeCalleeEdges(Clone);
4771 | // We will handle the newly created callsite clone in a subsequent |
4772 | // iteration over this Node's Clones. |
4773 | continue; |
4774 | } |
4775 | |
4776 | // Otherwise, we can use the function clone already assigned to this |
4777 | // caller. |
4778 | if (!FuncCloneAssignedToCurCallsiteClone) { |
4779 | FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; |
4780 | // Assign Clone to FuncCloneCalledByCaller |
4781 | AssignCallsiteCloneToFuncClone( |
4782 | FuncCloneCalledByCaller, Call, Clone, |
4783 | AllocationCallToContextNodeMap.count(Call)); |
4784 | } else |
4785 | // Don't need to do anything - callsite is already calling this |
4786 | // function clone. |
4787 | assert(FuncCloneAssignedToCurCallsiteClone == |
4788 | FuncCloneCalledByCaller); |
4789 | |
4790 | } else { |
4791 | // We have not already assigned this caller to a version of |
4792 | // OrigFunc. Do the assignment now. |
4793 | |
4794 | // First check if we have already assigned this callsite clone to a |
4795 | // clone of OrigFunc for another caller during this iteration over |
4796 | // its caller edges. |
4797 | if (!FuncCloneAssignedToCurCallsiteClone) { |
4798 | // Find first function in FuncClonesToCallMap without an assigned |
4799 | // clone of this callsite Node. We should always have one |
4800 | // available at this point due to the earlier cloning when the |
4801 | // FuncClonesToCallMap size was smaller than the clone number. |
4802 | for (auto &CF : FuncClonesToCallMap) { |
4803 | if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { |
4804 | FuncCloneAssignedToCurCallsiteClone = CF.first; |
4805 | break; |
4806 | } |
4807 | } |
4808 | assert(FuncCloneAssignedToCurCallsiteClone); |
4809 | // Assign Clone to FuncCloneAssignedToCurCallsiteClone |
4810 | AssignCallsiteCloneToFuncClone( |
4811 | FuncCloneAssignedToCurCallsiteClone, Call, Clone, |
4812 | AllocationCallToContextNodeMap.count(Call)); |
4813 | } else |
4814 | assert(FuncCloneToCurNodeCloneMap |
4815 | [FuncCloneAssignedToCurCallsiteClone] == Clone); |
4816 | // Update callers to record function version called. |
4817 | RecordCalleeFuncOfCallsite(Edge->Caller, |
4818 | FuncCloneAssignedToCurCallsiteClone); |
4819 | } |
4820 | } |
4821 | } |
4822 | if (VerifyCCG) { |
4823 | checkNode<DerivedCCG, FuncTy, CallTy>(Node); |
4824 | for (const auto &PE : Node->CalleeEdges) |
4825 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
4826 | for (const auto &CE : Node->CallerEdges) |
4827 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
4828 | for (auto *Clone : Node->Clones) { |
4829 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
4830 | for (const auto &PE : Clone->CalleeEdges) |
4831 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
4832 | for (const auto &CE : Clone->CallerEdges) |
4833 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
4834 | } |
4835 | } |
4836 | } |
4837 | } |
4838 | |
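// Bit mask for an allocation reached by both cold and not-cold contexts; used
// below to detect ambiguous allocations that may still be hinted cold via the
// MinClonedColdBytePercent threshold.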
4839 | uint8_t BothTypes = |
4840 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
4841 | |
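// Recursive worker that visits a node's clones and caller nodes before
// updating the node's own call, so callers are rewritten before the callees
// they reach.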
4842 | auto UpdateCalls = [&](ContextNode *Node, |
4843 | DenseSet<const ContextNode *> &Visited, |
4844 | auto &&UpdateCalls) { |
4845 | auto Inserted = Visited.insert(Node); |
4846 | if (!Inserted.second) |
4847 | return; |
4848 | |
4849 | for (auto *Clone : Node->Clones) |
4850 | UpdateCalls(Clone, Visited, UpdateCalls); |
4851 | |
4852 | for (auto &Edge : Node->CallerEdges) |
4853 | UpdateCalls(Edge->Caller, Visited, UpdateCalls); |
4854 | |
4855 | // Skip if either no call to update, or if we ended up with no context ids |
4856 | // (we moved all edges onto other clones). |
4857 | if (!Node->hasCall() || Node->emptyContextIds()) |
4858 | return; |
4859 | |
4860 | if (Node->IsAllocation) { |
4861 | auto AT = allocTypeToUse(Node->AllocTypes); |
4862 | // If the allocation type is ambiguous, and more aggressive hinting |
4863 | // has been enabled via the MinClonedColdBytePercent flag, see if this |
4864 | // allocation should be hinted cold anyway because its fraction cold bytes |
4865 | // allocated is at least the given threshold. |
4866 | if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 && |
4867 | !ContextIdToContextSizeInfos.empty()) { |
4868 | uint64_t TotalCold = 0; |
4869 | uint64_t Total = 0; |
4870 | for (auto Id : Node->getContextIds()) { |
4871 | auto TypeI = ContextIdToAllocationType.find(Id); |
4872 | assert(TypeI != ContextIdToAllocationType.end()); |
4873 | auto CSI = ContextIdToContextSizeInfos.find(Id); |
4874 | if (CSI != ContextIdToContextSizeInfos.end()) { |
4875 | for (auto &Info : CSI->second) { |
4876 | Total += Info.TotalSize; |
4877 | if (TypeI->second == AllocationType::Cold) |
4878 | TotalCold += Info.TotalSize; |
4879 | } |
4880 | } |
4881 | } |
4882 | if (TotalCold * 100 >= Total * MinClonedColdBytePercent) |
4883 | AT = AllocationType::Cold; |
4884 | } |
updateAllocationCall(Node->Call, AT);
4886 | assert(Node->MatchingCalls.empty()); |
4887 | return; |
4888 | } |
4889 | |
4890 | if (!CallsiteToCalleeFuncCloneMap.count(Node)) |
4891 | return; |
4892 | |
4893 | auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; |
updateCall(Node->Call, CalleeFunc);
// Update all the matching calls as well.
for (auto &Call : Node->MatchingCalls)
updateCall(Call, CalleeFunc);
4898 | }; |
4899 | |
4900 | // Performs DFS traversal starting from allocation nodes to update calls to |
4901 | // reflect cloning decisions recorded earlier. For regular LTO this will |
4902 | // update the actual calls in the IR to call the appropriate function clone |
4903 | // (and add attributes to allocation calls), whereas for ThinLTO the decisions |
4904 | // are recorded in the summary entries. |
4905 | DenseSet<const ContextNode *> Visited; |
4906 | for (auto &Entry : AllocationCallToContextNodeMap) |
4907 | UpdateCalls(Entry.second, Visited, UpdateCalls); |
4908 | |
4909 | return Changed; |
4910 | } |
4911 | |
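// Helper used by applyImport (via CloneFuncIfNeeded below): clones function F
// NumClones-1 times (the original counts as clone 0) and returns the value
// maps for the new copies so the caller can locate the cloned instructions.
// Clone names come from getMemProfFuncName, which appends a clone-number
// suffix to the base name (e.g. clone 1 of "foo" would typically be named
// "foo.memprof.1"). Any aliases to F are cloned alongside it.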
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
&FuncToAliasMap) {
// The first "clone" is the original copy; we should only call this if we
// needed to create new clones.
assert(NumClones > 1);
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
VMaps.reserve(NumClones - 1);
FunctionsClonedThinBackend++;
for (unsigned I = 1; I < NumClones; I++) {
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
auto *NewF = CloneFunction(&F, *VMaps.back());
FunctionClonesThinBackend++;
// Strip memprof and callsite metadata from clone as they are no longer
// needed.
for (auto &BB : *NewF) {
for (auto &Inst : BB) {
Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
}
}
std::string Name = getMemProfFuncName(F.getName(), I);
auto *PrevF = M.getFunction(Name);
if (PrevF) {
// We might have created this when adjusting callsite in another
// function. It should be a declaration.
assert(PrevF->isDeclaration());
NewF->takeName(PrevF);
PrevF->replaceAllUsesWith(NewF);
PrevF->eraseFromParent();
} else
NewF->setName(Name);
if (auto *SP = NewF->getSubprogram())
SP->replaceLinkageName(
MDString::get(NewF->getParent()->getContext(), Name));
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone" , &F)
<< "created clone " << ore::NV("NewFunction" , NewF));

// Now handle aliases to this function, and clone those as well.
if (!FuncToAliasMap.count(&F))
continue;
for (auto *A : FuncToAliasMap[&F]) {
std::string Name = getMemProfFuncName(A->getName(), I);
auto *PrevA = M.getNamedAlias(Name);
auto *NewA = GlobalAlias::create(A->getValueType(),
A->getType()->getPointerAddressSpace(),
A->getLinkage(), Name, NewF);
NewA->copyAttributesFrom(A);
if (PrevA) {
// We might have created this when adjusting callsite in another
// function. It should be a declaration.
assert(PrevA->isDeclaration());
NewA->takeName(PrevA);
PrevA->replaceAllUsesWith(NewA);
PrevA->eraseFromParent();
}
}
}
return VMaps;
}
4973 | |
4974 | // Locate the summary for F. This is complicated by the fact that it might |
4975 | // have been internalized or promoted. |
4976 | static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, |
4977 | const ModuleSummaryIndex *ImportSummary, |
4978 | const Function *CallingFunc = nullptr) { |
4979 | // FIXME: Ideally we would retain the original GUID in some fashion on the |
4980 | // function (e.g. as metadata), but for now do our best to locate the |
4981 | // summary without that information. |
ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
if (!TheFnVI)
// See if theFn was internalized, by checking index directly with
// original name (this avoids the name adjustment done by getGUID() for
// internal symbols).
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
if (TheFnVI)
return TheFnVI;
// Now query with the original name before any promotion was performed.
StringRef OrigName =
ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
// When this pass is enabled, we always add thinlto_src_file provenance
// metadata to imported function definitions, which allows us to recreate the
// original internal symbol's GUID.
auto SrcFileMD = F.getMetadata("thinlto_src_file" );
// If this is a call to an imported/promoted local for which we didn't import
// the definition, the metadata will not exist on the declaration. However,
// since we are doing this early, before any inlining in the LTO backend, we
// can simply look at the metadata on the calling function which must have
// been from the same module if F was an internal symbol originally.
if (!SrcFileMD && F.isDeclaration()) {
// We would only call this for a declaration for a direct callsite, in which
// case the caller would have provided the calling function pointer.
assert(CallingFunc);
SrcFileMD = CallingFunc->getMetadata("thinlto_src_file" );
// If this is a promoted local (OrigName != F.getName()), since this is a
// declaration, it must be imported from a different module and therefore we
// should always find the metadata on its calling function. Any call to a
// promoted local that came from this module should still be a definition.
assert(SrcFileMD || OrigName == F.getName());
}
StringRef SrcFile = M.getSourceFileName();
if (SrcFileMD)
SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage, SrcFile);
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
// Internal func in original module may have gotten a numbered suffix if we
// imported an external function with the same name. This happens
// automatically during IR linking for naming conflicts. It would have to
// still be internal in that case (otherwise it would have been renamed on
// promotion in which case we wouldn't have a naming conflict).
if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
F.getName().contains('.')) {
OrigName = F.getName().rsplit('.').first;
OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage, SrcFile);
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5033 | } |
5034 | // The only way we may not have a VI is if this is a declaration created for |
5035 | // an imported reference. For distributed ThinLTO we may not have a VI for |
5036 | // such declarations in the distributed summary. |
5037 | assert(TheFnVI || F.isDeclaration()); |
5038 | return TheFnVI; |
5039 | } |
5040 | |
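// Prepare the indirect call promotion machinery used by applyImport: the
// ICallPromotionAnalysis for reading value profile metadata, and an
// InstrProfSymtab for mapping profiled target hashes back to Functions.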
5041 | bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo( |
5042 | Module &M) { |
5043 | ICallAnalysis = std::make_unique<ICallPromotionAnalysis>(); |
5044 | Symtab = std::make_unique<InstrProfSymtab>(); |
// Don't add canonical names, to avoid adding multiple functions to the
// symtab when they both have the same root name with "." suffixes stripped.
5047 | // If we pick the wrong one then this could lead to incorrect ICP and calling |
5048 | // a memprof clone that we don't actually create (resulting in linker unsats). |
5049 | // What this means is that the GUID of the function (or its PGOFuncName |
5050 | // metadata) *must* match that in the VP metadata to allow promotion. |
5051 | // In practice this should not be a limitation, since local functions should |
5052 | // have PGOFuncName metadata and global function names shouldn't need any |
5053 | // special handling (they should not get the ".llvm.*" suffix that the |
5054 | // canonicalization handling is attempting to strip). |
5055 | if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) { |
std::string SymtabFailure = toString(std::move(E));
M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5058 | return false; |
5059 | } |
5060 | return true; |
5061 | } |
5062 | |
5063 | #ifndef NDEBUG |
5064 | // Sanity check that the MIB stack ids match between the summary and |
5065 | // instruction metadata. |
5066 | static void checkAllocContextIds( |
5067 | const AllocInfo &AllocNode, const MDNode *MemProfMD, |
5068 | const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext, |
5069 | const ModuleSummaryIndex *ImportSummary) { |
5070 | auto MIBIter = AllocNode.MIBs.begin(); |
5071 | for (auto &MDOp : MemProfMD->operands()) { |
5072 | assert(MIBIter != AllocNode.MIBs.end()); |
5073 | auto StackIdIndexIter = MIBIter->StackIdIndices.begin(); |
5074 | auto *MIBMD = cast<const MDNode>(MDOp); |
5075 | MDNode *StackMDNode = getMIBStackNode(MIBMD); |
5076 | assert(StackMDNode); |
5077 | CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); |
5078 | auto ContextIterBegin = |
5079 | StackContext.beginAfterSharedPrefix(CallsiteContext); |
5080 | // Skip the checking on the first iteration. |
5081 | uint64_t LastStackContextId = |
5082 | (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1 |
5083 | : 0; |
5084 | for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end(); |
5085 | ++ContextIter) { |
5086 | // If this is a direct recursion, simply skip the duplicate |
5087 | // entries, to be consistent with how the summary ids were |
5088 | // generated during ModuleSummaryAnalysis. |
5089 | if (LastStackContextId == *ContextIter) |
5090 | continue; |
5091 | LastStackContextId = *ContextIter; |
5092 | assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); |
5093 | assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
5094 | *ContextIter); |
5095 | StackIdIndexIter++; |
5096 | } |
5097 | MIBIter++; |
5098 | } |
5099 | } |
5100 | #endif |
5101 | |
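// Apply the cloning decisions recorded in the import summary to this module:
// create the requested function clones, point calls at the assigned callee
// clone versions, attach "memprof" allocation attributes, and record any
// indirect call promotion needed to reach cloned targets.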
5102 | bool MemProfContextDisambiguation::applyImport(Module &M) { |
5103 | assert(ImportSummary); |
5104 | bool Changed = false; |
5105 | |
5106 | // We also need to clone any aliases that reference cloned functions, because |
5107 | // the modified callsites may invoke via the alias. Keep track of the aliases |
5108 | // for each function. |
5109 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
5110 | FuncToAliasMap; |
5111 | for (auto &A : M.aliases()) { |
5112 | auto *Aliasee = A.getAliaseeObject(); |
if (auto *F = dyn_cast<Function>(Aliasee))
FuncToAliasMap[F].insert(&A);
5115 | } |
5116 | |
5117 | if (!initializeIndirectCallPromotionInfo(M)) |
5118 | return false; |
5119 | |
5120 | for (auto &F : M) { |
5121 | if (F.isDeclaration() || isMemProfClone(F)) |
5122 | continue; |
5123 | |
5124 | OptimizationRemarkEmitter ORE(&F); |
5125 | |
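// Value maps for clones 1..NumClones-1 of F (clone 0 is the original), plus
// bookkeeping so we only clone F once even if several allocations or
// callsites in it request cloning.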
5126 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
5127 | bool ClonesCreated = false; |
5128 | unsigned NumClonesCreated = 0; |
5129 | auto CloneFuncIfNeeded = [&](unsigned NumClones) { |
5130 | // We should at least have version 0 which is the original copy. |
5131 | assert(NumClones > 0); |
// If only one copy is needed, use the original.
5133 | if (NumClones == 1) |
5134 | return; |
5135 | // If we already performed cloning of this function, confirm that the |
5136 | // requested number of clones matches (the thin link should ensure the |
5137 | // number of clones for each constituent callsite is consistent within |
5138 | // each function), before returning. |
5139 | if (ClonesCreated) { |
5140 | assert(NumClonesCreated == NumClones); |
5141 | return; |
5142 | } |
5143 | VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); |
5144 | // The first "clone" is the original copy, which doesn't have a VMap. |
5145 | assert(VMaps.size() == NumClones - 1); |
5146 | Changed = true; |
5147 | ClonesCreated = true; |
5148 | NumClonesCreated = NumClones; |
5149 | }; |
5150 | |
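// Update a direct call (and its copies in any function clones created above)
// to call the callee clone version recorded in the summary StackNode,
// creating the function clones first if needed.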
5151 | auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, |
5152 | Function *CalledFunction) { |
5153 | // Perform cloning if not yet done. |
5154 | CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); |
5155 | |
5156 | assert(!isMemProfClone(*CalledFunction)); |
5157 | |
5158 | // Update the calls per the summary info. |
5159 | // Save orig name since it gets updated in the first iteration |
5160 | // below. |
5161 | auto CalleeOrigName = CalledFunction->getName(); |
5162 | for (unsigned J = 0; J < StackNode.Clones.size(); J++) { |
5163 | // Do nothing if this version calls the original version of its |
5164 | // callee. |
5165 | if (!StackNode.Clones[J]) |
5166 | continue; |
auto NewF = M.getOrInsertFunction(
getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
CalledFunction->getFunctionType());
CallBase *CBClone;
// Copy 0 is the original function.
if (!J)
CBClone = CB;
else
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
CBClone->setCalledFunction(NewF);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CBClone)
<< ore::NV("Call" , CBClone) << " in clone "
<< ore::NV("Caller" , CBClone->getFunction())
<< " assigned to call function clone "
<< ore::NV("Callee" , NewF.getCallee()));
5182 | } |
5183 | }; |
5184 | |
5185 | // Locate the summary for F. |
5186 | ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); |
5187 | // If not found, this could be an imported local (see comment in |
5188 | // findValueInfoForFunc). Skip for now as it will be cloned in its original |
5189 | // module (where it would have been promoted to global scope so should |
5190 | // satisfy any reference in this module). |
5191 | if (!TheFnVI) |
5192 | continue; |
5193 | |
5194 | auto *GVSummary = |
ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
if (!GVSummary) {
// Must have been imported, use the summary which matches the definition
// (might be multiple if this was a linkonce_odr).
auto SrcModuleMD = F.getMetadata("thinlto_src_module" );
assert(SrcModuleMD &&
"enable-import-metadata is needed to emit thinlto_src_module" );
StringRef SrcModule =
dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
for (auto &GVS : TheFnVI.getSummaryList()) {
if (GVS->modulePath() == SrcModule) {
GVSummary = GVS.get();
break;
}
}
assert(GVSummary && GVSummary->modulePath() == SrcModule);
}

// If this was an imported alias skip it as we won't have the function
// summary, and it should be cloned in the original module.
if (isa<AliasSummary>(GVSummary))
continue;

auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5219 | |
5220 | if (FS->allocs().empty() && FS->callsites().empty()) |
5221 | continue; |
5222 | |
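// Iterators over this function's summary callsite and allocation records;
// these are advanced in lockstep with the IR instructions visited below.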
5223 | auto SI = FS->callsites().begin(); |
5224 | auto AI = FS->allocs().begin(); |
5225 | |
5226 | // To handle callsite infos synthesized for tail calls which have missing |
5227 | // frames in the profiled context, map callee VI to the synthesized callsite |
5228 | // info. |
5229 | DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; |
5230 | // Iterate the callsites for this function in reverse, since we place all |
5231 | // those synthesized for tail calls at the end. |
5232 | for (auto CallsiteIt = FS->callsites().rbegin(); |
5233 | CallsiteIt != FS->callsites().rend(); CallsiteIt++) { |
5234 | auto &Callsite = *CallsiteIt; |
5235 | // Stop as soon as we see a non-synthesized callsite info (see comment |
5236 | // above loop). All the entries added for discovered tail calls have empty |
5237 | // stack ids. |
5238 | if (!Callsite.StackIdIndices.empty()) |
5239 | break; |
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5241 | } |
5242 | |
5243 | // Keeps track of needed ICP for the function. |
5244 | SmallVector<ICallAnalysisData> ICallAnalysisInfo; |
5245 | |
5246 | // Assume for now that the instructions are in the exact same order |
5247 | // as when the summary was created, but confirm this is correct by |
5248 | // matching the stack ids. |
5249 | for (auto &BB : F) { |
5250 | for (auto &I : BB) { |
auto *CB = dyn_cast<CallBase>(&I);
// Same handling as when creating module summary.
if (!mayHaveMemprofSummary(CB))
continue;

auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
CalledValue = CalledValue->stripPointerCasts();
// Stripping pointer casts can reveal a called function.
CalledFunction = dyn_cast<Function>(CalledValue);
}
// Check if this is an alias to a function. If so, get the
// called aliasee for the checks below.
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias" );
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
}

CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
I.getMetadata(LLVMContext::MD_callsite));
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);

// Include allocs that were already assigned a memprof function
// attribute in the statistics.
if (CB->getAttributes().hasFnAttr("memprof" )) {
assert(!MemProfMD);
CB->getAttributes().getFnAttr("memprof" ).getValueAsString() == "cold"
5280 | ? AllocTypeColdThinBackend++ |
5281 | : AllocTypeNotColdThinBackend++; |
5282 | OrigAllocsThinBackend++; |
5283 | AllocVersionsThinBackend++; |
5284 | if (!MaxAllocVersionsThinBackend) |
5285 | MaxAllocVersionsThinBackend = 1; |
5286 | continue; |
5287 | } |
5288 | |
5289 | if (MemProfMD) { |
5290 | // Consult the next alloc node. |
5291 | assert(AI != FS->allocs().end()); |
5292 | auto &AllocNode = *(AI++); |
5293 | |
5294 | #ifndef NDEBUG |
5295 | checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext, |
5296 | ImportSummary); |
5297 | #endif |
5298 | |
5299 | // Perform cloning if not yet done. |
5300 | CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); |
5301 | |
5302 | OrigAllocsThinBackend++; |
5303 | AllocVersionsThinBackend += AllocNode.Versions.size(); |
5304 | if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) |
5305 | MaxAllocVersionsThinBackend = AllocNode.Versions.size(); |
5306 | |
// If there is only one version, that means we didn't end up considering
// this function for cloning, and in that case the alloc will still be
// None type or should have gotten the default NotCold. Skip it, but only
// after calling the clone helper above, since that performs some sanity
// checks confirming we haven't yet decided that cloning is needed. We
// might still have a single version that is cold due to the
// MinClonedColdBytePercent heuristic; make sure we don't skip in that
// case.
5315 | if (AllocNode.Versions.size() == 1 && |
5316 | (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) { |
5317 | assert((AllocationType)AllocNode.Versions[0] == |
5318 | AllocationType::NotCold || |
5319 | (AllocationType)AllocNode.Versions[0] == |
5320 | AllocationType::None); |
5321 | UnclonableAllocsThinBackend++; |
5322 | continue; |
5323 | } |
5324 | |
5325 | // All versions should have a singular allocation type. |
5326 | assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { |
5327 | return Type == ((uint8_t)AllocationType::NotCold | |
5328 | (uint8_t)AllocationType::Cold); |
5329 | })); |
5330 | |
5331 | // Update the allocation types per the summary info. |
5332 | for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { |
5333 | // Ignore any that didn't get an assigned allocation type. |
5334 | if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) |
5335 | continue; |
5336 | AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; |
5337 | AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ |
5338 | : AllocTypeNotColdThinBackend++; |
std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
auto A = llvm::Attribute::get(F.getContext(), "memprof" ,
AllocTypeString);
CallBase *CBClone;
// Copy 0 is the original function.
if (!J)
CBClone = CB;
else
// Since VMaps are only created for new clones, we index with
// clone J-1 (J==0 is the original clone and does not have a VMaps
// entry).
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
CBClone->addFnAttr(A);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute" , CBClone)
<< ore::NV("AllocationCall" , CBClone) << " in clone "
<< ore::NV("Caller" , CBClone->getFunction())
<< " marked with memprof allocation attribute "
<< ore::NV("Attribute" , AllocTypeString));
5357 | } |
5358 | } else if (!CallsiteContext.empty()) { |
5359 | if (!CalledFunction) { |
5360 | #ifndef NDEBUG |
5361 | // We should have skipped inline assembly calls. |
5362 | auto *CI = dyn_cast<CallInst>(CB); |
5363 | assert(!CI || !CI->isInlineAsm()); |
5364 | #endif |
5365 | // We should have skipped direct calls via a Constant. |
5366 | assert(CalledValue && !isa<Constant>(CalledValue)); |
5367 | |
5368 | // This is an indirect call, see if we have profile information and |
5369 | // whether any clones were recorded for the profiled targets (that |
5370 | // we synthesized CallsiteInfo summary records for when building the |
5371 | // index). |
5372 | auto NumClones = |
recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5374 | |
5375 | // Perform cloning if not yet done. This is done here in case |
5376 | // we don't need to do ICP, but might need to clone this |
5377 | // function as it is the target of other cloned calls. |
5378 | if (NumClones) |
5379 | CloneFuncIfNeeded(NumClones); |
5380 | } |
5381 | |
5382 | else { |
5383 | // Consult the next callsite node. |
5384 | assert(SI != FS->callsites().end()); |
5385 | auto &StackNode = *(SI++); |
5386 | |
5387 | #ifndef NDEBUG |
5388 | // Sanity check that the stack ids match between the summary and |
5389 | // instruction metadata. |
5390 | auto StackIdIndexIter = StackNode.StackIdIndices.begin(); |
5391 | for (auto StackId : CallsiteContext) { |
5392 | assert(StackIdIndexIter != StackNode.StackIdIndices.end()); |
5393 | assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
5394 | StackId); |
5395 | StackIdIndexIter++; |
5396 | } |
5397 | #endif |
5398 | |
5399 | CloneCallsite(StackNode, CB, CalledFunction); |
5400 | } |
5401 | } else if (CB->isTailCall() && CalledFunction) { |
5402 | // Locate the synthesized callsite info for the callee VI, if any was |
5403 | // created, and use that for cloning. |
5404 | ValueInfo CalleeVI = |
findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5408 | assert(Callsite != MapTailCallCalleeVIToCallsite.end()); |
5409 | CloneCallsite(Callsite->second, CB, CalledFunction); |
5410 | } |
5411 | } |
5412 | } |
5413 | } |
5414 | |
5415 | // Now do any promotion required for cloning. |
performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5417 | } |
5418 | |
5419 | // We skip some of the functions and instructions above, so remove all the |
5420 | // metadata in a single sweep here. |
5421 | for (auto &F : M) { |
5422 | // We can skip memprof clones because createFunctionClones already strips |
5423 | // the metadata from the newly created clones. |
5424 | if (F.isDeclaration() || isMemProfClone(F)) |
5425 | continue; |
5426 | for (auto &BB : F) { |
5427 | for (auto &I : BB) { |
if (!isa<CallBase>(I))
continue;
I.setMetadata(LLVMContext::MD_memprof, nullptr);
I.setMetadata(LLVMContext::MD_callsite, nullptr);
5432 | } |
5433 | } |
5434 | } |
5435 | |
5436 | return Changed; |
5437 | } |
5438 | |
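// For an indirect call with value profile data, walk the CallsiteInfo summary
// records synthesized for its profiled targets and decide whether any clone of
// this callsite must call a cloned target (in which case promotion is needed).
// Returns the number of clones of the enclosing function and, when promotion
// is needed, records the data required to perform it later in performICP.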
5439 | unsigned MemProfContextDisambiguation::recordICPInfo( |
5440 | CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites, |
5441 | ArrayRef<CallsiteInfo>::iterator &SI, |
5442 | SmallVector<ICallAnalysisData> &ICallAnalysisInfo) { |
5443 | // First see if we have profile information for this indirect call. |
5444 | uint32_t NumCandidates; |
5445 | uint64_t TotalCount; |
5446 | auto CandidateProfileData = |
ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5448 | NumCandidates); |
5449 | if (CandidateProfileData.empty()) |
5450 | return 0; |
5451 | |
5452 | // Iterate through all of the candidate profiled targets along with the |
5453 | // CallsiteInfo summary records synthesized for them when building the index, |
5454 | // and see if any are cloned and/or refer to clones. |
5455 | bool ICPNeeded = false; |
5456 | unsigned NumClones = 0; |
size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5458 | for (const auto &Candidate : CandidateProfileData) { |
5459 | #ifndef NDEBUG |
5460 | auto CalleeValueInfo = |
5461 | #endif |
ImportSummary->getValueInfo(Candidate.Value);
5463 | // We might not have a ValueInfo if this is a distributed |
5464 | // ThinLTO backend and decided not to import that function. |
5465 | assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo); |
5466 | assert(SI != AllCallsites.end()); |
5467 | auto &StackNode = *(SI++); |
5468 | // See if any of the clones of the indirect callsite for this |
5469 | // profiled target should call a cloned version of the profiled |
5470 | // target. We only need to do the ICP here if so. |
ICPNeeded |= llvm::any_of(StackNode.Clones,
[](unsigned CloneNo) { return CloneNo != 0; });
5473 | // Every callsite in the same function should have been cloned the same |
5474 | // number of times. |
5475 | assert(!NumClones || NumClones == StackNode.Clones.size()); |
5476 | NumClones = StackNode.Clones.size(); |
5477 | } |
5478 | if (!ICPNeeded) |
5479 | return NumClones; |
5480 | // Save information for ICP, which is performed later to avoid messing up the |
5481 | // current function traversal. |
ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
TotalCount, CallsiteInfoStartIndex});
5484 | return NumClones; |
5485 | } |
5486 | |
5487 | void MemProfContextDisambiguation::performICP( |
5488 | Module &M, ArrayRef<CallsiteInfo> AllCallsites, |
5489 | ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, |
5490 | ArrayRef<ICallAnalysisData> ICallAnalysisInfo, |
5491 | OptimizationRemarkEmitter &ORE) { |
5492 | // Now do any promotion required for cloning. Specifically, for each |
5493 | // recorded ICP candidate (which was only recorded because one clone of that |
5494 | // candidate should call a cloned target), we perform ICP (speculative |
5495 | // devirtualization) for each clone of the callsite, and update its callee |
5496 | // to the appropriate clone. Note that the ICP compares against the original |
5497 | // version of the target, which is what is in the vtable. |
5498 | for (auto &Info : ICallAnalysisInfo) { |
5499 | auto *CB = Info.CB; |
5500 | auto CallsiteIndex = Info.CallsiteInfoStartIndex; |
5501 | auto TotalCount = Info.TotalCount; |
5502 | unsigned NumPromoted = 0; |
5503 | unsigned NumClones = 0; |
5504 | |
5505 | for (auto &Candidate : Info.CandidateProfileData) { |
5506 | auto &StackNode = AllCallsites[CallsiteIndex++]; |
5507 | |
5508 | // All calls in the same function must have the same number of clones. |
5509 | assert(!NumClones || NumClones == StackNode.Clones.size()); |
5510 | NumClones = StackNode.Clones.size(); |
5511 | |
5512 | // See if the target is in the module. If it wasn't imported, it is |
5513 | // possible that this profile could have been collected on a different |
5514 | // target (or version of the code), and we need to be conservative |
5515 | // (similar to what is done in the ICP pass). |
Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5517 | if (TargetFunction == nullptr || |
5518 | // Any ThinLTO global dead symbol removal should have already |
5519 | // occurred, so it should be safe to promote when the target is a |
5520 | // declaration. |
5521 | // TODO: Remove internal option once more fully tested. |
5522 | (MemProfRequireDefinitionForPromotion && |
5523 | TargetFunction->isDeclaration())) { |
ORE.emit([&]() {
5525 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget" , CB) |
5526 | << "Memprof cannot promote indirect call: target with md5sum " |
5527 | << ore::NV("target md5sum" , Candidate.Value) << " not found" ; |
5528 | }); |
5529 | // FIXME: See if we can use the new declaration importing support to |
5530 | // at least get the declarations imported for this case. Hot indirect |
5531 | // targets should have been imported normally, however. |
5532 | continue; |
5533 | } |
5534 | |
5535 | // Check if legal to promote |
5536 | const char *Reason = nullptr; |
if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
ORE.emit([&]() {
5539 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote" , CB) |
5540 | << "Memprof cannot promote indirect call to " |
5541 | << ore::NV("TargetFunction" , TargetFunction) |
5542 | << " with count of " << ore::NV("TotalCount" , TotalCount) |
5543 | << ": " << Reason; |
5544 | }); |
5545 | continue; |
5546 | } |
5547 | |
5548 | assert(!isMemProfClone(*TargetFunction)); |
5549 | |
5550 | // Handle each call clone, applying ICP so that each clone directly |
5551 | // calls the specified callee clone, guarded by the appropriate ICP |
5552 | // check. |
5553 | CallBase *CBClone = CB; |
5554 | for (unsigned J = 0; J < NumClones; J++) { |
5555 | // Copy 0 is the original function. |
5556 | if (J > 0) |
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5558 | // We do the promotion using the original name, so that the comparison |
5559 | // is against the name in the vtable. Then just below, change the new |
5560 | // direct call to call the cloned function. |
5561 | auto &DirectCall = |
pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
TotalCount, isSamplePGO, &ORE);
5564 | auto *TargetToUse = TargetFunction; |
5565 | // Call original if this version calls the original version of its |
5566 | // callee. |
5567 | if (StackNode.Clones[J]) { |
5568 | TargetToUse = |
cast<Function>(M.getOrInsertFunction(
getMemProfFuncName(TargetFunction->getName(),
StackNode.Clones[J]),
TargetFunction->getFunctionType())
5573 | .getCallee()); |
5574 | } |
5575 | DirectCall.setCalledFunction(TargetToUse); |
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CBClone)
5577 | << ore::NV("Call" , CBClone) << " in clone " |
5578 | << ore::NV("Caller" , CBClone->getFunction()) |
5579 | << " promoted and assigned to call function clone " |
5580 | << ore::NV("Callee" , TargetToUse)); |
5581 | } |
5582 | |
5583 | // Update TotalCount (all clones should get same count above) |
5584 | TotalCount -= Candidate.Count; |
5585 | NumPromoted++; |
5586 | } |
5587 | // Adjust the MD.prof metadata for all clones, now that we have the new |
5588 | // TotalCount and the number promoted. |
5589 | CallBase *CBClone = CB; |
5590 | for (unsigned J = 0; J < NumClones; J++) { |
5591 | // Copy 0 is the original function. |
5592 | if (J > 0) |
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
// First delete the old one.
CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
// If all promoted, we don't need the MD.prof metadata.
// Otherwise we need to update it with the un-promoted records.
if (TotalCount != 0)
annotateValueSite(
M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
5602 | } |
5603 | } |
5604 | } |
5605 | |
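// Top-level driver for whole-program context disambiguation on the graph:
// identify the needed callsite/allocation clones, then assign them to function
// clones, with optional dumping, dot export, and verification between phases.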
5606 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
5607 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { |
5608 | if (DumpCCG) { |
5609 | dbgs() << "CCG before cloning:\n" ; |
5610 | dbgs() << *this; |
5611 | } |
5612 | if (ExportToDot) |
exportToDot("postbuild" );
5614 | |
5615 | if (VerifyCCG) { |
5616 | check(); |
5617 | } |
5618 | |
5619 | identifyClones(); |
5620 | |
5621 | if (VerifyCCG) { |
5622 | check(); |
5623 | } |
5624 | |
5625 | if (DumpCCG) { |
5626 | dbgs() << "CCG after cloning:\n" ; |
5627 | dbgs() << *this; |
5628 | } |
5629 | if (ExportToDot) |
exportToDot("cloned" );
5631 | |
5632 | bool Changed = assignFunctions(); |
5633 | |
5634 | if (DumpCCG) { |
5635 | dbgs() << "CCG after assigning function clones:\n" ; |
5636 | dbgs() << *this; |
5637 | } |
5638 | if (ExportToDot) |
exportToDot("clonefuncassign" );
5640 | |
5641 | if (MemProfReportHintedSizes) |
printTotalSizes(errs());
5643 | |
5644 | return Changed; |
5645 | } |
5646 | |
bool MemProfContextDisambiguation::processModule(
Module &M,
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5650 | |
5651 | // If we have an import summary, then the cloning decisions were made during |
5652 | // the thin link on the index. Apply them and return. |
5653 | if (ImportSummary) |
5654 | return applyImport(M); |
5655 | |
5656 | // TODO: If/when other types of memprof cloning are enabled beyond just for |
5657 | // hot and cold, we will need to change this to individually control the |
5658 | // AllocationType passed to addStackNodesForMIB during CCG construction. |
5659 | // Note that we specifically check this after applying imports above, so that |
5660 | // the option isn't needed to be passed to distributed ThinLTO backend |
5661 | // clang processes, which won't necessarily have visibility into the linker |
5662 | // dependences. Instead the information is communicated from the LTO link to |
5663 | // the backends via the combined summary index. |
5664 | if (!SupportsHotColdNew) |
5665 | return false; |
5666 | |
5667 | ModuleCallsiteContextGraph CCG(M, OREGetter); |
5668 | return CCG.process(); |
5669 | } |
5670 | |
5671 | MemProfContextDisambiguation::MemProfContextDisambiguation( |
5672 | const ModuleSummaryIndex *Summary, bool isSamplePGO) |
5673 | : ImportSummary(Summary), isSamplePGO(isSamplePGO) { |
5674 | // Check the dot graph printing options once here, to make sure we have valid |
5675 | // and expected combinations. |
5676 | if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences()) |
llvm::report_fatal_error(
"-memprof-dot-scope=alloc requires -memprof-dot-alloc-id" );
if (DotGraphScope == DotScope::Context &&
!ContextIdForDot.getNumOccurrences())
llvm::report_fatal_error(
"-memprof-dot-scope=context requires -memprof-dot-context-id" );
if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
ContextIdForDot.getNumOccurrences())
llvm::report_fatal_error(
"-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
"-memprof-dot-context-id" );
5688 | if (ImportSummary) { |
5689 | // The MemProfImportSummary should only be used for testing ThinLTO |
5690 | // distributed backend handling via opt, in which case we don't have a |
5691 | // summary from the pass pipeline. |
5692 | assert(MemProfImportSummary.empty()); |
5693 | return; |
5694 | } |
5695 | if (MemProfImportSummary.empty()) |
5696 | return; |
5697 | |
5698 | auto ReadSummaryFile = |
errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
if (!ReadSummaryFile) {
logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
"Error loading file '" + MemProfImportSummary +
"': " );
return;
}
auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
if (!ImportSummaryForTestingOrErr) {
logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
"Error parsing file '" + MemProfImportSummary +
"': " );
5711 | return; |
5712 | } |
5713 | ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); |
5714 | ImportSummary = ImportSummaryForTesting.get(); |
5715 | } |
5716 | |
5717 | PreservedAnalyses MemProfContextDisambiguation::run(Module &M, |
5718 | ModuleAnalysisManager &AM) { |
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
5722 | }; |
5723 | if (!processModule(M, OREGetter)) |
5724 | return PreservedAnalyses::all(); |
5725 | return PreservedAnalyses::none(); |
5726 | } |
5727 | |
5728 | void MemProfContextDisambiguation::run( |
5729 | ModuleSummaryIndex &Index, |
5730 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
5731 | isPrevailing) { |
5732 | // TODO: If/when other types of memprof cloning are enabled beyond just for |
5733 | // hot and cold, we will need to change this to individually control the |
5734 | // AllocationType passed to addStackNodesForMIB during CCG construction. |
5735 | // The index was set from the option, so these should be in sync. |
5736 | assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); |
5737 | if (!SupportsHotColdNew) |
5738 | return; |
5739 | |
5740 | IndexCallsiteContextGraph CCG(Index, isPrevailing); |
5741 | CCG.process(); |
5742 | } |
5743 | |