1 | //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements support for context disambiguation of allocation |
10 | // calls for profile guided heap optimization. Specifically, it uses Memprof |
11 | // profiles which indicate context specific allocation behavior (currently |
12 | // distinguishing cold vs hot memory allocations). Cloning is performed to |
13 | // expose the cold allocation call contexts, and the allocation calls are |
14 | // subsequently annotated with an attribute for later transformation. |
15 | // |
16 | // The transformations can be performed either directly on IR (regular LTO), or |
17 | // on a ThinLTO index (and later applied to the IR during the ThinLTO backend). |
// Both types of LTO operate on the same base graph representation, which
19 | // uses CRTP to support either IR or Index formats. |
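//
// As a rough illustration only (this file does not emit it), an IR allocation
// call carrying MemProf profile information might look like:
//   %call = call ptr @_Znam(i64 10), !memprof !0, !callsite !3
//   !0 = !{!1}                ; list of MIBs (one per profiled context)
//   !1 = !{!2, !"cold"}       ; context call stack plus allocation type
//   !2 = !{i64 123, i64 456}  ; hypothetical stack ids, allocation site first
//   !3 = !{i64 123}           ; stack id(s) for this call site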
20 | // |
21 | //===----------------------------------------------------------------------===// |
22 | |
23 | #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" |
24 | #include "llvm/ADT/DenseMap.h" |
25 | #include "llvm/ADT/DenseSet.h" |
26 | #include "llvm/ADT/MapVector.h" |
27 | #include "llvm/ADT/SetOperations.h" |
28 | #include "llvm/ADT/SmallPtrSet.h" |
29 | #include "llvm/ADT/SmallSet.h" |
30 | #include "llvm/ADT/SmallVector.h" |
31 | #include "llvm/ADT/Statistic.h" |
32 | #include "llvm/Analysis/MemoryProfileInfo.h" |
33 | #include "llvm/Analysis/ModuleSummaryAnalysis.h" |
34 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
35 | #include "llvm/Bitcode/BitcodeReader.h" |
36 | #include "llvm/IR/Constants.h" |
37 | #include "llvm/IR/Instructions.h" |
38 | #include "llvm/IR/Module.h" |
39 | #include "llvm/IR/ModuleSummaryIndex.h" |
40 | #include "llvm/Pass.h" |
41 | #include "llvm/Support/CommandLine.h" |
42 | #include "llvm/Support/FileSystem.h" |
43 | #include "llvm/Support/GraphWriter.h" |
44 | #include "llvm/Support/raw_ostream.h" |
45 | #include "llvm/Transforms/IPO.h" |
46 | #include "llvm/Transforms/Utils/Cloning.h" |
47 | #include <deque> |
48 | #include <sstream> |
49 | #include <unordered_map> |
50 | #include <vector> |
51 | using namespace llvm; |
52 | using namespace llvm::memprof; |
53 | |
54 | #define DEBUG_TYPE "memprof-context-disambiguation" |
55 | |
STATISTIC(FunctionClonesAnalysis,
"Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
"Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
"Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
"cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
"during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
"Number of not cold static allocations (possibly cloned) during "
"ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
"(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
"Number of original (not cloned) allocations with memprof profiles "
"during ThinLTO backend");
STATISTIC(
AllocVersionsThinBackend,
"Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
"Maximum number of allocation versions created for an original "
"allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
"Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
"Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
"Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
"Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
"Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
"Number of profiled callees found via multiple tail call chains");
92 | |
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
cl::value_desc("filename"),
cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
cl::Hidden,
cl::desc("Export graph to dot files."));

static cl::opt<bool>
DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
"memprof-import-summary",
cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
cl::Hidden);

static cl::opt<unsigned>
TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
cl::Hidden,
cl::desc("Max depth to recursively search for missing "
"frames through tail calls."));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
"supports-hot-cold-new", cl::init(false), cl::Hidden,
cl::desc("Linking with hot/cold operator new interfaces"));
135 | } // namespace llvm |
136 | |
137 | extern cl::opt<bool> MemProfReportHintedSizes; |
138 | |
139 | namespace { |
140 | /// CRTP base for graphs built from either IR or ThinLTO summary index. |
141 | /// |
142 | /// The graph represents the call contexts in all memprof metadata on allocation |
143 | /// calls, with nodes for the allocations themselves, as well as for the calls |
144 | /// in each context. The graph is initially built from the allocation memprof |
/// metadata (or summary) MIBs. Calls with callsite metadata are then matched
/// onto these nodes, and the graph is updated to reflect any inlining
/// performed on those calls.
148 | /// |
149 | /// Each MIB (representing an allocation's call context with allocation |
150 | /// behavior) is assigned a unique context id during the graph build. The edges |
151 | /// and nodes in the graph are decorated with the context ids they carry. This |
152 | /// is used to correctly update the graph when cloning is performed so that we |
153 | /// can uniquify the context for a single (possibly cloned) allocation. |
154 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
155 | class CallsiteContextGraph { |
156 | public: |
157 | CallsiteContextGraph() = default; |
158 | CallsiteContextGraph(const CallsiteContextGraph &) = default; |
159 | CallsiteContextGraph(CallsiteContextGraph &&) = default; |
160 | |
161 | /// Main entry point to perform analysis and transformations on graph. |
162 | bool process(); |
163 | |
164 | /// Perform cloning on the graph necessary to uniquely identify the allocation |
165 | /// behavior of an allocation based on its context. |
166 | void identifyClones(); |
167 | |
168 | /// Assign callsite clones to functions, cloning functions as needed to |
169 | /// accommodate the combinations of their callsite clones reached by callers. |
170 | /// For regular LTO this clones functions and callsites in the IR, but for |
171 | /// ThinLTO the cloning decisions are noted in the summaries and later applied |
172 | /// in applyImport. |
173 | bool assignFunctions(); |
174 | |
175 | void dump() const; |
176 | void print(raw_ostream &OS) const; |
177 | void printTotalSizes(raw_ostream &OS) const; |
178 | |
179 | friend raw_ostream &operator<<(raw_ostream &OS, |
180 | const CallsiteContextGraph &CCG) { |
181 | CCG.print(OS); |
182 | return OS; |
183 | } |
184 | |
185 | friend struct GraphTraits< |
186 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
187 | friend struct DOTGraphTraits< |
188 | const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
189 | |
190 | void exportToDot(std::string Label) const; |
191 | |
192 | /// Represents a function clone via FuncTy pointer and clone number pair. |
193 | struct FuncInfo final |
194 | : public std::pair<FuncTy *, unsigned /*Clone number*/> { |
195 | using Base = std::pair<FuncTy *, unsigned>; |
196 | FuncInfo(const Base &B) : Base(B) {} |
197 | FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} |
198 | explicit operator bool() const { return this->first != nullptr; } |
199 | FuncTy *func() const { return this->first; } |
200 | unsigned cloneNo() const { return this->second; } |
201 | }; |
202 | |
203 | /// Represents a callsite clone via CallTy and clone number pair. |
204 | struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> { |
205 | using Base = std::pair<CallTy, unsigned>; |
206 | CallInfo(const Base &B) : Base(B) {} |
207 | CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) |
208 | : Base(Call, CloneNo) {} |
209 | explicit operator bool() const { return (bool)this->first; } |
210 | CallTy call() const { return this->first; } |
211 | unsigned cloneNo() const { return this->second; } |
212 | void setCloneNo(unsigned N) { this->second = N; } |
213 | void print(raw_ostream &OS) const { |
214 | if (!operator bool()) { |
215 | assert(!cloneNo()); |
216 | OS << "null Call" ; |
217 | return; |
218 | } |
219 | call()->print(OS); |
220 | OS << "\t(clone " << cloneNo() << ")" ; |
221 | } |
222 | void dump() const { |
print(dbgs());
dbgs() << "\n";
225 | } |
226 | friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { |
227 | Call.print(OS); |
228 | return OS; |
229 | } |
230 | }; |
231 | |
232 | struct ContextEdge; |
233 | |
234 | /// Node in the Callsite Context Graph |
235 | struct ContextNode { |
236 | // Keep this for now since in the IR case where we have an Instruction* it |
237 | // is not as immediately discoverable. Used for printing richer information |
238 | // when dumping graph. |
239 | bool IsAllocation; |
240 | |
241 | // Keeps track of when the Call was reset to null because there was |
242 | // recursion. |
243 | bool Recursive = false; |
244 | |
245 | // The corresponding allocation or interior call. |
246 | CallInfo Call; |
247 | |
248 | // For alloc nodes this is a unique id assigned when constructed, and for |
249 | // callsite stack nodes it is the original stack id when the node is |
250 | // constructed from the memprof MIB metadata on the alloc nodes. Note that |
251 | // this is only used when matching callsite metadata onto the stack nodes |
252 | // created when processing the allocation memprof MIBs, and for labeling |
253 | // nodes in the dot graph. Therefore we don't bother to assign a value for |
254 | // clones. |
255 | uint64_t OrigStackOrAllocId = 0; |
256 | |
257 | // This will be formed by ORing together the AllocationType enum values |
258 | // for contexts including this node. |
259 | uint8_t AllocTypes = 0; |
260 | |
261 | // Edges to all callees in the profiled call stacks. |
262 | // TODO: Should this be a map (from Callee node) for more efficient lookup? |
263 | std::vector<std::shared_ptr<ContextEdge>> CalleeEdges; |
264 | |
265 | // Edges to all callers in the profiled call stacks. |
266 | // TODO: Should this be a map (from Caller node) for more efficient lookup? |
267 | std::vector<std::shared_ptr<ContextEdge>> CallerEdges; |
268 | |
269 | // Get the list of edges from which we can compute allocation information |
270 | // such as the context ids and allocation type of this node. |
271 | const std::vector<std::shared_ptr<ContextEdge>> * |
272 | getEdgesWithAllocInfo() const { |
273 | // If node has any callees, compute from those, otherwise compute from |
274 | // callers (i.e. if this is the leaf allocation node). |
275 | if (!CalleeEdges.empty()) |
276 | return &CalleeEdges; |
277 | if (!CallerEdges.empty()) { |
278 | // A node with caller edges but no callee edges must be the allocation |
279 | // node. |
280 | assert(IsAllocation); |
281 | return &CallerEdges; |
282 | } |
283 | return nullptr; |
284 | } |
285 | |
286 | // Compute the context ids for this node from the union of its edge context |
287 | // ids. |
288 | DenseSet<uint32_t> getContextIds() const { |
289 | DenseSet<uint32_t> ContextIds; |
290 | auto *Edges = getEdgesWithAllocInfo(); |
291 | if (!Edges) |
292 | return {}; |
293 | unsigned Count = 0; |
294 | for (auto &Edge : *Edges) |
295 | Count += Edge->getContextIds().size(); |
ContextIds.reserve(Count);
297 | for (auto &Edge : *Edges) |
298 | ContextIds.insert(Edge->getContextIds().begin(), |
299 | Edge->getContextIds().end()); |
300 | return ContextIds; |
301 | } |
302 | |
303 | // Compute the allocation type for this node from the OR of its edge |
304 | // allocation types. |
305 | uint8_t computeAllocType() const { |
306 | auto *Edges = getEdgesWithAllocInfo(); |
307 | if (!Edges) |
308 | return (uint8_t)AllocationType::None; |
309 | uint8_t BothTypes = |
310 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
311 | uint8_t AllocType = (uint8_t)AllocationType::None; |
312 | for (auto &Edge : *Edges) { |
313 | AllocType |= Edge->AllocTypes; |
314 | // Bail early if alloc type reached both, no further refinement. |
315 | if (AllocType == BothTypes) |
316 | return AllocType; |
317 | } |
318 | return AllocType; |
319 | } |
320 | |
321 | // The context ids set for this node is empty if its edge context ids are |
322 | // also all empty. |
323 | bool emptyContextIds() const { |
324 | auto *Edges = getEdgesWithAllocInfo(); |
325 | if (!Edges) |
326 | return true; |
327 | for (auto &Edge : *Edges) { |
328 | if (!Edge->getContextIds().empty()) |
329 | return false; |
330 | } |
331 | return true; |
332 | } |
333 | |
334 | // List of clones of this ContextNode, initially empty. |
335 | std::vector<ContextNode *> Clones; |
336 | |
337 | // If a clone, points to the original uncloned node. |
338 | ContextNode *CloneOf = nullptr; |
339 | |
340 | ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} |
341 | |
342 | ContextNode(bool IsAllocation, CallInfo C) |
343 | : IsAllocation(IsAllocation), Call(C) {} |
344 | |
345 | void addClone(ContextNode *Clone) { |
346 | if (CloneOf) { |
347 | CloneOf->Clones.push_back(Clone); |
348 | Clone->CloneOf = CloneOf; |
349 | } else { |
350 | Clones.push_back(Clone); |
351 | assert(!Clone->CloneOf); |
352 | Clone->CloneOf = this; |
353 | } |
354 | } |
355 | |
356 | ContextNode *getOrigNode() { |
357 | if (!CloneOf) |
358 | return this; |
359 | return CloneOf; |
360 | } |
361 | |
362 | void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
363 | unsigned int ContextId); |
364 | |
365 | ContextEdge *findEdgeFromCallee(const ContextNode *Callee); |
366 | ContextEdge *findEdgeFromCaller(const ContextNode *Caller); |
367 | void eraseCalleeEdge(const ContextEdge *Edge); |
368 | void eraseCallerEdge(const ContextEdge *Edge); |
369 | |
370 | void setCall(CallInfo C) { Call = C; } |
371 | |
372 | bool hasCall() const { return (bool)Call.call(); } |
373 | |
374 | void printCall(raw_ostream &OS) const { Call.print(OS); } |
375 | |
376 | // True if this node was effectively removed from the graph, in which case |
377 | // it should have an allocation type of None and empty context ids. |
378 | bool isRemoved() const { |
379 | assert((AllocTypes == (uint8_t)AllocationType::None) == |
380 | emptyContextIds()); |
381 | return AllocTypes == (uint8_t)AllocationType::None; |
382 | } |
383 | |
384 | void dump() const; |
385 | void print(raw_ostream &OS) const; |
386 | |
387 | friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { |
388 | Node.print(OS); |
389 | return OS; |
390 | } |
391 | }; |
392 | |
393 | /// Edge in the Callsite Context Graph from a ContextNode N to a caller or |
394 | /// callee. |
395 | struct ContextEdge { |
396 | ContextNode *Callee; |
397 | ContextNode *Caller; |
398 | |
399 | // This will be formed by ORing together the AllocationType enum values |
400 | // for contexts including this edge. |
401 | uint8_t AllocTypes = 0; |
402 | |
403 | // The set of IDs for contexts including this edge. |
404 | DenseSet<uint32_t> ContextIds; |
405 | |
406 | ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, |
407 | DenseSet<uint32_t> ContextIds) |
408 | : Callee(Callee), Caller(Caller), AllocTypes(AllocType), |
409 | ContextIds(std::move(ContextIds)) {} |
410 | |
411 | DenseSet<uint32_t> &getContextIds() { return ContextIds; } |
412 | |
413 | void dump() const; |
414 | void print(raw_ostream &OS) const; |
415 | |
416 | friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { |
417 | Edge.print(OS); |
418 | return OS; |
419 | } |
420 | }; |
421 | |
422 | /// Helpers to remove callee edges that have allocation type None (due to not |
423 | /// carrying any context ids) after transformations. |
424 | void removeNoneTypeCalleeEdges(ContextNode *Node); |
425 | void |
426 | recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node, |
427 | DenseSet<const ContextNode *> &Visited); |
428 | |
429 | protected: |
430 | /// Get a list of nodes corresponding to the stack ids in the given callsite |
431 | /// context. |
432 | template <class NodeT, class IteratorT> |
433 | std::vector<uint64_t> |
434 | getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext); |
435 | |
436 | /// Adds nodes for the given allocation and any stack ids on its memprof MIB |
437 | /// metadata (or summary). |
438 | ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); |
439 | |
440 | /// Adds nodes for the given MIB stack ids. |
441 | template <class NodeT, class IteratorT> |
442 | void addStackNodesForMIB(ContextNode *AllocNode, |
443 | CallStack<NodeT, IteratorT> &StackContext, |
444 | CallStack<NodeT, IteratorT> &CallsiteContext, |
445 | AllocationType AllocType, uint64_t TotalSize); |
446 | |
447 | /// Matches all callsite metadata (or summary) to the nodes created for |
448 | /// allocation memprof MIB metadata, synthesizing new nodes to reflect any |
449 | /// inlining performed on those callsite instructions. |
450 | void updateStackNodes(); |
451 | |
452 | /// Update graph to conservatively handle any callsite stack nodes that target |
453 | /// multiple different callee target functions. |
454 | void handleCallsitesWithMultipleTargets(); |
455 | |
456 | /// Save lists of calls with MemProf metadata in each function, for faster |
457 | /// iteration. |
458 | MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata; |
459 | |
460 | /// Map from callsite node to the enclosing caller function. |
461 | std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc; |
462 | |
463 | private: |
464 | using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator; |
465 | |
466 | using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>, |
467 | const FuncTy *, DenseSet<uint32_t>>; |
468 | |
469 | /// Assigns the given Node to calls at or inlined into the location with |
470 | /// the Node's stack id, after post order traversing and processing its |
471 | /// caller nodes. Uses the call information recorded in the given |
472 | /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences |
473 | /// as needed. Called by updateStackNodes which sets up the given |
474 | /// StackIdToMatchingCalls map. |
475 | void assignStackNodesPostOrder( |
476 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
477 | DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls); |
478 | |
479 | /// Duplicates the given set of context ids, updating the provided |
480 | /// map from each original id with the newly generated context ids, |
481 | /// and returning the new duplicated id set. |
482 | DenseSet<uint32_t> duplicateContextIds( |
483 | const DenseSet<uint32_t> &StackSequenceContextIds, |
484 | DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
485 | |
486 | /// Propagates all duplicated context ids across the graph. |
487 | void propagateDuplicateContextIds( |
488 | const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
489 | |
490 | /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, |
491 | /// else to its callers. Also updates OrigNode's edges to remove any context |
492 | /// ids moved to the newly created edge. |
493 | void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, |
494 | bool TowardsCallee, |
495 | DenseSet<uint32_t> RemainingContextIds); |
496 | |
497 | /// Get the stack id corresponding to the given Id or Index (for IR this will |
498 | /// return itself, for a summary index this will return the id recorded in the |
499 | /// index for that stack id index value). |
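/// (This and the wrappers below use the CRTP pattern: the base class casts
/// itself to DerivedCCG so the IR or summary index implementation is chosen
/// statically, without virtual dispatch.)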
500 | uint64_t getStackId(uint64_t IdOrIndex) const { |
501 | return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex); |
502 | } |
503 | |
504 | /// Returns true if the given call targets the callee of the given edge, or if |
505 | /// we were able to identify the call chain through intermediate tail calls. |
506 | /// In the latter case new context nodes are added to the graph for the |
507 | /// identified tail calls, and their synthesized nodes are added to |
508 | /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for |
509 | /// the updated edges and to prepare it for an increment in the caller. |
510 | bool |
511 | calleesMatch(CallTy Call, EdgeIter &EI, |
512 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap); |
513 | |
514 | /// Returns true if the given call targets the given function, or if we were |
515 | /// able to identify the call chain through intermediate tail calls (in which |
516 | /// case FoundCalleeChain will be populated). |
517 | bool calleeMatchesFunc( |
518 | CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc, |
519 | std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) { |
520 | return static_cast<DerivedCCG *>(this)->calleeMatchesFunc( |
521 | Call, Func, CallerFunc, FoundCalleeChain); |
522 | } |
523 | |
524 | /// Get a list of nodes corresponding to the stack ids in the given |
525 | /// callsite's context. |
526 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { |
527 | return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall( |
528 | Call); |
529 | } |
530 | |
531 | /// Get the last stack id in the context for callsite. |
532 | uint64_t getLastStackId(CallTy Call) { |
533 | return static_cast<DerivedCCG *>(this)->getLastStackId(Call); |
534 | } |
535 | |
536 | /// Update the allocation call to record type of allocated memory. |
537 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { |
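// Bump the matching statistic, then let the derived class annotate the call.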
538 | AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; |
539 | static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType); |
540 | } |
541 | |
542 | /// Update non-allocation call to invoke (possibly cloned) function |
543 | /// CalleeFunc. |
544 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { |
545 | static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc); |
546 | } |
547 | |
/// Clone the given function for the given callsite, recording the mapping of
/// all of the function's tracked calls to their new versions in the CallMap.
550 | /// Assigns new clones to clone number CloneNo. |
551 | FuncInfo cloneFunctionForCallsite( |
552 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
553 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
554 | return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( |
555 | Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); |
556 | } |
557 | |
558 | /// Gets a label to use in the dot graph for the given call clone in the given |
559 | /// function. |
560 | std::string getLabel(const FuncTy *Func, const CallTy Call, |
561 | unsigned CloneNo) const { |
562 | return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo); |
563 | } |
564 | |
565 | /// Helpers to find the node corresponding to the given call or stackid. |
566 | ContextNode *getNodeForInst(const CallInfo &C); |
567 | ContextNode *getNodeForAlloc(const CallInfo &C); |
568 | ContextNode *getNodeForStackId(uint64_t StackId); |
569 | |
570 | /// Computes the alloc type corresponding to the given context ids, by |
571 | /// unioning their recorded alloc types. |
572 | uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds); |
573 | |
574 | /// Returns the allocation type of the intersection of the contexts of two |
575 | /// nodes (based on their provided context id sets), optimized for the case |
576 | /// when Node1Ids is smaller than Node2Ids. |
577 | uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids, |
578 | const DenseSet<uint32_t> &Node2Ids); |
579 | |
580 | /// Returns the allocation type of the intersection of the contexts of two |
581 | /// nodes (based on their provided context id sets). |
582 | uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids, |
583 | const DenseSet<uint32_t> &Node2Ids); |
584 | |
585 | /// Create a clone of Edge's callee and move Edge to that new callee node, |
586 | /// performing the necessary context id and allocation type updates. |
587 | /// If callee's caller edge iterator is supplied, it is updated when removing |
588 | /// the edge from that list. If ContextIdsToMove is non-empty, only that |
589 | /// subset of Edge's ids are moved to an edge to the new callee. |
590 | ContextNode * |
591 | moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
592 | EdgeIter *CallerEdgeI = nullptr, |
593 | DenseSet<uint32_t> ContextIdsToMove = {}); |
594 | |
595 | /// Change the callee of Edge to existing callee clone NewCallee, performing |
596 | /// the necessary context id and allocation type updates. |
597 | /// If callee's caller edge iterator is supplied, it is updated when removing |
598 | /// the edge from that list. If ContextIdsToMove is non-empty, only that |
599 | /// subset of Edge's ids are moved to an edge to the new callee. |
600 | void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
601 | ContextNode *NewCallee, |
602 | EdgeIter *CallerEdgeI = nullptr, |
603 | bool NewClone = false, |
604 | DenseSet<uint32_t> ContextIdsToMove = {}); |
605 | |
606 | /// Recursively perform cloning on the graph for the given Node and its |
607 | /// callers, in order to uniquely identify the allocation behavior of an |
608 | /// allocation given its context. The context ids of the allocation being |
609 | /// processed are given in AllocContextIds. |
610 | void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
611 | const DenseSet<uint32_t> &AllocContextIds); |
612 | |
613 | /// Map from each context ID to the AllocationType assigned to that context. |
614 | DenseMap<uint32_t, AllocationType> ContextIdToAllocationType; |
615 | |
616 | /// Map from each contextID to the profiled aggregate allocation size, |
617 | /// optionally populated when requested (via MemProfReportHintedSizes). |
618 | DenseMap<uint32_t, uint64_t> ContextIdToTotalSize; |
619 | |
620 | /// Identifies the context node created for a stack id when adding the MIB |
621 | /// contexts to the graph. This is used to locate the context nodes when |
622 | /// trying to assign the corresponding callsites with those stack ids to these |
623 | /// nodes. |
624 | DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; |
625 | |
626 | /// Maps to track the calls to their corresponding nodes in the graph. |
627 | MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; |
628 | MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; |
629 | |
630 | /// Owner of all ContextNode unique_ptrs. |
631 | std::vector<std::unique_ptr<ContextNode>> NodeOwner; |
632 | |
633 | /// Perform sanity checks on graph when requested. |
634 | void check() const; |
635 | |
636 | /// Keeps track of the last unique context id assigned. |
637 | unsigned int LastContextId = 0; |
638 | }; |
639 | |
640 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
641 | using ContextNode = |
642 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode; |
643 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
644 | using ContextEdge = |
645 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge; |
646 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
647 | using FuncInfo = |
648 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo; |
649 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
650 | using CallInfo = |
651 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo; |
652 | |
653 | /// CRTP derived class for graphs built from IR (regular LTO). |
654 | class ModuleCallsiteContextGraph |
655 | : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
656 | Instruction *> { |
657 | public: |
658 | ModuleCallsiteContextGraph( |
659 | Module &M, |
660 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); |
661 | |
662 | private: |
663 | friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
664 | Instruction *>; |
665 | |
666 | uint64_t getStackId(uint64_t IdOrIndex) const; |
667 | bool calleeMatchesFunc( |
668 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
669 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain); |
670 | bool findProfiledCalleeThroughTailCalls( |
671 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
672 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
673 | bool &FoundMultipleCalleeChains); |
674 | uint64_t getLastStackId(Instruction *Call); |
675 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); |
676 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
677 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
678 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
679 | Instruction *>::FuncInfo |
680 | cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
681 | std::map<CallInfo, CallInfo> &CallMap, |
682 | std::vector<CallInfo> &CallsWithMetadataInFunc, |
683 | unsigned CloneNo); |
684 | std::string getLabel(const Function *Func, const Instruction *Call, |
685 | unsigned CloneNo) const; |
686 | |
687 | const Module &Mod; |
688 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; |
689 | }; |
690 | |
691 | /// Represents a call in the summary index graph, which can either be an |
692 | /// allocation or an interior callsite node in an allocation's context. |
693 | /// Holds a pointer to the corresponding data structure in the index. |
694 | struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { |
695 | IndexCall() : PointerUnion() {} |
696 | IndexCall(std::nullptr_t) : IndexCall() {} |
697 | IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} |
698 | IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} |
699 | IndexCall(PointerUnion PT) : PointerUnion(PT) {} |
700 | |
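// IndexCall is passed around by value rather than as a real pointer, so
// forward operator-> to the object itself. This lets templated graph code use
// the same Call->print(...) style syntax for both the IR (Instruction *) and
// summary index (IndexCall) cases.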
701 | IndexCall *operator->() { return this; } |
702 | |
703 | PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; } |
704 | |
705 | void print(raw_ostream &OS) const { |
if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
707 | OS << *AI; |
708 | } else { |
auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
710 | assert(CI); |
711 | OS << *CI; |
712 | } |
713 | } |
714 | }; |
715 | |
716 | /// CRTP derived class for graphs built from summary index (ThinLTO). |
717 | class IndexCallsiteContextGraph |
718 | : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
719 | IndexCall> { |
720 | public: |
721 | IndexCallsiteContextGraph( |
722 | ModuleSummaryIndex &Index, |
723 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
724 | isPrevailing); |
725 | |
726 | ~IndexCallsiteContextGraph() { |
727 | // Now that we are done with the graph it is safe to add the new |
728 | // CallsiteInfo structs to the function summary vectors. The graph nodes |
729 | // point into locations within these vectors, so we don't want to add them |
730 | // any earlier. |
731 | for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) { |
732 | auto *FS = I.first; |
733 | for (auto &Callsite : I.second) |
FS->addCallsite(*Callsite.second);
735 | } |
736 | } |
737 | |
738 | private: |
739 | friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
740 | IndexCall>; |
741 | |
742 | uint64_t getStackId(uint64_t IdOrIndex) const; |
743 | bool calleeMatchesFunc( |
744 | IndexCall &Call, const FunctionSummary *Func, |
745 | const FunctionSummary *CallerFunc, |
746 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain); |
747 | bool findProfiledCalleeThroughTailCalls( |
748 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
749 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
750 | bool &FoundMultipleCalleeChains); |
751 | uint64_t getLastStackId(IndexCall &Call); |
752 | std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); |
753 | void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
754 | void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
755 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
756 | IndexCall>::FuncInfo |
757 | cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
758 | std::map<CallInfo, CallInfo> &CallMap, |
759 | std::vector<CallInfo> &CallsWithMetadataInFunc, |
760 | unsigned CloneNo); |
761 | std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, |
762 | unsigned CloneNo) const; |
763 | |
764 | // Saves mapping from function summaries containing memprof records back to |
765 | // its VI, for use in checking and debugging. |
766 | std::map<const FunctionSummary *, ValueInfo> FSToVIMap; |
767 | |
768 | const ModuleSummaryIndex &Index; |
769 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
770 | isPrevailing; |
771 | |
772 | // Saves/owns the callsite info structures synthesized for missing tail call |
773 | // frames that we discover while building the graph. |
774 | // It maps from the summary of the function making the tail call, to a map |
775 | // of callee ValueInfo to corresponding synthesized callsite info. |
776 | std::unordered_map<FunctionSummary *, |
777 | std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>> |
778 | FunctionCalleesToSynthesizedCallsiteInfos; |
779 | }; |
780 | } // namespace |
781 | |
782 | namespace llvm { |
783 | template <> |
784 | struct DenseMapInfo<typename CallsiteContextGraph< |
785 | ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> |
786 | : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; |
787 | template <> |
788 | struct DenseMapInfo<typename CallsiteContextGraph< |
789 | IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> |
790 | : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; |
791 | template <> |
792 | struct DenseMapInfo<IndexCall> |
793 | : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; |
794 | } // end namespace llvm |
795 | |
796 | namespace { |
797 | |
798 | struct FieldSeparator { |
799 | bool Skip = true; |
800 | const char *Sep; |
801 | |
FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
803 | }; |
804 | |
805 | raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { |
806 | if (FS.Skip) { |
807 | FS.Skip = false; |
808 | return OS; |
809 | } |
810 | return OS << FS.Sep; |
811 | } |
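// Usage sketch (illustrative): a FieldSeparator prints nothing the first time
// it is streamed and its separator on each subsequent use, e.g.
//   FieldSeparator FS;
//   for (auto Id : Ids)
//     OS << FS << Id; // emits "1, 2, 3" rather than ", 1, 2, 3"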
812 | |
813 | // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc |
814 | // type we should actually use on the corresponding allocation. |
815 | // If we can't clone a node that has NotCold+Cold alloc type, we will fall |
816 | // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold |
817 | // from NotCold. |
818 | AllocationType allocTypeToUse(uint8_t AllocTypes) { |
819 | assert(AllocTypes != (uint8_t)AllocationType::None); |
820 | if (AllocTypes == |
821 | ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
822 | return AllocationType::NotCold; |
823 | else |
824 | return (AllocationType)AllocTypes; |
825 | } |
826 | |
827 | // Helper to check if the alloc types for all edges recorded in the |
828 | // InAllocTypes vector match the alloc types for all edges in the Edges |
829 | // vector. |
830 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
831 | bool allocTypesMatch( |
832 | const std::vector<uint8_t> &InAllocTypes, |
833 | const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>> |
834 | &Edges) { |
835 | return std::equal( |
836 | InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), |
837 | [](const uint8_t &l, |
838 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) { |
839 | // Can share if one of the edges is None type - don't |
840 | // care about the type along that edge as it doesn't |
841 | // exist for those context ids. |
842 | if (l == (uint8_t)AllocationType::None || |
843 | r->AllocTypes == (uint8_t)AllocationType::None) |
844 | return true; |
return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
846 | }); |
847 | } |
848 | |
849 | } // end anonymous namespace |
850 | |
851 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
852 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
853 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst( |
854 | const CallInfo &C) { |
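// Check the allocation call map first, then fall back to the map of
// non-allocation (interior callsite) calls.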
855 | ContextNode *Node = getNodeForAlloc(C); |
856 | if (Node) |
857 | return Node; |
858 | |
859 | return NonAllocationCallToContextNodeMap.lookup(C); |
860 | } |
861 | |
862 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
863 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
864 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc( |
865 | const CallInfo &C) { |
866 | return AllocationCallToContextNodeMap.lookup(C); |
867 | } |
868 | |
869 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
870 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
871 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId( |
872 | uint64_t StackId) { |
873 | auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); |
874 | if (StackEntryNode != StackEntryIdToContextNodeMap.end()) |
875 | return StackEntryNode->second; |
876 | return nullptr; |
877 | } |
878 | |
879 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
880 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
881 | addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
882 | unsigned int ContextId) { |
883 | for (auto &Edge : CallerEdges) { |
884 | if (Edge->Caller == Caller) { |
885 | Edge->AllocTypes |= (uint8_t)AllocType; |
886 | Edge->getContextIds().insert(ContextId); |
887 | return; |
888 | } |
889 | } |
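// No edge from this Caller exists yet; create one and register it on both
// this node's caller edge list and the Caller's callee edge list.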
890 | std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>( |
891 | this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId})); |
892 | CallerEdges.push_back(Edge); |
893 | Caller->CalleeEdges.push_back(Edge); |
894 | } |
895 | |
896 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
897 | void CallsiteContextGraph< |
898 | DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) { |
899 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { |
900 | auto Edge = *EI; |
901 | if (Edge->AllocTypes == (uint8_t)AllocationType::None) { |
902 | assert(Edge->ContextIds.empty()); |
903 | Edge->Callee->eraseCallerEdge(Edge.get()); |
904 | EI = Node->CalleeEdges.erase(EI); |
905 | } else |
906 | ++EI; |
907 | } |
908 | } |
909 | |
910 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
911 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
912 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
913 | findEdgeFromCallee(const ContextNode *Callee) { |
914 | for (const auto &Edge : CalleeEdges) |
915 | if (Edge->Callee == Callee) |
916 | return Edge.get(); |
917 | return nullptr; |
918 | } |
919 | |
920 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
921 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
922 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
923 | findEdgeFromCaller(const ContextNode *Caller) { |
924 | for (const auto &Edge : CallerEdges) |
925 | if (Edge->Caller == Caller) |
926 | return Edge.get(); |
927 | return nullptr; |
928 | } |
929 | |
930 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
931 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
932 | eraseCalleeEdge(const ContextEdge *Edge) { |
933 | auto EI = llvm::find_if( |
934 | CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) { |
935 | return CalleeEdge.get() == Edge; |
936 | }); |
937 | assert(EI != CalleeEdges.end()); |
938 | CalleeEdges.erase(EI); |
939 | } |
940 | |
941 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
942 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
943 | eraseCallerEdge(const ContextEdge *Edge) { |
944 | auto EI = llvm::find_if( |
945 | CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) { |
946 | return CallerEdge.get() == Edge; |
947 | }); |
948 | assert(EI != CallerEdges.end()); |
949 | CallerEdges.erase(EI); |
950 | } |
951 | |
952 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
953 | uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType( |
954 | DenseSet<uint32_t> &ContextIds) { |
955 | uint8_t BothTypes = |
956 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
957 | uint8_t AllocType = (uint8_t)AllocationType::None; |
958 | for (auto Id : ContextIds) { |
959 | AllocType |= (uint8_t)ContextIdToAllocationType[Id]; |
960 | // Bail early if alloc type reached both, no further refinement. |
961 | if (AllocType == BothTypes) |
962 | return AllocType; |
963 | } |
964 | return AllocType; |
965 | } |
966 | |
967 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
968 | uint8_t |
969 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl( |
970 | const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { |
971 | uint8_t BothTypes = |
972 | (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
973 | uint8_t AllocType = (uint8_t)AllocationType::None; |
974 | for (auto Id : Node1Ids) { |
if (!Node2Ids.count(Id))
976 | continue; |
977 | AllocType |= (uint8_t)ContextIdToAllocationType[Id]; |
978 | // Bail early if alloc type reached both, no further refinement. |
979 | if (AllocType == BothTypes) |
980 | return AllocType; |
981 | } |
982 | return AllocType; |
983 | } |
984 | |
985 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
986 | uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes( |
987 | const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { |
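// The impl iterates over its first argument and probes the second, so pass
// the smaller set first to minimize lookups.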
988 | if (Node1Ids.size() < Node2Ids.size()) |
989 | return intersectAllocTypesImpl(Node1Ids, Node2Ids); |
990 | else |
return intersectAllocTypesImpl(Node2Ids, Node1Ids);
992 | } |
993 | |
994 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
995 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
996 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode( |
997 | CallInfo Call, const FuncTy *F) { |
998 | assert(!getNodeForAlloc(Call)); |
999 | NodeOwner.push_back( |
1000 | std::make_unique<ContextNode>(/*IsAllocation=*/true, Call)); |
1001 | ContextNode *AllocNode = NodeOwner.back().get(); |
1002 | AllocationCallToContextNodeMap[Call] = AllocNode; |
1003 | NodeToCallingFunc[AllocNode] = F; |
// Use LastContextId as a unique id for MIB allocation nodes.
1005 | AllocNode->OrigStackOrAllocId = LastContextId; |
1006 | // Alloc type should be updated as we add in the MIBs. We should assert |
1007 | // afterwards that it is not still None. |
1008 | AllocNode->AllocTypes = (uint8_t)AllocationType::None; |
1009 | |
1010 | return AllocNode; |
1011 | } |
1012 | |
1013 | static std::string getAllocTypeString(uint8_t AllocTypes) { |
1014 | if (!AllocTypes) |
1015 | return "None" ; |
1016 | std::string Str; |
1017 | if (AllocTypes & (uint8_t)AllocationType::NotCold) |
1018 | Str += "NotCold" ; |
1019 | if (AllocTypes & (uint8_t)AllocationType::Cold) |
1020 | Str += "Cold" ; |
1021 | return Str; |
1022 | } |
1023 | |
1024 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1025 | template <class NodeT, class IteratorT> |
1026 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( |
1027 | ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, |
1028 | CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, |
1029 | uint64_t TotalSize) { |
1030 | assert(!MemProfReportHintedSizes || TotalSize > 0); |
1031 | // Treating the hot alloc type as NotCold before the disambiguation for "hot" |
1032 | // is done. |
1033 | if (AllocType == AllocationType::Hot) |
1034 | AllocType = AllocationType::NotCold; |
1035 | |
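// Assign a fresh context id for this MIB and record its allocation type; the
// id is attached to every node and edge added for this context below.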
1036 | ContextIdToAllocationType[++LastContextId] = AllocType; |
1037 | |
1038 | if (MemProfReportHintedSizes) { |
1039 | assert(TotalSize); |
1040 | ContextIdToTotalSize[LastContextId] = TotalSize; |
1041 | } |
1042 | |
1043 | // Update alloc type and context ids for this MIB. |
1044 | AllocNode->AllocTypes |= (uint8_t)AllocType; |
1045 | |
1046 | // Now add or update nodes for each stack id in alloc's context. |
1047 | // Later when processing the stack ids on non-alloc callsites we will adjust |
1048 | // for any inlining in the context. |
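// For example (illustrative only): if the remaining stack ids for this MIB
// are [A, B, C], ordered from the allocation outward, the loop below leaves
// the graph with caller edges Alloc <- node(A) <- node(B) <- node(C), each
// tagged with the newly assigned context id and this MIB's allocation type.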
1049 | ContextNode *PrevNode = AllocNode; |
1050 | // Look for recursion (direct recursion should have been collapsed by |
1051 | // module summary analysis, here we should just be detecting mutual |
1052 | // recursion). Mark these nodes so we don't try to clone. |
1053 | SmallSet<uint64_t, 8> StackIdSet; |
1054 | // Skip any on the allocation call (inlining). |
1055 | for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); |
1056 | ContextIter != StackContext.end(); ++ContextIter) { |
auto StackId = getStackId(*ContextIter);
1058 | ContextNode *StackNode = getNodeForStackId(StackId); |
1059 | if (!StackNode) { |
1060 | NodeOwner.push_back( |
1061 | std::make_unique<ContextNode>(/*IsAllocation=*/false)); |
1062 | StackNode = NodeOwner.back().get(); |
1063 | StackEntryIdToContextNodeMap[StackId] = StackNode; |
1064 | StackNode->OrigStackOrAllocId = StackId; |
1065 | } |
1066 | auto Ins = StackIdSet.insert(StackId); |
1067 | if (!Ins.second) |
1068 | StackNode->Recursive = true; |
1069 | StackNode->AllocTypes |= (uint8_t)AllocType; |
1070 | PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); |
1071 | PrevNode = StackNode; |
1072 | } |
1073 | } |
1074 | |
1075 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1076 | DenseSet<uint32_t> |
1077 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds( |
1078 | const DenseSet<uint32_t> &StackSequenceContextIds, |
1079 | DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
1080 | DenseSet<uint32_t> NewContextIds; |
1081 | for (auto OldId : StackSequenceContextIds) { |
NewContextIds.insert(++LastContextId);
OldToNewContextIds[OldId].insert(LastContextId);
1084 | assert(ContextIdToAllocationType.count(OldId)); |
1085 | // The new context has the same allocation type as original. |
1086 | ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; |
1087 | // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy |
1088 | // up the size. Assume that if we are able to duplicate context ids that we |
1089 | // will be able to disambiguate all copies. |
1090 | ContextIdToTotalSize[LastContextId] = 0; |
1091 | } |
1092 | return NewContextIds; |
1093 | } |
1094 | |
1095 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1096 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
1097 | propagateDuplicateContextIds( |
1098 | const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
1099 | // Build a set of duplicated context ids corresponding to the input id set. |
1100 | auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) { |
1101 | DenseSet<uint32_t> NewIds; |
1102 | for (auto Id : ContextIds) |
if (auto NewId = OldToNewContextIds.find(Id);
NewId != OldToNewContextIds.end())
NewIds.insert(NewId->second.begin(), NewId->second.end());
1106 | return NewIds; |
1107 | }; |
1108 | |
1109 | // Recursively update context ids sets along caller edges. |
1110 | auto UpdateCallers = [&](ContextNode *Node, |
1111 | DenseSet<const ContextEdge *> &Visited, |
1112 | auto &&UpdateCallers) -> void { |
1113 | for (const auto &Edge : Node->CallerEdges) { |
1114 | auto Inserted = Visited.insert(Edge.get()); |
1115 | if (!Inserted.second) |
1116 | continue; |
1117 | ContextNode *NextNode = Edge->Caller; |
1118 | DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds()); |
1119 | // Only need to recursively iterate to NextNode via this caller edge if |
1120 | // it resulted in any added ids to NextNode. |
1121 | if (!NewIdsToAdd.empty()) { |
1122 | Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); |
1123 | UpdateCallers(NextNode, Visited, UpdateCallers); |
1124 | } |
1125 | } |
1126 | }; |
1127 | |
1128 | DenseSet<const ContextEdge *> Visited; |
1129 | for (auto &Entry : AllocationCallToContextNodeMap) { |
1130 | auto *Node = Entry.second; |
1131 | UpdateCallers(Node, Visited, UpdateCallers); |
1132 | } |
1133 | } |
1134 | |
1135 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1136 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( |
1137 | ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee, |
1138 | // This must be passed by value to make a copy since it will be adjusted |
1139 | // as ids are moved. |
1140 | DenseSet<uint32_t> RemainingContextIds) { |
1141 | auto &OrigEdges = |
1142 | TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; |
1143 | // Increment iterator in loop so that we can remove edges as needed. |
1144 | for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { |
1145 | auto Edge = *EI; |
1146 | // Remove any matching context ids from Edge, return set that were found and |
1147 | // removed, these are the new edge's context ids. Also update the remaining |
1148 | // (not found ids). |
1149 | DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds; |
1150 | set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, |
1151 | NotFoundContextIds); |
RemainingContextIds.swap(NotFoundContextIds);
1153 | // If no matching context ids for this edge, skip it. |
1154 | if (NewEdgeContextIds.empty()) { |
1155 | ++EI; |
1156 | continue; |
1157 | } |
1158 | if (TowardsCallee) { |
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1160 | auto NewEdge = std::make_shared<ContextEdge>( |
1161 | Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds)); |
1162 | NewNode->CalleeEdges.push_back(NewEdge); |
1163 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
1164 | } else { |
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1166 | auto NewEdge = std::make_shared<ContextEdge>( |
1167 | NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds)); |
1168 | NewNode->CallerEdges.push_back(NewEdge); |
1169 | NewEdge->Caller->CalleeEdges.push_back(NewEdge); |
1170 | } |
1171 | // Remove old edge if context ids empty. |
1172 | if (Edge->getContextIds().empty()) { |
1173 | if (TowardsCallee) { |
1174 | Edge->Callee->eraseCallerEdge(Edge.get()); |
1175 | EI = OrigNode->CalleeEdges.erase(EI); |
1176 | } else { |
1177 | Edge->Caller->eraseCalleeEdge(Edge.get()); |
1178 | EI = OrigNode->CallerEdges.erase(EI); |
1179 | } |
1180 | continue; |
1181 | } |
1182 | ++EI; |
1183 | } |
1184 | } |
1185 | |
1186 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1187 | static void checkEdge( |
1188 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { |
1189 | // Confirm that alloc type is not None and that we have at least one context |
1190 | // id. |
1191 | assert(Edge->AllocTypes != (uint8_t)AllocationType::None); |
1192 | assert(!Edge->ContextIds.empty()); |
1193 | } |
1194 | |
1195 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1196 | static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, |
1197 | bool CheckEdges = true) { |
1198 | if (Node->isRemoved()) |
1199 | return; |
1200 | #ifndef NDEBUG |
1201 | // Compute node's context ids once for use in asserts. |
1202 | auto NodeContextIds = Node->getContextIds(); |
1203 | #endif |
1204 | // Node's context ids should be the union of both its callee and caller edge |
1205 | // context ids. |
1206 | if (Node->CallerEdges.size()) { |
1207 | DenseSet<uint32_t> CallerEdgeContextIds( |
1208 | Node->CallerEdges.front()->ContextIds); |
1209 | for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) { |
1210 | if (CheckEdges) |
1211 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
1212 | set_union(CallerEdgeContextIds, Edge->ContextIds); |
1213 | } |
1214 | // Node can have more context ids than callers if some contexts terminate at |
1215 | // node and some are longer. |
1216 | assert(NodeContextIds == CallerEdgeContextIds || |
1217 | set_is_subset(CallerEdgeContextIds, NodeContextIds)); |
1218 | } |
1219 | if (Node->CalleeEdges.size()) { |
1220 | DenseSet<uint32_t> CalleeEdgeContextIds( |
1221 | Node->CalleeEdges.front()->ContextIds); |
1222 | for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) { |
1223 | if (CheckEdges) |
1224 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
1225 | set_union(CalleeEdgeContextIds, Edge->getContextIds()); |
1226 | } |
1227 | assert(NodeContextIds == CalleeEdgeContextIds); |
1228 | } |
1229 | } |
1230 | |
1231 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1232 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
1233 | assignStackNodesPostOrder(ContextNode *Node, |
1234 | DenseSet<const ContextNode *> &Visited, |
1235 | DenseMap<uint64_t, std::vector<CallContextInfo>> |
1236 | &StackIdToMatchingCalls) { |
1237 | auto Inserted = Visited.insert(Node); |
1238 | if (!Inserted.second) |
1239 | return; |
1240 | // Post order traversal. Iterate over a copy since we may add nodes and |
1241 | // therefore new callers during the recursive call, invalidating any |
1242 | // iterator over the original edge vector. We don't need to process these |
1243 | // new nodes as they were already processed on creation. |
1244 | auto CallerEdges = Node->CallerEdges; |
1245 | for (auto &Edge : CallerEdges) { |
1246 | // Skip any that have been removed during the recursion. |
1247 | if (!Edge) |
1248 | continue; |
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1250 | } |
1251 | |
1252 | // If this node's stack id is in the map, update the graph to contain new |
1253 | // nodes representing any inlining at interior callsites. Note we move the |
1254 | // associated context ids over to the new nodes. |
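// Illustrative example: if a call to B was inlined into A, a single IR
// callsite may match the stack id sequence [B, A]; a new node is created for
// that callsite and the context ids common to the B->A node sequence are
// moved onto it.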
1255 | |
1256 | // Ignore this node if it is for an allocation or we didn't record any |
1257 | // stack id lists ending at it. |
1258 | if (Node->IsAllocation || |
1259 | !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) |
1260 | return; |
1261 | |
1262 | auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; |
1263 | // Handle the simple case first. A single call with a single stack id. |
1264 | // In this case there is no need to create any new context nodes, simply |
1265 | // assign the context node for stack id to this Call. |
1266 | if (Calls.size() == 1) { |
1267 | auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; |
1268 | if (Ids.size() == 1) { |
1269 | assert(SavedContextIds.empty()); |
1270 | // It should be this Node |
1271 | assert(Node == getNodeForStackId(Ids[0])); |
1272 | if (Node->Recursive) |
1273 | return; |
1274 | Node->setCall(Call); |
1275 | NonAllocationCallToContextNodeMap[Call] = Node; |
1276 | NodeToCallingFunc[Node] = Func; |
1277 | return; |
1278 | } |
1279 | } |
1280 | |
1281 | // Find the node for the last stack id, which should be the same |
1282 | // across all calls recorded for this id, and is this node's id. |
1283 | uint64_t LastId = Node->OrigStackOrAllocId; |
ContextNode *LastNode = getNodeForStackId(LastId);
1285 | // We should only have kept stack ids that had nodes. |
1286 | assert(LastNode); |
1287 | |
1288 | for (unsigned I = 0; I < Calls.size(); I++) { |
1289 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1290 | // Skip any for which we didn't assign any ids, these don't get a node in |
1291 | // the graph. |
1292 | if (SavedContextIds.empty()) |
1293 | continue; |
1294 | |
1295 | assert(LastId == Ids.back()); |
1296 | |
ContextNode *FirstNode = getNodeForStackId(Ids[0]);
assert(FirstNode);

// Recompute the context ids for this stack id sequence (the
// intersection of the context ids of the corresponding nodes).
// Start with the ids we saved in the map for this call, which could be
// duplicated context ids. We have to recompute because the saved context
// ids for different last nodes might overlap, and some may already have
// been removed during the post order traversal.
set_intersect(SavedContextIds, FirstNode->getContextIds());
ContextNode *PrevNode = nullptr;
for (auto Id : Ids) {
ContextNode *CurNode = getNodeForStackId(Id);
1310 | // We should only have kept stack ids that had nodes and weren't |
1311 | // recursive. |
1312 | assert(CurNode); |
1313 | assert(!CurNode->Recursive); |
1314 | if (!PrevNode) { |
1315 | PrevNode = CurNode; |
1316 | continue; |
1317 | } |
1318 | auto *Edge = CurNode->findEdgeFromCallee(PrevNode); |
1319 | if (!Edge) { |
1320 | SavedContextIds.clear(); |
1321 | break; |
1322 | } |
1323 | PrevNode = CurNode; |
1324 | set_intersect(SavedContextIds, Edge->getContextIds()); |
1325 | |
// If we now have no context ids for the clone, skip this call.
1327 | if (SavedContextIds.empty()) |
1328 | break; |
1329 | } |
1330 | if (SavedContextIds.empty()) |
1331 | continue; |
1332 | |
1333 | // Create new context node. |
1334 | NodeOwner.push_back( |
1335 | std::make_unique<ContextNode>(/*IsAllocation=*/false, Call)); |
1336 | ContextNode *NewNode = NodeOwner.back().get(); |
1337 | NodeToCallingFunc[NewNode] = Func; |
1338 | NonAllocationCallToContextNodeMap[Call] = NewNode; |
NewNode->AllocTypes = computeAllocType(SavedContextIds);

// Connect to callees of innermost stack frame in inlined call chain.
// This updates context ids for FirstNode's callees to reflect those
// moved to NewNode.
connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);

// Connect to callers of outermost stack frame in inlined call chain.
// This updates context ids for LastNode's callers to reflect those
// moved to NewNode.
connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1350 | |
1351 | // Now we need to remove context ids from edges/nodes between First and |
1352 | // Last Node. |
1353 | PrevNode = nullptr; |
1354 | for (auto Id : Ids) { |
ContextNode *CurNode = getNodeForStackId(Id);
1356 | // We should only have kept stack ids that had nodes. |
1357 | assert(CurNode); |
1358 | |
1359 | // Remove the context ids moved to NewNode from CurNode, and the |
1360 | // edge from the prior node. |
1361 | if (PrevNode) { |
1362 | auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); |
1363 | assert(PrevEdge); |
1364 | set_subtract(PrevEdge->getContextIds(), SavedContextIds); |
1365 | if (PrevEdge->getContextIds().empty()) { |
1366 | PrevNode->eraseCallerEdge(PrevEdge); |
1367 | CurNode->eraseCalleeEdge(PrevEdge); |
1368 | } |
1369 | } |
1370 | // Since we update the edges from leaf to tail, only look at the callee |
1371 | // edges. This isn't an alloc node, so if there are no callee edges, the |
1372 | // alloc type is None. |
1373 | CurNode->AllocTypes = CurNode->CalleeEdges.empty() |
1374 | ? (uint8_t)AllocationType::None |
1375 | : CurNode->computeAllocType(); |
1376 | PrevNode = CurNode; |
1377 | } |
1378 | if (VerifyNodes) { |
1379 | checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); |
1380 | for (auto Id : Ids) { |
ContextNode *CurNode = getNodeForStackId(Id);
1382 | // We should only have kept stack ids that had nodes. |
1383 | assert(CurNode); |
1384 | checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true); |
1385 | } |
1386 | } |
1387 | } |
1388 | } |
1389 | |
1390 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1391 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { |
1392 | // Map of stack id to all calls with that as the last (outermost caller) |
1393 | // callsite id that has a context node (some might not due to pruning |
1394 | // performed during matching of the allocation profile contexts). |
1395 | // The CallContextInfo contains the Call and a list of its stack ids with |
1396 | // ContextNodes, the function containing Call, and the set of context ids |
1397 | // the analysis will eventually identify for use in any new node created |
1398 | // for that callsite. |
1399 | DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; |
1400 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
1401 | for (auto &Call : CallsWithMetadata) { |
1402 | // Ignore allocations, already handled. |
1403 | if (AllocationCallToContextNodeMap.count(Call)) |
1404 | continue; |
1405 | auto StackIdsWithContextNodes = |
getStackIdsWithContextNodesForCall(Call.call());
1407 | // If there were no nodes created for MIBs on allocs (maybe this was in |
1408 | // the unambiguous part of the MIB stack that was pruned), ignore. |
1409 | if (StackIdsWithContextNodes.empty()) |
1410 | continue; |
1411 | // Otherwise, record this Call along with the list of ids for the last |
1412 | // (outermost caller) stack id with a node. |
1413 | StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( |
1414 | {Call.call(), StackIdsWithContextNodes, Func, {}}); |
1415 | } |
1416 | } |
1417 | |
1418 | // First make a pass through all stack ids that correspond to a call, |
1419 | // as identified in the above loop. Compute the context ids corresponding to |
1420 | // each of these calls when they correspond to multiple stack ids due to |
// inlining. Perform any duplication of context ids required when
1422 | // there is more than one call with the same stack ids. Their (possibly newly |
1423 | // duplicated) context ids are saved in the StackIdToMatchingCalls map. |
1424 | DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; |
1425 | for (auto &It : StackIdToMatchingCalls) { |
1426 | auto &Calls = It.getSecond(); |
1427 | // Skip single calls with a single stack id. These don't need a new node. |
1428 | if (Calls.size() == 1) { |
1429 | auto &Ids = std::get<1>(Calls[0]); |
1430 | if (Ids.size() == 1) |
1431 | continue; |
1432 | } |
1433 | // In order to do the best and maximal matching of inlined calls to context |
1434 | // node sequences we will sort the vectors of stack ids in descending order |
1435 | // of length, and within each length, lexicographically by stack id. The |
1436 | // latter is so that we can specially handle calls that have identical stack |
1437 | // id sequences (either due to cloning or artificially because of the MIB |
1438 | // context pruning). |
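// Illustrative example: stack id sequences {1,2,3}, {1,2} and {1,3} sort as
// {1,2,3}, {1,2}, {1,3} (longest first, then lexicographically), so calls
// with identical sequences end up adjacent in Calls.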
1439 | std::stable_sort(Calls.begin(), Calls.end(), |
1440 | [](const CallContextInfo &A, const CallContextInfo &B) { |
1441 | auto &IdsA = std::get<1>(A); |
1442 | auto &IdsB = std::get<1>(B); |
1443 | return IdsA.size() > IdsB.size() || |
1444 | (IdsA.size() == IdsB.size() && IdsA < IdsB); |
1445 | }); |
1446 | |
1447 | // Find the node for the last stack id, which should be the same |
1448 | // across all calls recorded for this id, and is the id for this |
1449 | // entry in the StackIdToMatchingCalls map. |
1450 | uint64_t LastId = It.getFirst(); |
ContextNode *LastNode = getNodeForStackId(LastId);
1452 | // We should only have kept stack ids that had nodes. |
1453 | assert(LastNode); |
1454 | |
1455 | if (LastNode->Recursive) |
1456 | continue; |
1457 | |
1458 | // Initialize the context ids with the last node's. We will subsequently |
1459 | // refine the context ids by computing the intersection along all edges. |
1460 | DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); |
1461 | assert(!LastNodeContextIds.empty()); |
1462 | |
1463 | for (unsigned I = 0; I < Calls.size(); I++) { |
1464 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1465 | assert(SavedContextIds.empty()); |
1466 | assert(LastId == Ids.back()); |
1467 | |
1468 | // First compute the context ids for this stack id sequence (the |
1469 | // intersection of the context ids of the corresponding nodes). |
1470 | // Start with the remaining saved ids for the last node. |
1471 | assert(!LastNodeContextIds.empty()); |
1472 | DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; |
1473 | |
1474 | ContextNode *PrevNode = LastNode; |
1475 | ContextNode *CurNode = LastNode; |
1476 | bool Skip = false; |
1477 | |
1478 | // Iterate backwards through the stack Ids, starting after the last Id |
1479 | // in the list, which was handled once outside for all Calls. |
1480 | for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
1481 | auto Id = *IdIter; |
CurNode = getNodeForStackId(Id);
1483 | // We should only have kept stack ids that had nodes. |
1484 | assert(CurNode); |
1485 | |
1486 | if (CurNode->Recursive) { |
1487 | Skip = true; |
1488 | break; |
1489 | } |
1490 | |
1491 | auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
1492 | // If there is no edge then the nodes belong to different MIB contexts, |
1493 | // and we should skip this inlined context sequence. For example, this |
1494 | // particular inlined context may include stack ids A->B, and we may |
1495 | // indeed have nodes for both A and B, but it is possible that they were |
1496 | // never profiled in sequence in a single MIB for any allocation (i.e. |
1497 | // we might have profiled an allocation that involves the callsite A, |
1498 | // but through a different one of its callee callsites, and we might |
1499 | // have profiled an allocation that involves callsite B, but reached |
1500 | // from a different caller callsite). |
1501 | if (!Edge) { |
1502 | Skip = true; |
1503 | break; |
1504 | } |
1505 | PrevNode = CurNode; |
1506 | |
1507 | // Update the context ids, which is the intersection of the ids along |
1508 | // all edges in the sequence. |
1509 | set_intersect(StackSequenceContextIds, Edge->getContextIds()); |
1510 | |
// If we now have no context ids for the clone, skip this call.
1512 | if (StackSequenceContextIds.empty()) { |
1513 | Skip = true; |
1514 | break; |
1515 | } |
1516 | } |
1517 | if (Skip) |
1518 | continue; |
1519 | |
1520 | // If some of this call's stack ids did not have corresponding nodes (due |
1521 | // to pruning), don't include any context ids for contexts that extend |
1522 | // beyond these nodes. Otherwise we would be matching part of unrelated / |
1523 | // not fully matching stack contexts. To do this, subtract any context ids |
1524 | // found in caller nodes of the last node found above. |
1525 | if (Ids.back() != getLastStackId(Call)) { |
1526 | for (const auto &PE : LastNode->CallerEdges) { |
1527 | set_subtract(StackSequenceContextIds, PE->getContextIds()); |
1528 | if (StackSequenceContextIds.empty()) |
1529 | break; |
1530 | } |
// If we now have no context ids for the clone, skip this call.
1532 | if (StackSequenceContextIds.empty()) |
1533 | continue; |
1534 | } |
1535 | |
1536 | // Check if the next set of stack ids is the same (since the Calls vector |
1537 | // of tuples is sorted by the stack ids we can just look at the next one). |
1538 | bool DuplicateContextIds = false; |
1539 | if (I + 1 < Calls.size()) { |
1540 | auto NextIds = std::get<1>(Calls[I + 1]); |
1541 | DuplicateContextIds = Ids == NextIds; |
1542 | } |
1543 | |
1544 | // If we don't have duplicate context ids, then we can assign all the |
1545 | // context ids computed for the original node sequence to this call. |
1546 | // If there are duplicate calls with the same stack ids then we synthesize |
1547 | // new context ids that are duplicates of the originals. These are |
1548 | // assigned to SavedContextIds, which is a reference into the map entry |
1549 | // for this call, allowing us to access these ids later on. |
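// Illustrative example: if two distinct calls both match the sequence [B, A]
// whose intersection is ids {7,8}, one of the calls keeps the original ids
// while the other is given fresh duplicate ids (say {17,18}), recorded in
// OldToNewContextIds so the rest of the graph can be updated afterwards.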
OldToNewContextIds.reserve(OldToNewContextIds.size() +
1551 | StackSequenceContextIds.size()); |
1552 | SavedContextIds = |
1553 | DuplicateContextIds |
1554 | ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) |
1555 | : StackSequenceContextIds; |
1556 | assert(!SavedContextIds.empty()); |
1557 | |
1558 | if (!DuplicateContextIds) { |
1559 | // Update saved last node's context ids to remove those that are |
1560 | // assigned to other calls, so that it is ready for the next call at |
1561 | // this stack id. |
set_subtract(LastNodeContextIds, StackSequenceContextIds);
1563 | if (LastNodeContextIds.empty()) |
1564 | break; |
1565 | } |
1566 | } |
1567 | } |
1568 | |
1569 | // Propagate the duplicate context ids over the graph. |
1570 | propagateDuplicateContextIds(OldToNewContextIds); |
1571 | |
1572 | if (VerifyCCG) |
1573 | check(); |
1574 | |
1575 | // Now perform a post-order traversal over the graph, starting with the |
1576 | // allocation nodes, essentially processing nodes from callers to callees. |
1577 | // For any that contains an id in the map, update the graph to contain new |
1578 | // nodes representing any inlining at interior callsites. Note we move the |
1579 | // associated context ids over to the new nodes. |
1580 | DenseSet<const ContextNode *> Visited; |
1581 | for (auto &Entry : AllocationCallToContextNodeMap) |
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1583 | if (VerifyCCG) |
1584 | check(); |
1585 | } |
1586 | |
1587 | uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { |
1588 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
Call->getMetadata(LLVMContext::MD_callsite));
1590 | return CallsiteContext.back(); |
1591 | } |
1592 | |
1593 | uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { |
1594 | assert(isa<CallsiteInfo *>(Call.getBase())); |
1595 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
// Need to convert index into stack id.
return Index.getStackIdAtIndex(CallsiteContext.back());
1599 | } |
1600 | |
1601 | static const std::string MemProfCloneSuffix = ".memprof." ; |
1602 | |
1603 | static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { |
1604 | // We use CloneNo == 0 to refer to the original version, which doesn't get |
1605 | // renamed with a suffix. |
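// Illustrative example: a Base of "foo" with CloneNo 2 yields "foo.memprof.2".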
1606 | if (!CloneNo) |
1607 | return Base.str(); |
1608 | return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); |
1609 | } |
1610 | |
1611 | std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, |
1612 | const Instruction *Call, |
1613 | unsigned CloneNo) const { |
1614 | return (Twine(Call->getFunction()->getName()) + " -> " + |
cast<CallBase>(Call)->getCalledFunction()->getName())
1616 | .str(); |
1617 | } |
1618 | |
1619 | std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, |
1620 | const IndexCall &Call, |
1621 | unsigned CloneNo) const { |
auto VI = FSToVIMap.find(Func);
assert(VI != FSToVIMap.end());
if (isa<AllocInfo *>(Call.getBase()))
return (VI->second.name() + " -> alloc").str();
else {
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
return (VI->second.name() + " -> " +
getMemProfFuncName(Callsite->Callee.name(),
Callsite->Clones[CloneNo]))
1631 | .str(); |
1632 | } |
1633 | } |
1634 | |
1635 | std::vector<uint64_t> |
1636 | ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( |
1637 | Instruction *Call) { |
1638 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
Call->getMetadata(LLVMContext::MD_callsite));
1640 | return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( |
1641 | CallsiteContext); |
1642 | } |
1643 | |
1644 | std::vector<uint64_t> |
1645 | IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { |
1646 | assert(isa<CallsiteInfo *>(Call.getBase())); |
1647 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1649 | return getStackIdsWithContextNodes<CallsiteInfo, |
1650 | SmallVector<unsigned>::const_iterator>( |
1651 | CallsiteContext); |
1652 | } |
1653 | |
1654 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1655 | template <class NodeT, class IteratorT> |
1656 | std::vector<uint64_t> |
1657 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( |
1658 | CallStack<NodeT, IteratorT> &CallsiteContext) { |
1659 | std::vector<uint64_t> StackIds; |
1660 | for (auto IdOrIndex : CallsiteContext) { |
1661 | auto StackId = getStackId(IdOrIndex); |
1662 | ContextNode *Node = getNodeForStackId(StackId); |
1663 | if (!Node) |
1664 | break; |
1665 | StackIds.push_back(StackId); |
1666 | } |
1667 | return StackIds; |
1668 | } |
1669 | |
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1671 | Module &M, |
1672 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) |
1673 | : Mod(M), OREGetter(OREGetter) { |
1674 | for (auto &F : M) { |
1675 | std::vector<CallInfo> CallsWithMetadata; |
1676 | for (auto &BB : F) { |
1677 | for (auto &I : BB) { |
if (!isa<CallBase>(I))
continue;
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
CallsWithMetadata.push_back(&I);
auto *AllocNode = addAllocNode(&I, &F);
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
assert(CallsiteMD);
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
// Add all of the MIBs and their stack nodes.
for (auto &MDOp : MemProfMD->operands()) {
auto *MIBMD = cast<const MDNode>(MDOp);
MDNode *StackNode = getMIBStackNode(MIBMD);
assert(StackNode);
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
AllocNode, StackContext, CallsiteContext,
getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
}
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
// Memprof and callsite metadata on memory allocations no longer
// needed.
I.setMetadata(LLVMContext::MD_memprof, nullptr);
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
// For callsite metadata, add to list for this function for later use.
else if (I.getMetadata(LLVMContext::MD_callsite))
CallsWithMetadata.push_back(&I);
1705 | } |
1706 | } |
1707 | if (!CallsWithMetadata.empty()) |
1708 | FuncToCallsWithMetadata[&F] = CallsWithMetadata; |
1709 | } |
1710 | |
1711 | if (DumpCCG) { |
1712 | dbgs() << "CCG before updating call stack chains:\n" ; |
1713 | dbgs() << *this; |
1714 | } |
1715 | |
1716 | if (ExportToDot) |
exportToDot("prestackupdate");
1718 | |
1719 | updateStackNodes(); |
1720 | |
1721 | handleCallsitesWithMultipleTargets(); |
1722 | |
1723 | // Strip off remaining callsite metadata, no longer needed. |
1724 | for (auto &FuncEntry : FuncToCallsWithMetadata) |
1725 | for (auto &Call : FuncEntry.second) |
Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1727 | } |
1728 | |
1729 | IndexCallsiteContextGraph::IndexCallsiteContextGraph( |
1730 | ModuleSummaryIndex &Index, |
1731 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
1732 | isPrevailing) |
1733 | : Index(Index), isPrevailing(isPrevailing) { |
1734 | for (auto &I : Index) { |
auto VI = Index.getValueInfo(I);
1736 | for (auto &S : VI.getSummaryList()) { |
1737 | // We should only add the prevailing nodes. Otherwise we may try to clone |
1738 | // in a weak copy that won't be linked (and may be different than the |
1739 | // prevailing version). |
1740 | // We only keep the memprof summary on the prevailing copy now when |
1741 | // building the combined index, as a space optimization, however don't |
1742 | // rely on this optimization. The linker doesn't resolve local linkage |
1743 | // values so don't check whether those are prevailing. |
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1745 | !isPrevailing(VI.getGUID(), S.get())) |
1746 | continue; |
auto *FS = dyn_cast<FunctionSummary>(S.get());
1748 | if (!FS) |
1749 | continue; |
1750 | std::vector<CallInfo> CallsWithMetadata; |
1751 | if (!FS->allocs().empty()) { |
1752 | for (auto &AN : FS->mutableAllocs()) { |
1753 | // This can happen because of recursion elimination handling that |
1754 | // currently exists in ModuleSummaryAnalysis. Skip these for now. |
1755 | // We still added them to the summary because we need to be able to |
1756 | // correlate properly in applyImport in the backends. |
1757 | if (AN.MIBs.empty()) |
1758 | continue; |
CallsWithMetadata.push_back({&AN});
auto *AllocNode = addAllocNode({&AN}, FS);
1761 | // Pass an empty CallStack to the CallsiteContext (second) |
1762 | // parameter, since for ThinLTO we already collapsed out the inlined |
1763 | // stack ids on the allocation call during ModuleSummaryAnalysis. |
1764 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
1765 | EmptyContext; |
1766 | unsigned I = 0; |
1767 | assert(!MemProfReportHintedSizes || |
1768 | AN.TotalSizes.size() == AN.MIBs.size()); |
1769 | // Now add all of the MIBs and their stack nodes. |
1770 | for (auto &MIB : AN.MIBs) { |
1771 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
1772 | StackContext(&MIB); |
1773 | uint64_t TotalSize = 0; |
1774 | if (MemProfReportHintedSizes) |
1775 | TotalSize = AN.TotalSizes[I]; |
1776 | addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( |
AllocNode, StackContext, EmptyContext, MIB.AllocType,
1778 | TotalSize); |
1779 | I++; |
1780 | } |
1781 | assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
// Initialize version 0 on the summary alloc node to the current alloc
// type, unless it has both types, in which case make it default, so
// that if we aren't able to clone, the original version always ends up
// with the default allocation behavior.
AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1787 | } |
1788 | } |
1789 | // For callsite metadata, add to list for this function for later use. |
1790 | if (!FS->callsites().empty()) |
1791 | for (auto &SN : FS->mutableCallsites()) |
CallsWithMetadata.push_back({&SN});
1793 | |
1794 | if (!CallsWithMetadata.empty()) |
1795 | FuncToCallsWithMetadata[FS] = CallsWithMetadata; |
1796 | |
1797 | if (!FS->allocs().empty() || !FS->callsites().empty()) |
1798 | FSToVIMap[FS] = VI; |
1799 | } |
1800 | } |
1801 | |
1802 | if (DumpCCG) { |
1803 | dbgs() << "CCG before updating call stack chains:\n" ; |
1804 | dbgs() << *this; |
1805 | } |
1806 | |
1807 | if (ExportToDot) |
exportToDot("prestackupdate");
1809 | |
1810 | updateStackNodes(); |
1811 | |
1812 | handleCallsitesWithMultipleTargets(); |
1813 | } |
1814 | |
1815 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1816 | void CallsiteContextGraph<DerivedCCG, FuncTy, |
1817 | CallTy>::handleCallsitesWithMultipleTargets() { |
1818 | // Look for and workaround callsites that call multiple functions. |
// This can happen for indirect calls, which need better handling, and in
// rarer cases (e.g. macro expansion).
1821 | // TODO: To fix this for indirect calls we will want to perform speculative |
1822 | // devirtualization using either the normal PGO info with ICP, or using the |
1823 | // information in the profiled MemProf contexts. We can do this prior to |
1824 | // this transformation for regular LTO, and for ThinLTO we can simulate that |
1825 | // effect in the summary and perform the actual speculative devirtualization |
1826 | // while cloning in the ThinLTO backend. |
1827 | |
1828 | // Keep track of the new nodes synthesized for discovered tail calls missing |
1829 | // from the profiled contexts. |
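// Illustrative example: if the profiled contexts show A calling C directly,
// but in the IR A tail-calls B which tail-calls C, a node is synthesized for
// the B callsite and spliced into the graph between A's and C's nodes.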
1830 | MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; |
1831 | |
1832 | for (auto &Entry : NonAllocationCallToContextNodeMap) { |
1833 | auto *Node = Entry.second; |
1834 | assert(Node->Clones.empty()); |
1835 | // Check all node callees and see if in the same function. |
1836 | auto Call = Node->Call.call(); |
1837 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); |
1838 | ++EI) { |
1839 | auto Edge = *EI; |
1840 | if (!Edge->Callee->hasCall()) |
1841 | continue; |
1842 | assert(NodeToCallingFunc.count(Edge->Callee)); |
1843 | // Check if the called function matches that of the callee node. |
1844 | if (calleesMatch(Call, EI, TailCallToContextNodeMap)) |
1845 | continue; |
1846 | RemovedEdgesWithMismatchedCallees++; |
1847 | // Work around by setting Node to have a null call, so it gets |
1848 | // skipped during cloning. Otherwise assignFunctions will assert |
1849 | // because its data structures are not designed to handle this case. |
1850 | Node->setCall(CallInfo()); |
1851 | break; |
1852 | } |
1853 | } |
1854 | |
1855 | // Remove all mismatched nodes identified in the above loop from the node map |
1856 | // (checking whether they have a null call which is set above). For a |
1857 | // MapVector like NonAllocationCallToContextNodeMap it is much more efficient |
1858 | // to do the removal via remove_if than by individually erasing entries above. |
1859 | NonAllocationCallToContextNodeMap.remove_if( |
1860 | [](const auto &it) { return !it.second->hasCall(); }); |
1861 | |
1862 | // Add the new nodes after the above loop so that the iteration is not |
1863 | // invalidated. |
1864 | for (auto &[Call, Node] : TailCallToContextNodeMap) |
1865 | NonAllocationCallToContextNodeMap[Call] = Node; |
1866 | } |
1867 | |
1868 | uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
1869 | // In the Module (IR) case this is already the Id. |
1870 | return IdOrIndex; |
1871 | } |
1872 | |
1873 | uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
1874 | // In the Index case this is an index into the stack id list in the summary |
1875 | // index, convert it to an Id. |
1876 | return Index.getStackIdAtIndex(Index: IdOrIndex); |
1877 | } |
1878 | |
1879 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1880 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( |
1881 | CallTy Call, EdgeIter &EI, |
1882 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { |
1883 | auto Edge = *EI; |
1884 | const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
1885 | const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; |
1886 | // Will be populated in order of callee to caller if we find a chain of tail |
1887 | // calls between the profiled caller and callee. |
1888 | std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; |
if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1890 | FoundCalleeChain)) |
1891 | return false; |
1892 | |
1893 | // The usual case where the profiled callee matches that of the IR/summary. |
1894 | if (FoundCalleeChain.empty()) |
1895 | return true; |
1896 | |
1897 | auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { |
1898 | auto *CurEdge = Callee->findEdgeFromCaller(Caller); |
1899 | // If there is already an edge between these nodes, simply update it and |
1900 | // return. |
1901 | if (CurEdge) { |
1902 | CurEdge->ContextIds.insert(Edge->ContextIds.begin(), |
1903 | Edge->ContextIds.end()); |
1904 | CurEdge->AllocTypes |= Edge->AllocTypes; |
1905 | return; |
1906 | } |
1907 | // Otherwise, create a new edge and insert it into the caller and callee |
1908 | // lists. |
1909 | auto NewEdge = std::make_shared<ContextEdge>( |
1910 | Callee, Caller, Edge->AllocTypes, Edge->ContextIds); |
1911 | Callee->CallerEdges.push_back(NewEdge); |
1912 | if (Caller == Edge->Caller) { |
1913 | // If we are inserting the new edge into the current edge's caller, insert |
1914 | // the new edge before the current iterator position, and then increment |
1915 | // back to the current edge. |
1916 | EI = Caller->CalleeEdges.insert(EI, NewEdge); |
1917 | ++EI; |
1918 | assert(*EI == Edge && |
1919 | "Iterator position not restored after insert and increment" ); |
1920 | } else |
1921 | Caller->CalleeEdges.push_back(NewEdge); |
1922 | }; |
1923 | |
1924 | // Create new nodes for each found callee and connect in between the profiled |
1925 | // caller and callee. |
1926 | auto *CurCalleeNode = Edge->Callee; |
1927 | for (auto &[NewCall, Func] : FoundCalleeChain) { |
1928 | ContextNode *NewNode = nullptr; |
1929 | // First check if we have already synthesized a node for this tail call. |
1930 | if (TailCallToContextNodeMap.count(NewCall)) { |
1931 | NewNode = TailCallToContextNodeMap[NewCall]; |
1932 | NewNode->AllocTypes |= Edge->AllocTypes; |
1933 | } else { |
1934 | FuncToCallsWithMetadata[Func].push_back({NewCall}); |
1935 | // Create Node and record node info. |
1936 | NodeOwner.push_back( |
1937 | std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall)); |
1938 | NewNode = NodeOwner.back().get(); |
1939 | NodeToCallingFunc[NewNode] = Func; |
1940 | TailCallToContextNodeMap[NewCall] = NewNode; |
1941 | NewNode->AllocTypes = Edge->AllocTypes; |
1942 | } |
1943 | |
1944 | // Hook up node to its callee node |
1945 | AddEdge(NewNode, CurCalleeNode); |
1946 | |
1947 | CurCalleeNode = NewNode; |
1948 | } |
1949 | |
1950 | // Hook up edge's original caller to new callee node. |
1951 | AddEdge(Edge->Caller, CurCalleeNode); |
1952 | |
1953 | // Remove old edge |
1954 | Edge->Callee->eraseCallerEdge(Edge.get()); |
1955 | EI = Edge->Caller->CalleeEdges.erase(EI); |
1956 | |
1957 | // To simplify the increment of EI in the caller, subtract one from EI. |
1958 | // In the final AddEdge call we would have either added a new callee edge, |
1959 | // to Edge->Caller, or found an existing one. Either way we are guaranteed |
1960 | // that there is at least one callee edge. |
1961 | assert(!Edge->Caller->CalleeEdges.empty()); |
1962 | --EI; |
1963 | |
1964 | return true; |
1965 | } |
1966 | |
1967 | bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
1968 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
1969 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
1970 | bool &FoundMultipleCalleeChains) { |
1971 | // Stop recursive search if we have already explored the maximum specified |
1972 | // depth. |
1973 | if (Depth > TailCallSearchDepth) |
1974 | return false; |
1975 | |
1976 | auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { |
FoundCalleeChain.push_back({Callsite, F});
};

auto *CalleeFunc = dyn_cast<Function>(CurCallee);
if (!CalleeFunc) {
auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
assert(Alias);
CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1985 | assert(CalleeFunc); |
1986 | } |
1987 | |
1988 | // Look for tail calls in this function, and check if they either call the |
1989 | // profiled callee directly, or indirectly (via a recursive search). |
1990 | // Only succeed if there is a single unique tail call chain found between the |
1991 | // profiled caller and callee, otherwise we could perform incorrect cloning. |
1992 | bool FoundSingleCalleeChain = false; |
1993 | for (auto &BB : *CalleeFunc) { |
1994 | for (auto &I : BB) { |
auto *CB = dyn_cast<CallBase>(&I);
1996 | if (!CB || !CB->isTailCall()) |
1997 | continue; |
1998 | auto *CalledValue = CB->getCalledOperand(); |
1999 | auto *CalledFunction = CB->getCalledFunction(); |
2000 | if (CalledValue && !CalledFunction) { |
2001 | CalledValue = CalledValue->stripPointerCasts(); |
2002 | // Stripping pointer casts can reveal a called function. |
CalledFunction = dyn_cast<Function>(CalledValue);
}
// Check if this is an alias to a function. If so, get the
// called aliasee for the checks below.
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias");
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2011 | } |
2012 | if (!CalledFunction) |
2013 | continue; |
2014 | if (CalledFunction == ProfiledCallee) { |
2015 | if (FoundSingleCalleeChain) { |
2016 | FoundMultipleCalleeChains = true; |
2017 | return false; |
2018 | } |
2019 | FoundSingleCalleeChain = true; |
2020 | FoundProfiledCalleeCount++; |
2021 | FoundProfiledCalleeDepth += Depth; |
2022 | if (Depth > FoundProfiledCalleeMaxDepth) |
2023 | FoundProfiledCalleeMaxDepth = Depth; |
2024 | SaveCallsiteInfo(&I, CalleeFunc); |
2025 | } else if (findProfiledCalleeThroughTailCalls( |
ProfiledCallee, CalledFunction, Depth + 1,
2027 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2028 | // findProfiledCalleeThroughTailCalls should not have returned |
2029 | // true if FoundMultipleCalleeChains. |
2030 | assert(!FoundMultipleCalleeChains); |
2031 | if (FoundSingleCalleeChain) { |
2032 | FoundMultipleCalleeChains = true; |
2033 | return false; |
2034 | } |
2035 | FoundSingleCalleeChain = true; |
2036 | SaveCallsiteInfo(&I, CalleeFunc); |
2037 | } else if (FoundMultipleCalleeChains) |
2038 | return false; |
2039 | } |
2040 | } |
2041 | |
2042 | return FoundSingleCalleeChain; |
2043 | } |
2044 | |
2045 | bool ModuleCallsiteContextGraph::calleeMatchesFunc( |
2046 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
2047 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { |
auto *CB = dyn_cast<CallBase>(Call);
if (!CB->getCalledOperand())
return false;
auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
if (CalleeFunc == Func)
return true;
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2056 | if (Alias && Alias->getAliasee() == Func) |
2057 | return true; |
2058 | |
2059 | // Recursively search for the profiled callee through tail calls starting with |
2060 | // the actual Callee. The discovered tail call chain is saved in |
2061 | // FoundCalleeChain, and we will fixup the graph to include these callsites |
2062 | // after returning. |
2063 | // FIXME: We will currently redo the same recursive walk if we find the same |
2064 | // mismatched callee from another callsite. We can improve this with more |
2065 | // bookkeeping of the created chain of new nodes for each mismatch. |
2066 | unsigned Depth = 1; |
2067 | bool FoundMultipleCalleeChains = false; |
if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2069 | FoundCalleeChain, |
2070 | FoundMultipleCalleeChains)) { |
2071 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " |
2072 | << Func->getName() << " from " << CallerFunc->getName() |
2073 | << " that actually called " << CalleeVal->getName() |
2074 | << (FoundMultipleCalleeChains |
2075 | ? " (found multiple possible chains)" |
2076 | : "" ) |
2077 | << "\n" ); |
2078 | if (FoundMultipleCalleeChains) |
2079 | FoundProfiledCalleeNonUniquelyCount++; |
2080 | return false; |
2081 | } |
2082 | |
2083 | return true; |
2084 | } |
2085 | |
2086 | bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
2087 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
2088 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
2089 | bool &FoundMultipleCalleeChains) { |
2090 | // Stop recursive search if we have already explored the maximum specified |
2091 | // depth. |
2092 | if (Depth > TailCallSearchDepth) |
2093 | return false; |
2094 | |
2095 | auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { |
2096 | // Make a CallsiteInfo for each discovered callee, if one hasn't already |
2097 | // been synthesized. |
if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
!FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
// StackIds is empty (we don't have debug info available in the index for
// these callsites).
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
CallsiteInfo *NewCallsiteInfo =
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2107 | }; |
2108 | |
2109 | // Look for tail calls in this function, and check if they either call the |
2110 | // profiled callee directly, or indirectly (via a recursive search). |
2111 | // Only succeed if there is a single unique tail call chain found between the |
2112 | // profiled caller and callee, otherwise we could perform incorrect cloning. |
2113 | bool FoundSingleCalleeChain = false; |
2114 | for (auto &S : CurCallee.getSummaryList()) { |
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
!isPrevailing(CurCallee.getGUID(), S.get()))
continue;
auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
if (!FS)
continue;
auto FSVI = CurCallee;
auto *AS = dyn_cast<AliasSummary>(S.get());
2123 | if (AS) |
2124 | FSVI = AS->getAliaseeVI(); |
2125 | for (auto &CallEdge : FS->calls()) { |
2126 | if (!CallEdge.second.hasTailCall()) |
2127 | continue; |
2128 | if (CallEdge.first == ProfiledCallee) { |
2129 | if (FoundSingleCalleeChain) { |
2130 | FoundMultipleCalleeChains = true; |
2131 | return false; |
2132 | } |
2133 | FoundSingleCalleeChain = true; |
2134 | FoundProfiledCalleeCount++; |
2135 | FoundProfiledCalleeDepth += Depth; |
2136 | if (Depth > FoundProfiledCalleeMaxDepth) |
2137 | FoundProfiledCalleeMaxDepth = Depth; |
2138 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2139 | // Add FS to FSToVIMap in case it isn't already there. |
2140 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2141 | FSToVIMap[FS] = FSVI; |
2142 | } else if (findProfiledCalleeThroughTailCalls( |
ProfiledCallee, CallEdge.first, Depth + 1,
2144 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2145 | // findProfiledCalleeThroughTailCalls should not have returned |
2146 | // true if FoundMultipleCalleeChains. |
2147 | assert(!FoundMultipleCalleeChains); |
2148 | if (FoundSingleCalleeChain) { |
2149 | FoundMultipleCalleeChains = true; |
2150 | return false; |
2151 | } |
2152 | FoundSingleCalleeChain = true; |
2153 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2154 | // Add FS to FSToVIMap in case it isn't already there. |
2155 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2156 | FSToVIMap[FS] = FSVI; |
2157 | } else if (FoundMultipleCalleeChains) |
2158 | return false; |
2159 | } |
2160 | } |
2161 | |
2162 | return FoundSingleCalleeChain; |
2163 | } |
2164 | |
2165 | bool IndexCallsiteContextGraph::calleeMatchesFunc( |
2166 | IndexCall &Call, const FunctionSummary *Func, |
2167 | const FunctionSummary *CallerFunc, |
2168 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { |
2169 | ValueInfo Callee = |
dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2171 | // If there is no summary list then this is a call to an externally defined |
2172 | // symbol. |
2173 | AliasSummary *Alias = |
2174 | Callee.getSummaryList().empty() |
2175 | ? nullptr |
: dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2177 | assert(FSToVIMap.count(Func)); |
2178 | auto FuncVI = FSToVIMap[Func]; |
2179 | if (Callee == FuncVI || |
2180 | // If callee is an alias, check the aliasee, since only function |
2181 | // summary base objects will contain the stack node summaries and thus |
2182 | // get a context node. |
2183 | (Alias && Alias->getAliaseeVI() == FuncVI)) |
2184 | return true; |
2185 | |
2186 | // Recursively search for the profiled callee through tail calls starting with |
2187 | // the actual Callee. The discovered tail call chain is saved in |
2188 | // FoundCalleeChain, and we will fixup the graph to include these callsites |
2189 | // after returning. |
2190 | // FIXME: We will currently redo the same recursive walk if we find the same |
2191 | // mismatched callee from another callsite. We can improve this with more |
2192 | // bookkeeping of the created chain of new nodes for each mismatch. |
2193 | unsigned Depth = 1; |
2194 | bool FoundMultipleCalleeChains = false; |
2195 | if (!findProfiledCalleeThroughTailCalls( |
FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2197 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI |
2198 | << " from " << FSToVIMap[CallerFunc] |
2199 | << " that actually called " << Callee |
2200 | << (FoundMultipleCalleeChains |
2201 | ? " (found multiple possible chains)" |
2202 | : "" ) |
2203 | << "\n" ); |
2204 | if (FoundMultipleCalleeChains) |
2205 | FoundProfiledCalleeNonUniquelyCount++; |
2206 | return false; |
2207 | } |
2208 | |
2209 | return true; |
2210 | } |
2211 | |
2212 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2213 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() |
2214 | const { |
print(dbgs());
2216 | dbgs() << "\n" ; |
2217 | } |
2218 | |
2219 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2220 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( |
2221 | raw_ostream &OS) const { |
2222 | OS << "Node " << this << "\n" ; |
2223 | OS << "\t" ; |
2224 | printCall(OS); |
2225 | if (Recursive) |
2226 | OS << " (recursive)" ; |
2227 | OS << "\n" ; |
2228 | OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n" ; |
2229 | OS << "\tContextIds:" ; |
2230 | // Make a copy of the computed context ids that we can sort for stability. |
2231 | auto ContextIds = getContextIds(); |
2232 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
std::sort(SortedIds.begin(), SortedIds.end());
2234 | for (auto Id : SortedIds) |
2235 | OS << " " << Id; |
2236 | OS << "\n" ; |
2237 | OS << "\tCalleeEdges:\n" ; |
2238 | for (auto &Edge : CalleeEdges) |
2239 | OS << "\t\t" << *Edge << "\n" ; |
2240 | OS << "\tCallerEdges:\n" ; |
2241 | for (auto &Edge : CallerEdges) |
2242 | OS << "\t\t" << *Edge << "\n" ; |
2243 | if (!Clones.empty()) { |
2244 | OS << "\tClones: " ; |
2245 | FieldSeparator FS; |
2246 | for (auto *Clone : Clones) |
2247 | OS << FS << Clone; |
2248 | OS << "\n" ; |
2249 | } else if (CloneOf) { |
2250 | OS << "\tClone of " << CloneOf << "\n" ; |
2251 | } |
2252 | } |
2253 | |
2254 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2255 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() |
2256 | const { |
print(dbgs());
2258 | dbgs() << "\n" ; |
2259 | } |
2260 | |
2261 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2262 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( |
2263 | raw_ostream &OS) const { |
2264 | OS << "Edge from Callee " << Callee << " to Caller: " << Caller |
2265 | << " AllocTypes: " << getAllocTypeString(AllocTypes); |
2266 | OS << " ContextIds:" ; |
2267 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
std::sort(SortedIds.begin(), SortedIds.end());
2269 | for (auto Id : SortedIds) |
2270 | OS << " " << Id; |
2271 | } |
2272 | |
2273 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2274 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { |
print(dbgs());
2276 | } |
2277 | |
2278 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2279 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( |
2280 | raw_ostream &OS) const { |
2281 | OS << "Callsite Context Graph:\n" ; |
2282 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2283 | for (const auto Node : nodes<GraphType>(this)) { |
2284 | if (Node->isRemoved()) |
2285 | continue; |
2286 | Node->print(OS); |
2287 | OS << "\n" ; |
2288 | } |
2289 | } |
2290 | |
2291 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2292 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes( |
2293 | raw_ostream &OS) const { |
2294 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2295 | for (const auto Node : nodes<GraphType>(this)) { |
2296 | if (Node->isRemoved()) |
2297 | continue; |
2298 | if (!Node->IsAllocation) |
2299 | continue; |
2300 | DenseSet<uint32_t> ContextIds = Node->getContextIds(); |
2301 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds) {
auto SizeI = ContextIdToTotalSize.find(Id);
assert(SizeI != ContextIdToTotalSize.end());
auto TypeI = ContextIdToAllocationType.find(Id);
assert(TypeI != ContextIdToAllocationType.end());
OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
2309 | << " with total size " << SizeI->second << " is " |
2310 | << getAllocTypeString(Node->AllocTypes) << " after cloning\n" ; |
2311 | } |
2312 | } |
2313 | } |
2314 | |
2315 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2316 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { |
2317 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2318 | for (const auto Node : nodes<GraphType>(this)) { |
2319 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2320 | for (auto &Edge : Node->CallerEdges) |
2321 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
2322 | } |
2323 | } |
2324 | |
2325 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2326 | struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { |
2327 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2328 | using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; |
2329 | |
2330 | using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; |
2331 | static NodeRef getNode(const NodePtrTy &P) { return P.get(); } |
2332 | |
2333 | using nodes_iterator = |
2334 | mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, |
2335 | decltype(&getNode)>; |
2336 | |
2337 | static nodes_iterator nodes_begin(GraphType G) { |
2338 | return nodes_iterator(G->NodeOwner.begin(), &getNode); |
2339 | } |
2340 | |
2341 | static nodes_iterator nodes_end(GraphType G) { |
2342 | return nodes_iterator(G->NodeOwner.end(), &getNode); |
2343 | } |
2344 | |
2345 | static NodeRef getEntryNode(GraphType G) { |
2346 | return G->NodeOwner.begin()->get(); |
2347 | } |
2348 | |
2349 | using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; |
2350 | static const ContextNode<DerivedCCG, FuncTy, CallTy> * |
2351 | GetCallee(const EdgePtrTy &P) { |
2352 | return P->Callee; |
2353 | } |
2354 | |
2355 | using ChildIteratorType = |
2356 | mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< |
2357 | DerivedCCG, FuncTy, CallTy>>>::const_iterator, |
2358 | decltype(&GetCallee)>; |
2359 | |
2360 | static ChildIteratorType child_begin(NodeRef N) { |
2361 | return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); |
2362 | } |
2363 | |
2364 | static ChildIteratorType child_end(NodeRef N) { |
2365 | return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); |
2366 | } |
2367 | }; |
2368 | |
2369 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2370 | struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> |
2371 | : public DefaultDOTGraphTraits { |
2372 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} |
2373 | |
2374 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2375 | using GTraits = GraphTraits<GraphType>; |
2376 | using NodeRef = typename GTraits::NodeRef; |
2377 | using ChildIteratorType = typename GTraits::ChildIteratorType; |
2378 | |
2379 | static std::string getNodeLabel(NodeRef Node, GraphType G) { |
2380 | std::string LabelString = |
2381 | (Twine("OrigId: " ) + (Node->IsAllocation ? "Alloc" : "" ) + |
2382 | Twine(Node->OrigStackOrAllocId)) |
2383 | .str(); |
2384 | LabelString += "\n" ; |
2385 | if (Node->hasCall()) { |
2386 | auto Func = G->NodeToCallingFunc.find(Node); |
2387 | assert(Func != G->NodeToCallingFunc.end()); |
2388 | LabelString += |
2389 | G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); |
2390 | } else { |
2391 | LabelString += "null call" ; |
2392 | if (Node->Recursive) |
2393 | LabelString += " (recursive)" ; |
2394 | else |
2395 | LabelString += " (external)" ; |
2396 | } |
2397 | return LabelString; |
2398 | } |
2399 | |
2400 | static std::string getNodeAttributes(NodeRef Node, GraphType) { |
std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
getContextIds(Node->getContextIds()) + "\"")
.str();
AttributeString +=
(Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2406 | AttributeString += ",style=\"filled\"" ; |
2407 | if (Node->CloneOf) { |
2408 | AttributeString += ",color=\"blue\"" ; |
2409 | AttributeString += ",style=\"filled,bold,dashed\"" ; |
2410 | } else |
2411 | AttributeString += ",style=\"filled\"" ; |
2412 | return AttributeString; |
2413 | } |
2414 | |
2415 | static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, |
2416 | GraphType) { |
2417 | auto &Edge = *(ChildIter.getCurrent()); |
return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2420 | .str(); |
2421 | } |
2422 | |
// Since the NodeOwner list includes nodes that are no longer connected to
2424 | // the graph, skip them here. |
2425 | static bool isNodeHidden(NodeRef Node, GraphType) { |
2426 | return Node->isRemoved(); |
2427 | } |
2428 | |
2429 | private: |
2430 | static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { |
2431 | std::string IdString = "ContextIds:" ; |
2432 | if (ContextIds.size() < 100) { |
2433 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
std::sort(SortedIds.begin(), SortedIds.end());
2435 | for (auto Id : SortedIds) |
2436 | IdString += (" " + Twine(Id)).str(); |
2437 | } else { |
2438 | IdString += (" (" + Twine(ContextIds.size()) + " ids)" ).str(); |
2439 | } |
2440 | return IdString; |
2441 | } |
2442 | |
2443 | static std::string getColor(uint8_t AllocTypes) { |
2444 | if (AllocTypes == (uint8_t)AllocationType::NotCold) |
2445 | // Color "brown1" actually looks like a lighter red. |
2446 | return "brown1" ; |
2447 | if (AllocTypes == (uint8_t)AllocationType::Cold) |
2448 | return "cyan" ; |
2449 | if (AllocTypes == |
2450 | ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
2451 | // Lighter purple. |
2452 | return "mediumorchid1" ; |
2453 | return "gray" ; |
2454 | } |
2455 | |
2456 | static std::string getNodeId(NodeRef Node) { |
2457 | std::stringstream SStream; |
2458 | SStream << std::hex << "N0x" << (unsigned long long)Node; |
2459 | std::string Result = SStream.str(); |
2460 | return Result; |
2461 | } |
2462 | }; |
2463 | |
2464 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2465 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( |
2466 | std::string Label) const { |
2467 | WriteGraph(this, "" , false, Label, |
2468 | DotFilePathPrefix + "ccg." + Label + ".dot" ); |
2469 | } |
2470 | |
2471 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2472 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
2473 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( |
2474 | const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI, |
2475 | DenseSet<uint32_t> ContextIdsToMove) { |
2476 | ContextNode *Node = Edge->Callee; |
2477 | NodeOwner.push_back( |
2478 | std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); |
2479 | ContextNode *Clone = NodeOwner.back().get(); |
2480 | Node->addClone(Clone); |
2481 | assert(NodeToCallingFunc.count(Node)); |
2482 | NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; |
moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2484 | ContextIdsToMove); |
2485 | return Clone; |
2486 | } |
2487 | |
2488 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2489 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
2490 | moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
2491 | ContextNode *NewCallee, EdgeIter *CallerEdgeI, |
2492 | bool NewClone, |
2493 | DenseSet<uint32_t> ContextIdsToMove) { |
2494 | // NewCallee and Edge's current callee must be clones of the same original |
2495 | // node (Edge's current callee may be the original node too). |
2496 | assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); |
2497 | |
2498 | ContextNode *OldCallee = Edge->Callee; |
2499 | |
2500 | // We might already have an edge to the new callee from earlier cloning for a |
2501 | // different allocation. If one exists we will reuse it. |
2502 | auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); |
2503 | |
2504 | // Callers will pass an empty ContextIdsToMove set when they want to move the |
2505 | // edge. Copy in Edge's ids for simplicity. |
2506 | if (ContextIdsToMove.empty()) |
2507 | ContextIdsToMove = Edge->getContextIds(); |
2508 | |
2509 | // If we are moving all of Edge's ids, then just move the whole Edge. |
2510 | // Otherwise only move the specified subset, to a new edge if needed. |
2511 | if (Edge->getContextIds().size() == ContextIdsToMove.size()) { |
2512 | // Moving the whole Edge. |
2513 | if (CallerEdgeI) |
2514 | *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); |
2515 | else |
2516 | OldCallee->eraseCallerEdge(Edge.get()); |
2517 | if (ExistingEdgeToNewCallee) { |
2518 | // Since we already have an edge to NewCallee, simply move the ids |
2519 | // onto it, and remove the existing Edge. |
2520 | ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), |
2521 | ContextIdsToMove.end()); |
2522 | ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; |
2523 | assert(Edge->ContextIds == ContextIdsToMove); |
2524 | Edge->ContextIds.clear(); |
2525 | Edge->AllocTypes = (uint8_t)AllocationType::None; |
2526 | Edge->Caller->eraseCalleeEdge(Edge.get()); |
2527 | } else { |
2528 | // Otherwise just reconnect Edge to NewCallee. |
2529 | Edge->Callee = NewCallee; |
2530 | NewCallee->CallerEdges.push_back(Edge); |
2531 | // Don't need to update Edge's context ids since we are simply |
2532 | // reconnecting it. |
2533 | } |
// In either case, need to update the alloc types on NewCallee.
2535 | NewCallee->AllocTypes |= Edge->AllocTypes; |
2536 | } else { |
2537 | // Only moving a subset of Edge's ids. |
2538 | if (CallerEdgeI) |
2539 | ++CallerEdgeI; |
2540 | // Compute the alloc type of the subset of ids being moved. |
2541 | auto CallerEdgeAllocType = computeAllocType(ContextIds&: ContextIdsToMove); |
2542 | if (ExistingEdgeToNewCallee) { |
2543 | // Since we already have an edge to NewCallee, simply move the ids |
2544 | // onto it. |
2545 | ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), |
2546 | ContextIdsToMove.end()); |
2547 | ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; |
2548 | } else { |
2549 | // Otherwise, create a new edge to NewCallee for the ids being moved. |
2550 | auto NewEdge = std::make_shared<ContextEdge>( |
2551 | NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); |
2552 | Edge->Caller->CalleeEdges.push_back(NewEdge); |
2553 | NewCallee->CallerEdges.push_back(NewEdge); |
2554 | } |
2555 | // In either case, need to update the alloc types on NewCallee, and remove |
2556 | // those ids and update the alloc type on the original Edge. |
2557 | NewCallee->AllocTypes |= CallerEdgeAllocType; |
2558 | set_subtract(Edge->ContextIds, ContextIdsToMove); |
2559 | Edge->AllocTypes = computeAllocType(ContextIds&: Edge->ContextIds); |
2560 | } |
2561 | // Now walk the old callee node's callee edges and move Edge's context ids |
2562 | // over to the corresponding edge into the clone (which is created here if |
2563 | // this is a newly created clone). |
2564 | for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { |
2565 | // The context ids moving to the new callee are the subset of this edge's |
2566 | // context ids and the context ids on the caller edge being moved. |
2567 | DenseSet<uint32_t> EdgeContextIdsToMove = |
2568 | set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); |
2569 | set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); |
2570 | OldCalleeEdge->AllocTypes = |
2571 | computeAllocType(ContextIds&: OldCalleeEdge->getContextIds()); |
2572 | if (!NewClone) { |
2573 | // Update context ids / alloc type on corresponding edge to NewCallee. |
2574 | // There is a chance this may not exist if we are reusing an existing |
2575 | // clone, specifically during function assignment, where we would have |
2576 | // removed none type edges after creating the clone. If we can't find |
2577 | // a corresponding edge there, fall through to the cloning below. |
2578 | if (auto *NewCalleeEdge = |
2579 | NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) { |
2580 | NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(), |
2581 | EdgeContextIdsToMove.end()); |
NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2583 | continue; |
2584 | } |
2585 | } |
2586 | auto NewEdge = std::make_shared<ContextEdge>( |
2587 | OldCalleeEdge->Callee, NewCallee, |
2588 | computeAllocType(ContextIds&: EdgeContextIdsToMove), EdgeContextIdsToMove); |
2589 | NewCallee->CalleeEdges.push_back(NewEdge); |
2590 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
2591 | } |
2592 | // Recompute the node alloc type now that its callee edges have been |
2593 | // updated (since we will compute from those edges). |
2594 | OldCallee->AllocTypes = OldCallee->computeAllocType(); |
2595 | // OldCallee alloc type should be None iff its context id set is now empty. |
2596 | assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == |
2597 | OldCallee->emptyContextIds()); |
2598 | if (VerifyCCG) { |
2599 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); |
2600 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); |
2601 | for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) |
2602 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee, |
2603 | /*CheckEdges=*/false); |
2604 | for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) |
2605 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee, |
2606 | /*CheckEdges=*/false); |
2607 | } |
2608 | } |
2609 | |
2610 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2611 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
2612 | recursivelyRemoveNoneTypeCalleeEdges( |
2613 | ContextNode *Node, DenseSet<const ContextNode *> &Visited) { |
2614 | auto Inserted = Visited.insert(Node); |
2615 | if (!Inserted.second) |
2616 | return; |
2617 | |
2618 | removeNoneTypeCalleeEdges(Node); |
2619 | |
2620 | for (auto *Clone : Node->Clones) |
recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2622 | |
2623 | // The recursive call may remove some of this Node's caller edges. |
2624 | // Iterate over a copy and skip any that were removed. |
2625 | auto CallerEdges = Node->CallerEdges; |
2626 | for (auto &Edge : CallerEdges) { |
2627 | // Skip any that have been removed by an earlier recursive call. |
2628 | if (Edge->Callee == nullptr && Edge->Caller == nullptr) { |
2629 | assert(!is_contained(Node->CallerEdges, Edge)); |
2630 | continue; |
2631 | } |
recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2633 | } |
2634 | } |
2635 | |
2636 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2637 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { |
2638 | DenseSet<const ContextNode *> Visited; |
2639 | for (auto &Entry : AllocationCallToContextNodeMap) { |
2640 | Visited.clear(); |
2641 | identifyClones(Entry.second, Visited, Entry.second->getContextIds()); |
2642 | } |
2643 | Visited.clear(); |
2644 | for (auto &Entry : AllocationCallToContextNodeMap) |
recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2646 | if (VerifyCCG) |
2647 | check(); |
2648 | } |
2649 | |
// Helper function to check whether an AllocType is cold, notcold, or both.
2651 | bool checkColdOrNotCold(uint8_t AllocType) { |
2652 | return (AllocType == (uint8_t)AllocationType::Cold) || |
2653 | (AllocType == (uint8_t)AllocationType::NotCold) || |
2654 | (AllocType == |
2655 | ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold)); |
2656 | } |
2657 | |
2658 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2659 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( |
2660 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
2661 | const DenseSet<uint32_t> &AllocContextIds) { |
2662 | if (VerifyNodes) |
2663 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2664 | assert(!Node->CloneOf); |
2665 | |
// If Node has a null call, then either it wasn't found in the module (regular
2667 | // LTO) or summary index (ThinLTO), or there were other conditions blocking |
2668 | // cloning (e.g. recursion, calls multiple targets, etc). |
2669 | // Do this here so that we don't try to recursively clone callers below, which |
2670 | // isn't useful at least for this node. |
2671 | if (!Node->hasCall()) |
2672 | return; |
2673 | |
2674 | #ifndef NDEBUG |
2675 | auto Insert = |
2676 | #endif |
2677 | Visited.insert(Node); |
2678 | // We should not have visited this node yet. |
2679 | assert(Insert.second); |
2680 | // The recursive call to identifyClones may delete the current edge from the |
2681 | // CallerEdges vector. Make a copy and iterate on that, simpler than passing |
2682 | // in an iterator and having recursive call erase from it. Other edges may |
2683 | // also get removed during the recursion, which will have null Callee and |
2684 | // Caller pointers (and are deleted later), so we skip those below. |
2685 | { |
2686 | auto CallerEdges = Node->CallerEdges; |
2687 | for (auto &Edge : CallerEdges) { |
2688 | // Skip any that have been removed by an earlier recursive call. |
2689 | if (Edge->Callee == nullptr && Edge->Caller == nullptr) { |
2690 | assert(!llvm::count(Node->CallerEdges, Edge)); |
2691 | continue; |
2692 | } |
2693 | // Ignore any caller we previously visited via another edge. |
2694 | if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { |
2695 | identifyClones(Edge->Caller, Visited, AllocContextIds); |
2696 | } |
2697 | } |
2698 | } |
2699 | |
// Check if we reached an unambiguous call or have only a single caller.
2701 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
2702 | return; |
2703 | |
2704 | // We need to clone. |
2705 | |
2706 | // Try to keep the original version as alloc type NotCold. This will make |
2707 | // cases with indirect calls or any other situation with an unknown call to |
2708 | // the original function get the default behavior. We do this by sorting the |
2709 | // CallerEdges of the Node we will clone by alloc type. |
2710 | // |
2711 | // Give NotCold edge the lowest sort priority so those edges are at the end of |
2712 | // the caller edges vector, and stay on the original version (since the below |
2713 | // code clones greedily until it finds all remaining edges have the same type |
2714 | // and leaves the remaining ones on the original Node). |
2715 | // |
2716 | // We shouldn't actually have any None type edges, so the sorting priority for |
2717 | // that is arbitrary, and we assert in that case below. |
2718 | const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, |
2719 | /*Cold*/ 1, |
2720 | /*NotColdCold*/ 2}; |
2721 | std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(), |
2722 | [&](const std::shared_ptr<ContextEdge> &A, |
2723 | const std::shared_ptr<ContextEdge> &B) { |
2724 | // Nodes with non-empty context ids should be sorted before |
2725 | // those with empty context ids. |
2726 | if (A->ContextIds.empty()) |
2727 | // Either B ContextIds are non-empty (in which case we |
2728 | // should return false because B < A), or B ContextIds |
2729 | // are empty, in which case they are equal, and we should |
2730 | // maintain the original relative ordering. |
2731 | return false; |
2732 | if (B->ContextIds.empty()) |
2733 | return true; |
2734 | |
2735 | if (A->AllocTypes == B->AllocTypes) |
2736 | // Use the first context id for each edge as a |
2737 | // tie-breaker. |
2738 | return *A->ContextIds.begin() < *B->ContextIds.begin(); |
2739 | return AllocTypeCloningPriority[A->AllocTypes] < |
2740 | AllocTypeCloningPriority[B->AllocTypes]; |
2741 | }); |
2742 | |
2743 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2744 | |
2745 | // Iterate until we find no more opportunities for disambiguating the alloc |
2746 | // types via cloning. In most cases this loop will terminate once the Node |
2747 | // has a single allocation type, in which case no more cloning is needed. |
2748 | // We need to be able to remove Edge from CallerEdges, so need to adjust |
2749 | // iterator inside the loop. |
2750 | for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) { |
2751 | auto CallerEdge = *EI; |
2752 | |
2753 | // See if cloning the prior caller edge left this node with a single alloc |
2754 | // type or a single caller. In that case no more cloning of Node is needed. |
2755 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
2756 | break; |
2757 | |
2758 | // Only need to process the ids along this edge pertaining to the given |
2759 | // allocation. |
2760 | auto CallerEdgeContextsForAlloc = |
2761 | set_intersection(CallerEdge->getContextIds(), AllocContextIds); |
2762 | if (CallerEdgeContextsForAlloc.empty()) { |
2763 | ++EI; |
2764 | continue; |
2765 | } |
auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2767 | |
2768 | // Compute the node callee edge alloc types corresponding to the context ids |
2769 | // for this caller edge. |
2770 | std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; |
CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2772 | for (auto &CalleeEdge : Node->CalleeEdges) |
CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2775 | |
2776 | // Don't clone if doing so will not disambiguate any alloc types amongst |
2777 | // caller edges (including the callee edges that would be cloned). |
2778 | // Otherwise we will simply move all edges to the clone. |
2779 | // |
2780 | // First check if by cloning we will disambiguate the caller allocation |
2781 | // type from node's allocation type. Query allocTypeToUse so that we don't |
2782 | // bother cloning to distinguish NotCold+Cold from NotCold. Note that |
2783 | // neither of these should be None type. |
2784 | // |
2785 | // Then check if by cloning node at least one of the callee edges will be |
2786 | // disambiguated by splitting out different context ids. |
2787 | assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); |
2788 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2789 | if (allocTypeToUse(CallerAllocTypeForAlloc) == |
2790 | allocTypeToUse(Node->AllocTypes) && |
2791 | allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
2792 | CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { |
2793 | ++EI; |
2794 | continue; |
2795 | } |
2796 | |
2797 | // First see if we can use an existing clone. Check each clone and its |
2798 | // callee edges for matching alloc types. |
2799 | ContextNode *Clone = nullptr; |
2800 | for (auto *CurClone : Node->Clones) { |
2801 | if (allocTypeToUse(CurClone->AllocTypes) != |
2802 | allocTypeToUse(CallerAllocTypeForAlloc)) |
2803 | continue; |
2804 | |
2805 | if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
2806 | CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) |
2807 | continue; |
2808 | Clone = CurClone; |
2809 | break; |
2810 | } |
2811 | |
2812 | // The edge iterator is adjusted when we move the CallerEdge to the clone. |
2813 | if (Clone) |
moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
CallerEdgeContextsForAlloc);
2816 | else |
Clone =
moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2819 | |
2820 | assert(EI == Node->CallerEdges.end() || |
2821 | Node->AllocTypes != (uint8_t)AllocationType::None); |
2822 | // Sanity check that no alloc types on clone or its edges are None. |
2823 | assert(Clone->AllocTypes != (uint8_t)AllocationType::None); |
2824 | } |
2825 | |
2826 | // We should still have some context ids on the original Node. |
2827 | assert(!Node->emptyContextIds()); |
2828 | |
2829 | // Sanity check that no alloc types on node or edges are None. |
2830 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2831 | |
2832 | if (VerifyNodes) |
2833 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2834 | } |
2835 | |
2836 | void ModuleCallsiteContextGraph::updateAllocationCall( |
2837 | CallInfo &Call, AllocationType AllocType) { |
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
"memprof", AllocTypeString);
cast<CallBase>(Call.call())->addFnAttr(A);
OREGetter(Call.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
<< ore::NV("AllocationCall", Call.call()) << " in clone "
<< ore::NV("Caller", Call.call()->getFunction())
<< " marked with memprof allocation attribute "
<< ore::NV("Attribute", AllocTypeString));
2848 | } |
2849 | |
2850 | void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, |
2851 | AllocationType AllocType) { |
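// For ThinLTO, record the allocation type decision in the summary's Versions
// entry for this clone; the attribute is applied later in the ThinLTO
// backend.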
2852 | auto *AI = Call.call().dyn_cast<AllocInfo *>(); |
2853 | assert(AI); |
2854 | assert(AI->Versions.size() > Call.cloneNo()); |
2855 | AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; |
2856 | } |
2857 | |
2858 | void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
2859 | FuncInfo CalleeFunc) { |
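// Redirect the IR call to the selected function clone (regular LTO) and emit
// an optimization remark.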
2860 | if (CalleeFunc.cloneNo() > 0) |
2861 | cast<CallBase>(Val: CallerCall.call())->setCalledFunction(CalleeFunc.func()); |
2862 | OREGetter(CallerCall.call()->getFunction()) |
2863 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CallerCall.call()) |
2864 | << ore::NV("Call" , CallerCall.call()) << " in clone " |
2865 | << ore::NV("Caller" , CallerCall.call()->getFunction()) |
2866 | << " assigned to call function clone " |
2867 | << ore::NV("Callee" , CalleeFunc.func())); |
2868 | } |
2869 | |
2870 | void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
2871 | FuncInfo CalleeFunc) { |
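// For ThinLTO, record which callee function clone this callsite clone should
// call in the summary's Clones array.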
2872 | auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); |
2873 | assert(CI && |
2874 | "Caller cannot be an allocation which should not have profiled calls" ); |
2875 | assert(CI->Clones.size() > CallerCall.cloneNo()); |
2876 | CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); |
2877 | } |
2878 | |
2879 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
2880 | Instruction *>::FuncInfo |
2881 | ModuleCallsiteContextGraph::cloneFunctionForCallsite( |
2882 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
2883 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
2884 | // Use existing LLVM facilities for cloning and obtaining Call in clone |
2885 | ValueToValueMapTy VMap; |
auto *NewFunc = CloneFunction(Func.func(), VMap);
std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2888 | assert(!Func.func()->getParent()->getFunction(Name)); |
2889 | NewFunc->setName(Name); |
2890 | for (auto &Inst : CallsWithMetadataInFunc) { |
2891 | // This map always has the initial version in it. |
2892 | assert(Inst.cloneNo() == 0); |
2893 | CallMap[Inst] = {cast<Instruction>(Val&: VMap[Inst.call()]), CloneNo}; |
2894 | } |
2895 | OREGetter(Func.func()) |
2896 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofClone" , Func.func()) |
2897 | << "created clone " << ore::NV("NewFunction" , NewFunc)); |
2898 | return {NewFunc, CloneNo}; |
2899 | } |
2900 | |
2901 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
2902 | IndexCall>::FuncInfo |
2903 | IndexCallsiteContextGraph::cloneFunctionForCallsite( |
2904 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
2905 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
2906 | // Check how many clones we have of Call (and therefore function). |
2907 | // The next clone number is the current size of versions array. |
2908 | // Confirm this matches the CloneNo provided by the caller, which is based on |
2909 | // the number of function clones we have. |
2910 | assert(CloneNo == |
2911 | (Call.call().is<AllocInfo *>() |
2912 | ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() |
2913 | : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); |
2914 | // Walk all the instructions in this function. Create a new version for |
2915 | // each (by adding an entry to the Versions/Clones summary array), and copy |
2916 | // over the version being called for the function clone being cloned here. |
2917 | // Additionally, add an entry to the CallMap for the new function clone, |
2918 | // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) |
2919 | // to the new call clone. |
2920 | for (auto &Inst : CallsWithMetadataInFunc) { |
2921 | // This map always has the initial version in it. |
2922 | assert(Inst.cloneNo() == 0); |
2923 | if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { |
2924 | assert(AI->Versions.size() == CloneNo); |
2925 | // We assign the allocation type later (in updateAllocationCall), just add |
2926 | // an entry for it here. |
AI->Versions.push_back(0);
2928 | } else { |
2929 | auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); |
2930 | assert(CI && CI->Clones.size() == CloneNo); |
2931 | // We assign the clone number later (in updateCall), just add an entry for |
2932 | // it here. |
CI->Clones.push_back(0);
2934 | } |
2935 | CallMap[Inst] = {Inst.call(), CloneNo}; |
2936 | } |
2937 | return {Func.func(), CloneNo}; |
2938 | } |
2939 | |
2940 | // This method assigns cloned callsites to functions, cloning the functions as |
2941 | // needed. The assignment is greedy and proceeds roughly as follows: |
2942 | // |
2943 | // For each function Func: |
2944 | // For each call with graph Node having clones: |
2945 | // Initialize ClonesWorklist to Node and its clones |
2946 | // Initialize NodeCloneCount to 0 |
2947 | // While ClonesWorklist is not empty: |
2948 | // Clone = pop front ClonesWorklist |
2949 | // NodeCloneCount++ |
2950 | // If Func has been cloned less than NodeCloneCount times: |
2951 | // If NodeCloneCount is 1: |
2952 | // Assign Clone to original Func |
2953 | // Continue |
2954 | // Create a new function clone |
2955 | // If other callers not assigned to call a function clone yet: |
2956 | // Assign them to call new function clone |
2957 | // Continue |
2958 | // Assign any other caller calling the cloned version to new clone |
2959 | // |
2960 | // For each caller of Clone: |
2961 | // If caller is assigned to call a specific function clone: |
2962 | // If we cannot assign Clone to that function clone: |
2963 | // Create new callsite Clone NewClone |
2964 | // Add NewClone to ClonesWorklist |
2965 | // Continue |
2966 | // Assign Clone to existing caller's called function clone |
2967 | // Else: |
2968 | // If Clone not already assigned to a function clone: |
2969 | // Assign to first function clone without assignment |
2970 | // Assign caller to selected function clone |
2971 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2972 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { |
2973 | bool Changed = false; |
2974 | |
2975 | // Keep track of the assignment of nodes (callsites) to function clones they |
2976 | // call. |
2977 | DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; |
2978 | |
2979 | // Update caller node to call function version CalleeFunc, by recording the |
2980 | // assignment in CallsiteToCalleeFuncCloneMap. |
2981 | auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, |
2982 | const FuncInfo &CalleeFunc) { |
2983 | assert(Caller->hasCall()); |
2984 | CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; |
2985 | }; |
2986 | |
// Walk all functions for which we saw calls with memprof metadata, and
// handle cloning for each such call.
2989 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
2990 | FuncInfo OrigFunc(Func); |
2991 | // Map from each clone of OrigFunc to a map of remappings of each call of |
2992 | // interest (from original uncloned call to the corresponding cloned call in |
2993 | // that function clone). |
2994 | std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; |
2995 | for (auto &Call : CallsWithMetadata) { |
ContextNode *Node = getNodeForInst(Call);
2997 | // Skip call if we do not have a node for it (all uses of its stack ids |
2998 | // were either on inlined chains or pruned from the MIBs), or if we did |
2999 | // not create any clones for it. |
3000 | if (!Node || Node->Clones.empty()) |
3001 | continue; |
3002 | assert(Node->hasCall() && |
3003 | "Not having a call should have prevented cloning" ); |
3004 | |
3005 | // Track the assignment of function clones to clones of the current |
3006 | // callsite Node being handled. |
3007 | std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; |
3008 | |
3009 | // Assign callsite version CallsiteClone to function version FuncClone, |
3010 | // and also assign (possibly cloned) Call to CallsiteClone. |
3011 | auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, |
3012 | CallInfo &Call, |
3013 | ContextNode *CallsiteClone, |
3014 | bool IsAlloc) { |
3015 | // Record the clone of callsite node assigned to this function clone. |
3016 | FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; |
3017 | |
3018 | assert(FuncClonesToCallMap.count(FuncClone)); |
3019 | std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; |
3020 | CallInfo CallClone(Call); |
3021 | if (CallMap.count(Call)) |
3022 | CallClone = CallMap[Call]; |
3023 | CallsiteClone->setCall(CallClone); |
3024 | }; |
3025 | |
3026 | // Keep track of the clones of callsite Node that need to be assigned to |
3027 | // function clones. This list may be expanded in the loop body below if we |
3028 | // find additional cloning is required. |
3029 | std::deque<ContextNode *> ClonesWorklist; |
3030 | // Ignore original Node if we moved all of its contexts to clones. |
3031 | if (!Node->emptyContextIds()) |
3032 | ClonesWorklist.push_back(Node); |
3033 | ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), |
3034 | Node->Clones.end()); |
3035 | |
3036 | // Now walk through all of the clones of this callsite Node that we need, |
3037 | // and determine the assignment to a corresponding clone of the current |
3038 | // function (creating new function clones as needed). |
3039 | unsigned NodeCloneCount = 0; |
3040 | while (!ClonesWorklist.empty()) { |
3041 | ContextNode *Clone = ClonesWorklist.front(); |
3042 | ClonesWorklist.pop_front(); |
3043 | NodeCloneCount++; |
3044 | if (VerifyNodes) |
3045 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
3046 | |
3047 | // Need to create a new function clone if we have more callsite clones |
3048 | // than existing function clones, which would have been assigned to an |
3049 | // earlier clone in the list (we assign callsite clones to function |
3050 | // clones greedily). |
3051 | if (FuncClonesToCallMap.size() < NodeCloneCount) { |
3052 | // If this is the first callsite copy, assign to original function. |
3053 | if (NodeCloneCount == 1) { |
3054 | // Since FuncClonesToCallMap is empty in this case, no clones have |
3055 | // been created for this function yet, and no callers should have |
3056 | // been assigned a function clone for this callee node yet. |
3057 | assert(llvm::none_of( |
3058 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
3059 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
3060 | })); |
3061 | // Initialize with empty call map, assign Clone to original function |
3062 | // and its callers, and skip to the next clone. |
3063 | FuncClonesToCallMap[OrigFunc] = {}; |
3064 | AssignCallsiteCloneToFuncClone( |
3065 | OrigFunc, Call, Clone, |
3066 | AllocationCallToContextNodeMap.count(Call)); |
3067 | for (auto &CE : Clone->CallerEdges) { |
3068 | // Ignore any caller that does not have a recorded callsite Call. |
3069 | if (!CE->Caller->hasCall()) |
3070 | continue; |
3071 | RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); |
3072 | } |
3073 | continue; |
3074 | } |
3075 | |
3076 | // First locate which copy of OrigFunc to clone again. If a caller |
3077 | // of this callsite clone was already assigned to call a particular |
3078 | // function clone, we need to redirect all of those callers to the |
3079 | // new function clone, and update their other callees within this |
3080 | // function. |
3081 | FuncInfo PreviousAssignedFuncClone; |
3082 | auto EI = llvm::find_if( |
3083 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
3084 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
3085 | }); |
3086 | bool CallerAssignedToCloneOfFunc = false; |
3087 | if (EI != Clone->CallerEdges.end()) { |
3088 | const std::shared_ptr<ContextEdge> &Edge = *EI; |
3089 | PreviousAssignedFuncClone = |
3090 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
3091 | CallerAssignedToCloneOfFunc = true; |
3092 | } |
3093 | |
3094 | // Clone function and save it along with the CallInfo map created |
3095 | // during cloning in the FuncClonesToCallMap. |
3096 | std::map<CallInfo, CallInfo> NewCallMap; |
3097 | unsigned CloneNo = FuncClonesToCallMap.size(); |
3098 | assert(CloneNo > 0 && "Clone 0 is the original function, which " |
3099 | "should already exist in the map" ); |
3100 | FuncInfo NewFuncClone = cloneFunctionForCallsite( |
3101 | Func&: OrigFunc, Call, CallMap&: NewCallMap, CallsWithMetadataInFunc&: CallsWithMetadata, CloneNo); |
3102 | FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); |
3103 | FunctionClonesAnalysis++; |
3104 | Changed = true; |
3105 | |
3106 | // If no caller callsites were already assigned to a clone of this |
3107 | // function, we can simply assign this clone to the new func clone |
3108 | // and update all callers to it, then skip to the next clone. |
3109 | if (!CallerAssignedToCloneOfFunc) { |
3110 | AssignCallsiteCloneToFuncClone( |
3111 | NewFuncClone, Call, Clone, |
3112 | AllocationCallToContextNodeMap.count(Call)); |
3113 | for (auto &CE : Clone->CallerEdges) { |
3114 | // Ignore any caller that does not have a recorded callsite Call. |
3115 | if (!CE->Caller->hasCall()) |
3116 | continue; |
3117 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
3118 | } |
3119 | continue; |
3120 | } |
3121 | |
3122 | // We may need to do additional node cloning in this case. |
3123 | // Reset the CallsiteToCalleeFuncCloneMap entry for any callers |
3124 | // that were previously assigned to call PreviousAssignedFuncClone, |
3125 | // to record that they now call NewFuncClone. |
3126 | for (auto CE : Clone->CallerEdges) { |
3127 | // Skip any that have been removed on an earlier iteration. |
3128 | if (!CE) |
3129 | continue; |
3130 | // Ignore any caller that does not have a recorded callsite Call. |
3131 | if (!CE->Caller->hasCall()) |
3132 | continue; |
3133 | |
3134 | if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || |
3135 | // We subsequently fall through to later handling that |
3136 | // will perform any additional cloning required for |
3137 | // callers that were calling other function clones. |
3138 | CallsiteToCalleeFuncCloneMap[CE->Caller] != |
3139 | PreviousAssignedFuncClone) |
3140 | continue; |
3141 | |
3142 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
3143 | |
3144 | // If we are cloning a function that was already assigned to some |
3145 | // callers, then essentially we are creating new callsite clones |
3146 | // of the other callsites in that function that are reached by those |
3147 | // callers. Clone the other callees of the current callsite's caller |
3148 | // that were already assigned to PreviousAssignedFuncClone |
3149 | // accordingly. This is important since we subsequently update the |
3150 | // calls from the nodes in the graph and their assignments to callee |
3151 | // functions recorded in CallsiteToCalleeFuncCloneMap. |
3152 | for (auto CalleeEdge : CE->Caller->CalleeEdges) { |
3153 | // Skip any that have been removed on an earlier iteration when |
3154 | // cleaning up newly None type callee edges. |
3155 | if (!CalleeEdge) |
3156 | continue; |
3157 | ContextNode *Callee = CalleeEdge->Callee; |
3158 | // Skip the current callsite, we are looking for other |
3159 | // callsites Caller calls, as well as any that does not have a |
3160 | // recorded callsite Call. |
3161 | if (Callee == Clone || !Callee->hasCall()) |
3162 | continue; |
ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
removeNoneTypeCalleeEdges(NewClone);
// Moving the edge may have resulted in some none type
// callee edges on the original Callee.
removeNoneTypeCalleeEdges(Callee);
3168 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
3169 | // If the Callee node was already assigned to call a specific |
3170 | // function version, make sure its new clone is assigned to call |
3171 | // that same function clone. |
3172 | if (CallsiteToCalleeFuncCloneMap.count(Callee)) |
3173 | RecordCalleeFuncOfCallsite( |
3174 | NewClone, CallsiteToCalleeFuncCloneMap[Callee]); |
3175 | // Update NewClone with the new Call clone of this callsite's Call |
3176 | // created for the new function clone created earlier. |
3177 | // Recall that we have already ensured when building the graph |
3178 | // that each caller can only call callsites within the same |
3179 | // function, so we are guaranteed that Callee Call is in the |
3180 | // current OrigFunc. |
3181 | // CallMap is set up as indexed by original Call at clone 0. |
3182 | CallInfo OrigCall(Callee->getOrigNode()->Call); |
3183 | OrigCall.setCloneNo(0); |
3184 | std::map<CallInfo, CallInfo> &CallMap = |
3185 | FuncClonesToCallMap[NewFuncClone]; |
3186 | assert(CallMap.count(OrigCall)); |
3187 | CallInfo NewCall(CallMap[OrigCall]); |
3188 | assert(NewCall); |
3189 | NewClone->setCall(NewCall); |
3190 | } |
3191 | } |
3192 | // Fall through to handling below to perform the recording of the |
3193 | // function for this callsite clone. This enables handling of cases |
3194 | // where the callers were assigned to different clones of a function. |
3195 | } |
3196 | |
3197 | // See if we can use existing function clone. Walk through |
3198 | // all caller edges to see if any have already been assigned to |
3199 | // a clone of this callsite's function. If we can use it, do so. If not, |
3200 | // because that function clone is already assigned to a different clone |
3201 | // of this callsite, then we need to clone again. |
3202 | // Basically, this checking is needed to handle the case where different |
3203 | // caller functions/callsites may need versions of this function |
3204 | // containing different mixes of callsite clones across the different |
3205 | // callsites within the function. If that happens, we need to create |
3206 | // additional function clones to handle the various combinations. |
3207 | // |
3208 | // Keep track of any new clones of this callsite created by the |
3209 | // following loop, as well as any existing clone that we decided to |
3210 | // assign this clone to. |
3211 | std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; |
3212 | FuncInfo FuncCloneAssignedToCurCallsiteClone; |
3213 | // We need to be able to remove Edge from CallerEdges, so need to adjust |
3214 | // iterator in the loop. |
3215 | for (auto EI = Clone->CallerEdges.begin(); |
3216 | EI != Clone->CallerEdges.end();) { |
3217 | auto Edge = *EI; |
3218 | // Ignore any caller that does not have a recorded callsite Call. |
3219 | if (!Edge->Caller->hasCall()) { |
3220 | EI++; |
3221 | continue; |
3222 | } |
3223 | // If this caller already assigned to call a version of OrigFunc, need |
3224 | // to ensure we can assign this callsite clone to that function clone. |
3225 | if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { |
3226 | FuncInfo FuncCloneCalledByCaller = |
3227 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
3228 | // First we need to confirm that this function clone is available |
3229 | // for use by this callsite node clone. |
3230 | // |
3231 | // While FuncCloneToCurNodeCloneMap is built only for this Node and |
3232 | // its callsite clones, one of those callsite clones X could have |
3233 | // been assigned to the same function clone called by Edge's caller |
3234 | // - if Edge's caller calls another callsite within Node's original |
3235 | // function, and that callsite has another caller reaching clone X. |
3236 | // We need to clone Node again in this case. |
3237 | if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && |
3238 | FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != |
3239 | Clone) || |
3240 | // Detect when we have multiple callers of this callsite that |
3241 | // have already been assigned to specific, and different, clones |
3242 | // of OrigFunc (due to other unrelated callsites in Func they |
3243 | // reach via call contexts). Is this Clone of callsite Node |
3244 | // assigned to a different clone of OrigFunc? If so, clone Node |
3245 | // again. |
3246 | (FuncCloneAssignedToCurCallsiteClone && |
3247 | FuncCloneAssignedToCurCallsiteClone != |
3248 | FuncCloneCalledByCaller)) { |
3249 | // We need to use a different newly created callsite clone, in |
3250 | // order to assign it to another new function clone on a |
3251 | // subsequent iteration over the Clones array (adjusted below). |
3252 | // Note we specifically do not reset the |
3253 | // CallsiteToCalleeFuncCloneMap entry for this caller, so that |
3254 | // when this new clone is processed later we know which version of |
3255 | // the function to copy (so that other callsite clones we have |
3256 | // assigned to that function clone are properly cloned over). See |
3257 | // comments in the function cloning handling earlier. |
3258 | |
3259 | // Check if we already have cloned this callsite again while |
3260 | // walking through caller edges, for a caller calling the same |
3261 | // function clone. If so, we can move this edge to that new clone |
3262 | // rather than creating yet another new clone. |
3263 | if (FuncCloneToNewCallsiteCloneMap.count( |
3264 | FuncCloneCalledByCaller)) { |
3265 | ContextNode *NewClone = |
3266 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; |
moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
// Cleanup any none type edges cloned over.
removeNoneTypeCalleeEdges(NewClone);
} else {
// Create a new callsite clone.
ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
removeNoneTypeCalleeEdges(NewClone);
3274 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = |
3275 | NewClone; |
3276 | // Add to list of clones and process later. |
3277 | ClonesWorklist.push_back(NewClone); |
3278 | assert(EI == Clone->CallerEdges.end() || |
3279 | Clone->AllocTypes != (uint8_t)AllocationType::None); |
3280 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
3281 | } |
3282 | // Moving the caller edge may have resulted in some none type |
3283 | // callee edges. |
removeNoneTypeCalleeEdges(Clone);
3285 | // We will handle the newly created callsite clone in a subsequent |
3286 | // iteration over this Node's Clones. Continue here since we |
3287 | // already adjusted iterator EI while moving the edge. |
3288 | continue; |
3289 | } |
3290 | |
3291 | // Otherwise, we can use the function clone already assigned to this |
3292 | // caller. |
3293 | if (!FuncCloneAssignedToCurCallsiteClone) { |
3294 | FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; |
3295 | // Assign Clone to FuncCloneCalledByCaller |
3296 | AssignCallsiteCloneToFuncClone( |
3297 | FuncCloneCalledByCaller, Call, Clone, |
3298 | AllocationCallToContextNodeMap.count(Call)); |
3299 | } else |
3300 | // Don't need to do anything - callsite is already calling this |
3301 | // function clone. |
3302 | assert(FuncCloneAssignedToCurCallsiteClone == |
3303 | FuncCloneCalledByCaller); |
3304 | |
3305 | } else { |
3306 | // We have not already assigned this caller to a version of |
3307 | // OrigFunc. Do the assignment now. |
3308 | |
3309 | // First check if we have already assigned this callsite clone to a |
3310 | // clone of OrigFunc for another caller during this iteration over |
3311 | // its caller edges. |
3312 | if (!FuncCloneAssignedToCurCallsiteClone) { |
3313 | // Find first function in FuncClonesToCallMap without an assigned |
3314 | // clone of this callsite Node. We should always have one |
3315 | // available at this point due to the earlier cloning when the |
3316 | // FuncClonesToCallMap size was smaller than the clone number. |
3317 | for (auto &CF : FuncClonesToCallMap) { |
3318 | if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { |
3319 | FuncCloneAssignedToCurCallsiteClone = CF.first; |
3320 | break; |
3321 | } |
3322 | } |
3323 | assert(FuncCloneAssignedToCurCallsiteClone); |
3324 | // Assign Clone to FuncCloneAssignedToCurCallsiteClone |
3325 | AssignCallsiteCloneToFuncClone( |
3326 | FuncCloneAssignedToCurCallsiteClone, Call, Clone, |
3327 | AllocationCallToContextNodeMap.count(Call)); |
3328 | } else |
3329 | assert(FuncCloneToCurNodeCloneMap |
3330 | [FuncCloneAssignedToCurCallsiteClone] == Clone); |
3331 | // Update callers to record function version called. |
3332 | RecordCalleeFuncOfCallsite(Edge->Caller, |
3333 | FuncCloneAssignedToCurCallsiteClone); |
3334 | } |
3335 | |
3336 | EI++; |
3337 | } |
3338 | } |
3339 | if (VerifyCCG) { |
3340 | checkNode<DerivedCCG, FuncTy, CallTy>(Node); |
3341 | for (const auto &PE : Node->CalleeEdges) |
3342 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
3343 | for (const auto &CE : Node->CallerEdges) |
3344 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
3345 | for (auto *Clone : Node->Clones) { |
3346 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
3347 | for (const auto &PE : Clone->CalleeEdges) |
3348 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
3349 | for (const auto &CE : Clone->CallerEdges) |
3350 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
3351 | } |
3352 | } |
3353 | } |
3354 | } |
3355 | |
3356 | auto UpdateCalls = [&](ContextNode *Node, |
3357 | DenseSet<const ContextNode *> &Visited, |
3358 | auto &&UpdateCalls) { |
3359 | auto Inserted = Visited.insert(Node); |
3360 | if (!Inserted.second) |
3361 | return; |
3362 | |
3363 | for (auto *Clone : Node->Clones) |
3364 | UpdateCalls(Clone, Visited, UpdateCalls); |
3365 | |
3366 | for (auto &Edge : Node->CallerEdges) |
3367 | UpdateCalls(Edge->Caller, Visited, UpdateCalls); |
3368 | |
3369 | // Skip if either no call to update, or if we ended up with no context ids |
3370 | // (we moved all edges onto other clones). |
3371 | if (!Node->hasCall() || Node->emptyContextIds()) |
3372 | return; |
3373 | |
3374 | if (Node->IsAllocation) { |
3375 | updateAllocationCall(Call&: Node->Call, AllocType: allocTypeToUse(Node->AllocTypes)); |
3376 | return; |
3377 | } |
3378 | |
3379 | if (!CallsiteToCalleeFuncCloneMap.count(Node)) |
3380 | return; |
3381 | |
3382 | auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; |
3383 | updateCall(CallerCall&: Node->Call, CalleeFunc); |
3384 | }; |
3385 | |
3386 | // Performs DFS traversal starting from allocation nodes to update calls to |
3387 | // reflect cloning decisions recorded earlier. For regular LTO this will |
3388 | // update the actual calls in the IR to call the appropriate function clone |
3389 | // (and add attributes to allocation calls), whereas for ThinLTO the decisions |
3390 | // are recorded in the summary entries. |
3391 | DenseSet<const ContextNode *> Visited; |
3392 | for (auto &Entry : AllocationCallToContextNodeMap) |
3393 | UpdateCalls(Entry.second, Visited, UpdateCalls); |
3394 | |
3395 | return Changed; |
3396 | } |
3397 | |
3398 | static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> ( |
3399 | Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, |
3400 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
3401 | &FuncToAliasMap) { |
3402 | // The first "clone" is the original copy, we should only call this if we |
3403 | // needed to create new clones. |
3404 | assert(NumClones > 1); |
3405 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
VMaps.reserve(NumClones - 1);
3407 | FunctionsClonedThinBackend++; |
3408 | for (unsigned I = 1; I < NumClones; I++) { |
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
auto *NewF = CloneFunction(&F, *VMaps.back());
3411 | FunctionClonesThinBackend++; |
3412 | // Strip memprof and callsite metadata from clone as they are no longer |
3413 | // needed. |
3414 | for (auto &BB : *NewF) { |
3415 | for (auto &Inst : BB) { |
Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3418 | } |
3419 | } |
std::string Name = getMemProfFuncName(F.getName(), I);
3421 | auto *PrevF = M.getFunction(Name); |
3422 | if (PrevF) { |
3423 | // We might have created this when adjusting callsite in another |
3424 | // function. It should be a declaration. |
3425 | assert(PrevF->isDeclaration()); |
NewF->takeName(PrevF);
PrevF->replaceAllUsesWith(NewF);
3428 | PrevF->eraseFromParent(); |
3429 | } else |
3430 | NewF->setName(Name); |
3431 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofClone" , &F) |
3432 | << "created clone " << ore::NV("NewFunction" , NewF)); |
3433 | |
3434 | // Now handle aliases to this function, and clone those as well. |
3435 | if (!FuncToAliasMap.count(x: &F)) |
3436 | continue; |
3437 | for (auto *A : FuncToAliasMap[&F]) { |
std::string Name = getMemProfFuncName(A->getName(), I);
3439 | auto *PrevA = M.getNamedAlias(Name); |
auto *NewA = GlobalAlias::create(A->getValueType(),
A->getType()->getPointerAddressSpace(),
A->getLinkage(), Name, NewF);
NewA->copyAttributesFrom(A);
3444 | if (PrevA) { |
3445 | // We might have created this when adjusting callsite in another |
3446 | // function. It should be a declaration. |
3447 | assert(PrevA->isDeclaration()); |
NewA->takeName(PrevA);
PrevA->replaceAllUsesWith(NewA);
3450 | PrevA->eraseFromParent(); |
3451 | } |
3452 | } |
3453 | } |
3454 | return VMaps; |
3455 | } |
3456 | |
3457 | // Locate the summary for F. This is complicated by the fact that it might |
3458 | // have been internalized or promoted. |
3459 | static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, |
3460 | const ModuleSummaryIndex *ImportSummary) { |
3461 | // FIXME: Ideally we would retain the original GUID in some fashion on the |
3462 | // function (e.g. as metadata), but for now do our best to locate the |
3463 | // summary without that information. |
ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3465 | if (!TheFnVI) |
3466 | // See if theFn was internalized, by checking index directly with |
3467 | // original name (this avoids the name adjustment done by getGUID() for |
3468 | // internal symbols). |
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3470 | if (TheFnVI) |
3471 | return TheFnVI; |
3472 | // Now query with the original name before any promotion was performed. |
3473 | StringRef OrigName = |
ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3478 | if (TheFnVI) |
3479 | return TheFnVI; |
3480 | // Could be a promoted local imported from another module. We need to pass |
3481 | // down more info here to find the original module id. For now, try with |
3482 | // the OrigName which might have been stored in the OidGuidMap in the |
3483 | // index. This would not work if there were same-named locals in multiple |
3484 | // modules, however. |
3485 | auto OrigGUID = |
ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3487 | if (OrigGUID) |
TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3489 | return TheFnVI; |
3490 | } |
3491 | |
3492 | bool MemProfContextDisambiguation::applyImport(Module &M) { |
3493 | assert(ImportSummary); |
3494 | bool Changed = false; |
3495 | |
3496 | auto IsMemProfClone = [](const Function &F) { |
return F.getName().contains(MemProfCloneSuffix);
3498 | }; |
3499 | |
3500 | // We also need to clone any aliases that reference cloned functions, because |
3501 | // the modified callsites may invoke via the alias. Keep track of the aliases |
3502 | // for each function. |
3503 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
3504 | FuncToAliasMap; |
3505 | for (auto &A : M.aliases()) { |
3506 | auto *Aliasee = A.getAliaseeObject(); |
3507 | if (auto *F = dyn_cast<Function>(Val: Aliasee)) |
3508 | FuncToAliasMap[F].insert(Ptr: &A); |
3509 | } |
3510 | |
3511 | for (auto &F : M) { |
3512 | if (F.isDeclaration() || IsMemProfClone(F)) |
3513 | continue; |
3514 | |
3515 | OptimizationRemarkEmitter ORE(&F); |
3516 | |
3517 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
3518 | bool ClonesCreated = false; |
3519 | unsigned NumClonesCreated = 0; |
3520 | auto CloneFuncIfNeeded = [&](unsigned NumClones) { |
3521 | // We should at least have version 0 which is the original copy. |
3522 | assert(NumClones > 0); |
3523 | // If only one copy needed use original. |
3524 | if (NumClones == 1) |
3525 | return; |
3526 | // If we already performed cloning of this function, confirm that the |
3527 | // requested number of clones matches (the thin link should ensure the |
3528 | // number of clones for each constituent callsite is consistent within |
3529 | // each function), before returning. |
3530 | if (ClonesCreated) { |
3531 | assert(NumClonesCreated == NumClones); |
3532 | return; |
3533 | } |
3534 | VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); |
3535 | // The first "clone" is the original copy, which doesn't have a VMap. |
3536 | assert(VMaps.size() == NumClones - 1); |
3537 | Changed = true; |
3538 | ClonesCreated = true; |
3539 | NumClonesCreated = NumClones; |
3540 | }; |
3541 | |
3542 | auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, |
3543 | Function *CalledFunction) { |
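// Update each function clone's copy of this call to invoke the callee
// function clone recorded in the summary, creating the function clones
// first if needed.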
3544 | // Perform cloning if not yet done. |
3545 | CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); |
3546 | |
3547 | // Should have skipped indirect calls via mayHaveMemprofSummary. |
3548 | assert(CalledFunction); |
3549 | assert(!IsMemProfClone(*CalledFunction)); |
3550 | |
3551 | // Update the calls per the summary info. |
3552 | // Save orig name since it gets updated in the first iteration |
3553 | // below. |
3554 | auto CalleeOrigName = CalledFunction->getName(); |
3555 | for (unsigned J = 0; J < StackNode.Clones.size(); J++) { |
3556 | // Do nothing if this version calls the original version of its |
3557 | // callee. |
3558 | if (!StackNode.Clones[J]) |
3559 | continue; |
auto NewF = M.getOrInsertFunction(
getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
CalledFunction->getFunctionType());
3563 | CallBase *CBClone; |
3564 | // Copy 0 is the original function. |
3565 | if (!J) |
3566 | CBClone = CB; |
3567 | else |
3568 | CBClone = cast<CallBase>(Val&: (*VMaps[J - 1])[CB]); |
3569 | CBClone->setCalledFunction(NewF); |
3570 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CBClone) |
3571 | << ore::NV("Call" , CBClone) << " in clone " |
3572 | << ore::NV("Caller" , CBClone->getFunction()) |
3573 | << " assigned to call function clone " |
3574 | << ore::NV("Callee" , NewF.getCallee())); |
3575 | } |
3576 | }; |
3577 | |
3578 | // Locate the summary for F. |
3579 | ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); |
3580 | // If not found, this could be an imported local (see comment in |
3581 | // findValueInfoForFunc). Skip for now as it will be cloned in its original |
3582 | // module (where it would have been promoted to global scope so should |
3583 | // satisfy any reference in this module). |
3584 | if (!TheFnVI) |
3585 | continue; |
3586 | |
3587 | auto *GVSummary = |
ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3589 | if (!GVSummary) { |
// Must have been imported, use the summary which matches the definition
3591 | // (might be multiple if this was a linkonce_odr). |
auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3593 | assert(SrcModuleMD && |
3594 | "enable-import-metadata is needed to emit thinlto_src_module" ); |
3595 | StringRef SrcModule = |
3596 | dyn_cast<MDString>(Val: SrcModuleMD->getOperand(I: 0))->getString(); |
3597 | for (auto &GVS : TheFnVI.getSummaryList()) { |
3598 | if (GVS->modulePath() == SrcModule) { |
3599 | GVSummary = GVS.get(); |
3600 | break; |
3601 | } |
3602 | } |
3603 | assert(GVSummary && GVSummary->modulePath() == SrcModule); |
3604 | } |
3605 | |
3606 | // If this was an imported alias skip it as we won't have the function |
3607 | // summary, and it should be cloned in the original module. |
3608 | if (isa<AliasSummary>(Val: GVSummary)) |
3609 | continue; |
3610 | |
3611 | auto *FS = cast<FunctionSummary>(Val: GVSummary->getBaseObject()); |
3612 | |
3613 | if (FS->allocs().empty() && FS->callsites().empty()) |
3614 | continue; |
3615 | |
3616 | auto SI = FS->callsites().begin(); |
3617 | auto AI = FS->allocs().begin(); |
3618 | |
3619 | // To handle callsite infos synthesized for tail calls which have missing |
3620 | // frames in the profiled context, map callee VI to the synthesized callsite |
3621 | // info. |
3622 | DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; |
3623 | // Iterate the callsites for this function in reverse, since we place all |
3624 | // those synthesized for tail calls at the end. |
3625 | for (auto CallsiteIt = FS->callsites().rbegin(); |
3626 | CallsiteIt != FS->callsites().rend(); CallsiteIt++) { |
3627 | auto &Callsite = *CallsiteIt; |
3628 | // Stop as soon as we see a non-synthesized callsite info (see comment |
3629 | // above loop). All the entries added for discovered tail calls have empty |
3630 | // stack ids. |
3631 | if (!Callsite.StackIdIndices.empty()) |
3632 | break; |
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3634 | } |
3635 | |
3636 | // Assume for now that the instructions are in the exact same order |
3637 | // as when the summary was created, but confirm this is correct by |
3638 | // matching the stack ids. |
3639 | for (auto &BB : F) { |
3640 | for (auto &I : BB) { |
3641 | auto *CB = dyn_cast<CallBase>(Val: &I); |
3642 | // Same handling as when creating module summary. |
3643 | if (!mayHaveMemprofSummary(CB)) |
3644 | continue; |
3645 | |
3646 | auto *CalledValue = CB->getCalledOperand(); |
3647 | auto *CalledFunction = CB->getCalledFunction(); |
3648 | if (CalledValue && !CalledFunction) { |
3649 | CalledValue = CalledValue->stripPointerCasts(); |
3650 | // Stripping pointer casts can reveal a called function. |
3651 | CalledFunction = dyn_cast<Function>(Val: CalledValue); |
3652 | } |
3653 | // Check if this is an alias to a function. If so, get the |
3654 | // called aliasee for the checks below. |
3655 | if (auto *GA = dyn_cast<GlobalAlias>(Val: CalledValue)) { |
3656 | assert(!CalledFunction && |
3657 | "Expected null called function in callsite for alias" ); |
3658 | CalledFunction = dyn_cast<Function>(Val: GA->getAliaseeObject()); |
3659 | } |
3660 | |
3661 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
I.getMetadata(LLVMContext::MD_callsite));
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics.
        if (CB->getAttributes().hasFnAttr("memprof")) {
          assert(!MemProfMD);
          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          // Remove any remaining callsite metadata; we can skip the rest of
          // the handling for this instruction, since no cloning is needed.
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

          // Sanity check that the MIB stack ids match between the summary and
          // instruction metadata.
          auto MIBIter = AllocNode.MIBs.begin();
          for (auto &MDOp : MemProfMD->operands()) {
            assert(MIBIter != AllocNode.MIBs.end());
            LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
                MIBIter->StackIdIndices.begin();
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackMDNode = getMIBStackNode(MIBMD);
            assert(StackMDNode);
            CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
            auto ContextIterBegin =
                StackContext.beginAfterSharedPrefix(CallsiteContext);
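            // (The MIB's stack context starts with the frames already
            // represented by this call's !callsite metadata, if any;
            // beginAfterSharedPrefix skips that shared prefix so only the
            // remaining frames are compared against the summary's stack id
            // indices below.)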
            // Initialize LastStackContextId to a value guaranteed to differ
            // from the first entry, so the recursion-skipping check below
            // does not trigger on the first iteration.
            uint64_t LastStackContextId =
                (ContextIterBegin != StackContext.end() &&
                 *ContextIterBegin == 0)
                    ? 1
                    : 0;
            for (auto ContextIter = ContextIterBegin;
                 ContextIter != StackContext.end(); ++ContextIter) {
              // If this is a direct recursion, simply skip the duplicate
              // entries, to be consistent with how the summary ids were
              // generated during ModuleSummaryAnalysis.
              if (LastStackContextId == *ContextIter)
                continue;
              LastStackContextId = *ContextIter;
              assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     *ContextIter);
              StackIdIndexIter++;
            }
            MIBIter++;
          }

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();

          // If there is only one version, we didn't end up considering this
          // function for cloning; in that case the alloc will still have None
          // type or should have gotten the default NotCold. Skip it, but only
          // after calling the clone helper above, since that does some sanity
          // checks confirming we haven't already decided that cloning is
          // needed.
          if (AllocNode.Versions.size() == 1) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));
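          // (AllocationType is a bitmask, so NotCold|Cold would indicate a
          // version whose contexts were never fully disambiguated; the thin
          // link should not have produced such a version at this point.)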

          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
            auto A = llvm::Attribute::get(F.getContext(), "memprof",
                                          AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a
              // VMaps entry).
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(A);
            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
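          // After this loop, each surviving clone's allocation call carries
          // the string function attribute, e.g. (illustrative IR):
          //   %call = call ptr @_Znam(i64 %n) #1
          //   attributes #1 = { "memprof"="cold" }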
        } else if (!CallsiteContext.empty()) {
          // Consult the next callsite node.
          assert(SI != FS->callsites().end());
          auto &StackNode = *(SI++);

#ifndef NDEBUG
          // Sanity check that the stack ids match between the summary and
          // instruction metadata.
          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
          for (auto StackId : CallsiteContext) {
            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                   StackId);
            StackIdIndexIter++;
          }
#endif

          CloneCallsite(StackNode, CB, CalledFunction);
        } else if (CB->isTailCall()) {
          // Locate the synthesized callsite info for the callee VI, if any was
          // created, and use that for cloning.
          ValueInfo CalleeVI =
              findValueInfoForFunc(*CalledFunction, M, ImportSummary);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
        // Memprof and callsite metadata on memory allocations no longer
        // needed.
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
  }

  return Changed;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
  if (DumpCCG) {
    dbgs() << "CCG before cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("postbuild");

  if (VerifyCCG) {
    check();
  }

  identifyClones();

  if (VerifyCCG) {
    check();
  }

  if (DumpCCG) {
    dbgs() << "CCG after cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("cloned");

  bool Changed = assignFunctions();

  if (DumpCCG) {
    dbgs() << "CCG after assigning function clones:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("clonefuncassign");

  if (MemProfReportHintedSizes)
    printTotalSizes(errs());

  return Changed;
}

bool MemProfContextDisambiguation::processModule(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {

  // If we have an import summary, then the cloning decisions were made during
  // the thin link on the index. Apply them and return.
  if (ImportSummary)
    return applyImport(M);

  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // Note that we specifically check this after applying imports above, so that
  // the option doesn't need to be passed to distributed ThinLTO backend
  // clang processes, which won't necessarily have visibility into the linker
  // dependences. Instead the information is communicated from the LTO link to
  // the backends via the combined summary index.
  if (!SupportsHotColdNew)
    return false;

  ModuleCallsiteContextGraph CCG(M, OREGetter);
  return CCG.process();
}

MemProfContextDisambiguation::MemProfContextDisambiguation(
    const ModuleSummaryIndex *Summary)
    : ImportSummary(Summary) {
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
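    // A minimal sketch of that opt-based testing flow (the pass name and the
    // -memprof-import-summary option are assumed from this file's pass
    // registration and cl::opt definitions):
    //   opt -passes=memprof-context-disambiguation \
    //       -memprof-import-summary=<distributed .thinlto.bc> module.ll -S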
    assert(MemProfImportSummary.empty());
    return;
  }
  if (MemProfImportSummary.empty())
    return;

  auto ReadSummaryFile =
      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}

PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  if (!processModule(M, OREGetter))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

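// Thin-link entry point: run context disambiguation over the combined summary
// index. The cloning decisions recorded here are later applied in each ThinLTO
// backend via the import summary path in processModule/applyImport above.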
void MemProfContextDisambiguation::run(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing) {
  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // The index was set from the option, so these should be in sync.
  assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
  if (!SupportsHotColdNew)
    return;

  IndexCallsiteContextGraph CCG(Index, isPrevailing);
  CCG.process();
}