//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/User.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined,
          "Number of times functions with cold outlined regions were "
          "partially inlined into their caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. The default is 10%,
// i.e. the cold region must reduce the inlining cost of the original function
// by at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of executions needed in the predecessor
// block of the cold edge, i.e. a confidence threshold.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. The default is 10%,
// i.e. an edge with a branch probability of 10% or less is deemed cold.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user-annotated branch data is absent. It is the
// smallest value used to weigh the outline region. If BFI produces a larger
// value, the BFI value is used instead.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

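// Note: the options above are tuning knobs only; the pass still has to be
// scheduled to have any effect. For experimentation it can be run directly,
// e.g. `opt -passes=partial-inliner in.ll -S` (assuming the usual new pass
// manager registry name for PartialInlinerPass); in the default optimization
// pipelines it is typically gated behind a separate enabling flag.
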
namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo() = default;

  // Container for outline regions.
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region, BasicBlock *EntryBlock,
                      BasicBlock *ExitBlock, BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

struct PartialInlinerImpl {

  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in
  // the outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function &F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);

    ~FunctionCloner();

    // Prepare for function outlining: make sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void normalizeReturnBlock() const;

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for the region after the early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called
    // from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    InstructionCost OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
    function_ref<TargetTransformInfo &(Function &)> GetTTI;
  };

private:
  int NumPartialInlining = 0;
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented as a BranchProbability.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability
  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE) const;

  // Try to inline DuplicateFunction (cloned from F, with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from each use site of DuplicateFunction to the
  // enclosing BB's profile count.
  void
  computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                DenseMap<User *, uint64_t> &SiteCountMap) const;

  bool isLimitReached() const {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr;
  }

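  // Assumes F has at least one user: this is only called on functions that
  // are known to have a call site, e.g. a freshly outlined function or the
  // cloned function whose callers have not yet been rewritten back.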
  static CallBase *getOneCallSiteTo(Function &F) {
    User *User = *F.user_begin();
    return getSupportedCallBase(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  // - The second value is the non-weighted runtime overhead of making the
  //   call to the outlined function, including the additional setup cost
  //   in the outlined function itself.
  std::tuple<InstructionCost, InstructionCost>
  computeOutliningCosts(FunctionCloner &Cloner) const;

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static InstructionCost computeBBInlineCost(BasicBlock *BB,
                                             TargetTransformInfo *TTI);

  std::unique_ptr<FunctionOutliningInfo>
  computeOutliningInfo(Function &F) const;

  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function &F,
                                  OptimizationRemarkEmitter &ORE) const;
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(
    Function &F, OptimizationRemarkEmitter &ORE) const {
  BasicBlock *EntryBlock = &F.front();

  DominatorTree DT(F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(GetBFI(F));

  // Return if we don't have profiling information.
  if (!PSI.hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      std::make_unique<FunctionOutliningMultiRegionInfo>();

  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (BasicBlock *Succ : successors(Block)) {
        if (!is_contained(BlockList, Succ)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &Succ->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          }

          ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB).value_or(0);
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // outlining the candidate region.
  TargetTransformInfo *FTTI = &GetTTI(F);
  InstructionCost OverallFunctionCost = 0;
  for (auto &BB : F)
    OverallFunctionCost += computeBBInlineCost(&BB, FTTI);

  LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
                    << "\n";);

  InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
      [&](auto Cost) { return Cost * MinRegionSizeRatio; });

  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
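  // With the default settings above (ColdBranchRatio = 0.1,
  // MinBlockCounterExecution = 100) this constructs the probability 10/100,
  // i.e. an edge is treated as cold below only when its estimated branch
  // probability is 10% or less.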
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  SmallPtrSet<BasicBlock *, 8> VisitedSet;
  DFS.push_back(CurrEntry);
  VisitedSet.insert(CurrEntry);

  // Use a depth-first search on the basic blocks to find CFG edges that are
  // considered cold.
  // A candidate cold region must also have its inline cost compared to the
  // overall inline cost of the original function; the region is outlined only
  // if it reduces the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *ThisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI.isColdBlock(ThisBB, BFI) ||
        BBProfileCount(ThisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
      if (!VisitedSet.insert(*SI).second)
        continue;
      DFS.push_back(*SI);
      // If the branch isn't cold, skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;

      LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
                        << SI->getName()
                        << "\nBranch Probability = " << SuccProb << "\n";);

      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      assert(!DominateVector.empty() &&
             "SI should be reachable and have at least itself as descendant");

      // We can only outline single entry regions (for now).
      if (!DominateVector.front()->hasNPredecessors(1)) {
        LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
                          << " doesn't have a single predecessor in the "
                             "dominator tree\n";);
        continue;
      }

      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector))) {
        LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
                          << " doesn't have a unique successor\n";);
        continue;
      }

      InstructionCost OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));

      LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
                        << "\n";);

      if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", &F)
                 << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });

        LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
                          << MinOutlineRegionCost << "\n";);
        continue;
      }

      // For now, ignore blocks that belong to a single-entry/single-exit
      // (SISE) region that is a candidate for outlining. In the future, we
      // may want to look at inner regions because the outer region may have
      // live-exit variables.
      VisitedSet.insert_range(DominateVector);

      // ReturnBlock here means the block after the outline call.
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      OutliningInfo->ORI.push_back(RegInfo);
      LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
                        << DominateVector.front()->getName() << "\n";);
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }

  if (ColdCandidateFound)
    return OutliningInfo;

  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
  BasicBlock *EntryBlock = &F.front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };
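
  // The overall shape being matched is a chain of guarding entry blocks that
  // either branch to an early-return block or fall through into the region to
  // be outlined. An illustrative sketch (block names are hypothetical):
  //
  //   entry: br i1 %c1, label %cont, label %ret  ; guarding entry #1
  //   cont:  br i1 %c2, label %body, label %ret  ; guarding entry #2 (added by
  //                                              ; the growing loop below)
  //   ret:   ret void                            ; ReturnBlock, kept inline
  //   body:  ...                                 ; NonReturnBlock, outlined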

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc, *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // There should not be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F.front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries(llvm::from_range, OutliningInfo->Entries);

  // Returns true if BB has a predecessor that is not in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto *Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto *Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now grow the candidate's inlining region further by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user-annotated branch data:
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
  if (F.hasProfileData())
    return true;
  // Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI.Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    if (hasBranchWeightMD(*BR))
      return true;
  }
  return false;
}

BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
    FunctionCloner &Cloner) const {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME: Hackery needed because ClonedFuncBFI is based on the function
  // BEFORE we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
    OutliningCallFreq = EntryFreq;

  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/not taken), but the guessed
  // branch probability is usually not biased enough. When the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) so as not to under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual one. For instance, the actual probability of the
  // less likely target may be only 5%, while the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) const {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // If the weighted savings is smaller than the weighted cost, return false.
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share the Inliner's InlineCost analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
InstructionCost
PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
                                        TargetTransformInfo *TTI) {
  InstructionCost InlineCost = 0;
  const DataLayout &DL = BB->getDataLayout();
  int InstrCost = InlineConstants::getInstrCost();
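  // Under this costing, for example, a block containing a 3-case switch plus
  // two ordinary instructions accumulates (3 + 1) * InstrCost + 2 * InstrCost,
  // while "free" instructions (bitcasts, int<->ptr casts, allocas, PHIs,
  // lifetime markers, zero-index GEPs and debug instructions) contribute
  // nothing.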
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip free instructions.
    switch (I.getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    if (I.isLifetimeStartOrEnd())
      continue;

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      Intrinsic::ID IID = II->getIntrinsicID();
      SmallVector<Type *, 4> Tys;
      FastMathFlags FMF;
      for (Value *Val : II->args())
        Tys.push_back(Val->getType());

      if (auto *FPMO = dyn_cast<FPMathOperator>(II))
        FMF = FPMO->getFastMathFlags();

      IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
      InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
      continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(*TTI, *CI, DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(*TTI, *II, DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      InlineCost += (SI->getNumCases() + 1) * InstrCost;
      continue;
    }
    InlineCost += InstrCost;
  }

  return InlineCost;
}

std::tuple<InstructionCost, InstructionCost>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
  InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
    Function *OutlinedFunc = FuncBBPair.first;
    BasicBlock *OutliningCallBB = FuncBBPair.second;
    // Now compute the cost of the call sequence to the outlined function
    // 'OutlinedFunction' in BB 'OutliningCallBB':
    auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
    OutliningFuncCallCost +=
        computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);

    // Now compute the cost of the extracted/outlined function itself:
    for (BasicBlock &BB : *OutlinedFunc)
      OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
  }
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();

  InstructionCost OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty.getValue();
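  // I.e. the runtime overhead is the cost of the new call sequence(s) plus
  // whatever extra code the extractor had to add on top of the original
  // region (argument marshalling, stub blocks), plus any user-specified
  // penalty.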

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&, this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(GetBFI(*Caller));
    }
  };

  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);
    Function *Caller = CB->getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CB->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOI = std::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries)
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));

  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region)
      Region.push_back(cast<BasicBlock>(VMap[BB]));

    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
  auto GetFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = GetFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    if (llvm::all_equal(PN->incoming_values()))
      return PN->getIncomingValue(0);
    return nullptr;
  };

  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHIIt());
  BasicBlock::iterator I = PreReturn->begin();
  BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
    RetPhi->insertBefore(Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After splitting the incoming values, the old phi may become trivial.
    // Keeping the trivial phi can introduce a definition inside the outlined
    // region that is live-out, causing overhead that could otherwise be
    // avoided (loads, stores, argument passing, etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  for (auto *E : ClonedOI->ReturnBlockPreds)
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
}

bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  auto ComputeRegionCost =
      [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
    InstructionCost Cost = 0;
    for (BasicBlock *BB : Region)
      Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile time.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);

  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    InstructionCost CurrentOutlinedRegionCost =
        ComputeRegionCost(RegionInfo.Region);

    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI,
                     LookupAC(*RegionInfo.EntryBlock->getParent()),
                     /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

    LLVM_DEBUG({
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    });

    // Do not extract regions that have live exit variables.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
      CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS->setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted to the out-of-line function).
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           llvm::is_contained(ClonedOI->Entries, BB);
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
      ClonedOI->NonReturnBlock, ClonedFuncTTI);
  for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
    if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
    }

  // Extract the body of the if.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
                    /* AllowVarargs */ true)
          .extractCodeRegion(CEAC);

  if (OutlinedFunc) {
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
  if (F.hasAddressTaken())
    return {false, nullptr};

  // Let the inliner handle it.
  if (F.hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F.hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  if (PSI.isFunctionEntryCold(&F))
    return {false, nullptr};

  if (F.users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(&F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI.hasProfileSummary() && F.hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);

      LLVM_DEBUG({
        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
               << "\n";
      });

      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
        LLVM_DEBUG({
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        });

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall through to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
  Cloner.normalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  if (tryPartialInline(Cloner))
    return {true, OutlinedFunction};

  return {false, nullptr};
}

bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  auto OutliningCosts = computeOutliningCosts(Cloner);

  InstructionCost SizeCost = std::get<0>(OutliningCosts);
  InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);

  assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
         "Expected valid costs");

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI)
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

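  // Scale the (non-weighted) runtime overhead of the outlined call sequence by
  // the estimated relative frequency of the call block: the overhead only
  // matters when the outlined path is actually taken, and with
  // RelativeToEntryFreq == 0 it is ignored entirely.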
  BlockFrequency WeightedRcost =
      BlockFrequency(NonWeightedRcost.getValue()) * RelativeToEntryFreq;

  // If the call sequence(s) to the outlined function(s) are larger than the
  // sum of the original outlined region size(s), outlining does not increase
  // the chances of inlining the function (the inliner uses the size increase
  // to model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    if (isLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct the remark before doing the inlining, as after successful
    // inlining the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV) {
      if (auto It = CallSiteToProfCountMap.find(User);
          It != CallSiteToProfCountMap.end()) {
        uint64_t CallSiteCount = It->second;
        CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
      }
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats.
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
          CalleeEntryCountV, CalleeEntryCount->getType()));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
    return FAM.getCachedResult<AssumptionAnalysis>(F);
  };

  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };

  ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
                         GetTLI, PSI, GetBFI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}