//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/User.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined,
          "Number of times functions with cold outlined regions were "
          "partially inlined into their caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. The default is 10%,
// i.e. the cold region must reduce the inlining cost of the original function
// by at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of executions needed in the predecessor
// block of the cold edge, i.e. a confidence threshold.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. The default is 10%,
// i.e. an edge with a branch probability of 10% or less is deemed cold.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user-annotated branch data is absent. It is the
// smallest value used to weigh the outline region. If BFI produces a larger
// value, the BFI value is used instead.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

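// Note: the options above are tuning knobs only; the pass still has to be
// scheduled to have any effect. For experimentation it can be run directly,
// e.g. `opt -passes=partial-inliner in.ll -S` (assuming the usual new pass
// manager registry name for PartialInlinerPass); in the default optimization
// pipelines it is typically gated behind a separate enabling flag.
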
namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo() = default;

  // Container for outline regions.
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region, BasicBlock *EntryBlock,
                      BasicBlock *ExitBlock, BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

struct PartialInlinerImpl {

  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in
  // the outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function &F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);

    ~FunctionCloner();

    // Prepare for function outlining: make sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void normalizeReturnBlock() const;

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for the region after the early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called
    // from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    InstructionCost OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
    function_ref<TargetTransformInfo &(Function &)> GetTTI;
  };

private:
  int NumPartialInlining = 0;
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented as a BranchProbability.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability
  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE) const;

  // Try to inline DuplicateFunction (cloned from F, with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from each use site of DuplicateFunction to the
  // enclosing BB's profile count.
  void
  computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                DenseMap<User *, uint64_t> &SiteCountMap) const;

  bool isLimitReached() const {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr;
  }

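  // Assumes F has at least one user: this is only called on functions that
  // are known to have a call site, e.g. a freshly outlined function or the
  // cloned function whose callers have not yet been rewritten back.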
  static CallBase *getOneCallSiteTo(Function &F) {
    User *User = *F.user_begin();
    return getSupportedCallBase(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  // - The second value is the non-weighted runtime overhead of making the
  //   call to the outlined function, including the additional setup cost
  //   in the outlined function itself.
  std::tuple<InstructionCost, InstructionCost>
  computeOutliningCosts(FunctionCloner &Cloner) const;

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static InstructionCost computeBBInlineCost(BasicBlock *BB,
                                             TargetTransformInfo *TTI);

  std::unique_ptr<FunctionOutliningInfo>
  computeOutliningInfo(Function &F) const;

  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function &F,
                                  OptimizationRemarkEmitter &ORE) const;
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(
    Function &F, OptimizationRemarkEmitter &ORE) const {
  BasicBlock *EntryBlock = &F.front();

  DominatorTree DT(F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(GetBFI(F));

  // Return if we don't have profiling information.
  if (!PSI.hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      std::make_unique<FunctionOutliningMultiRegionInfo>();

  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (BasicBlock *Succ : successors(Block)) {
        if (!is_contained(BlockList, Succ)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &Succ->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          }

          ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB).value_or(0);
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // outlining the candidate region.
  TargetTransformInfo *FTTI = &GetTTI(F);
  InstructionCost OverallFunctionCost = 0;
  for (auto &BB : F)
    OverallFunctionCost += computeBBInlineCost(&BB, FTTI);

  LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
                    << "\n";);

  InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
      [&](auto Cost) { return Cost * MinRegionSizeRatio; });

  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
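  // With the default settings above (ColdBranchRatio = 0.1,
  // MinBlockCounterExecution = 100) this constructs the probability 10/100,
  // i.e. an edge is treated as cold below only when its estimated branch
  // probability is 10% or less.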
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  SmallPtrSet<BasicBlock *, 8> VisitedSet;
  DFS.push_back(CurrEntry);
  VisitedSet.insert(CurrEntry);

  // Use a depth-first search on the basic blocks to find CFG edges that are
  // considered cold.
  // A candidate cold region must also have its inline cost compared to the
  // overall inline cost of the original function; the region is outlined only
  // if it reduces the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *ThisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI.isColdBlock(ThisBB, BFI) ||
        BBProfileCount(ThisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
      if (!VisitedSet.insert(*SI).second)
        continue;
      DFS.push_back(*SI);
      // If the branch isn't cold, skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;

      LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
                        << SI->getName()
                        << "\nBranch Probability = " << SuccProb << "\n";);

      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      assert(!DominateVector.empty() &&
             "SI should be reachable and have at least itself as descendant");

      // We can only outline single entry regions (for now).
      if (!DominateVector.front()->hasNPredecessors(1)) {
        LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
                          << " doesn't have a single predecessor in the "
                             "dominator tree\n";);
        continue;
      }

      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector))) {
        LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
                          << " doesn't have a unique successor\n";);
        continue;
      }

      InstructionCost OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));

      LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
                        << "\n";);

      if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", &F)
                 << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });

        LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
                          << MinOutlineRegionCost << "\n";);
        continue;
      }

      // For now, ignore blocks that belong to a single-entry/single-exit
      // (SISE) region that is a candidate for outlining. In the future, we
      // may want to look at inner regions because the outer region may have
      // live-exit variables.
      VisitedSet.insert_range(DominateVector);

      // ReturnBlock here means the block after the outline call.
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      OutliningInfo->ORI.push_back(RegInfo);
      LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
                        << DominateVector.front()->getName() << "\n";);
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }

  if (ColdCandidateFound)
    return OutliningInfo;

  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
  BasicBlock *EntryBlock = &F.front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };
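
  // The overall shape being matched is a chain of guarding entry blocks that
  // either branch to an early-return block or fall through into the region to
  // be outlined. An illustrative sketch (block names are hypothetical):
  //
  //   entry: br i1 %c1, label %cont, label %ret  ; guarding entry #1
  //   cont:  br i1 %c2, label %body, label %ret  ; guarding entry #2 (added by
  //                                              ; the growing loop below)
  //   ret:   ret void                            ; ReturnBlock, kept inline
  //   body:  ...                                 ; NonReturnBlock, outlined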

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc, *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // There should not be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F.front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries(llvm::from_range, OutliningInfo->Entries);

  // Returns true if BB has a predecessor that is not in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto *Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto *Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now grow the candidate's inlining region further by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user-annotated branch data:
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
  if (F.hasProfileData())
    return true;
  // Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI.Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    if (hasBranchWeightMD(*BR))
      return true;
  }
  return false;
}

BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
    FunctionCloner &Cloner) const {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME: Hackery needed because ClonedFuncBFI is based on the function
  // BEFORE we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
    OutliningCallFreq = EntryFreq;

  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/not taken), but the guessed
  // branch probability is usually not biased enough. When the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) so as not to under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual one. For instance, the actual probability of the
  // less likely target may be only 5%, while the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) const {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // If the weighted savings is smaller than the weighted cost, return false.
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share the Inliner's InlineCost analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
InstructionCost
PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
                                        TargetTransformInfo *TTI) {
  InstructionCost InlineCost = 0;
  const DataLayout &DL = BB->getDataLayout();
  int InstrCost = InlineConstants::getInstrCost();
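  // Under this costing, for example, a block containing a 3-case switch plus
  // two ordinary instructions accumulates (3 + 1) * InstrCost + 2 * InstrCost,
  // while "free" instructions (bitcasts, int<->ptr casts, allocas, PHIs,
  // lifetime markers, zero-index GEPs and debug instructions) contribute
  // nothing.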
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip free instructions.
    switch (I.getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    if (I.isLifetimeStartOrEnd())
      continue;

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      Intrinsic::ID IID = II->getIntrinsicID();
      SmallVector<Type *, 4> Tys;
      FastMathFlags FMF;
      for (Value *Val : II->args())
        Tys.push_back(Val->getType());

      if (auto *FPMO = dyn_cast<FPMathOperator>(II))
        FMF = FPMO->getFastMathFlags();

      IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
      InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
      continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(*TTI, *CI, DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(*TTI, *II, DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      InlineCost += (SI->getNumCases() + 1) * InstrCost;
      continue;
    }
    InlineCost += InstrCost;
  }

  return InlineCost;
}

std::tuple<InstructionCost, InstructionCost>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
  InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
    Function *OutlinedFunc = FuncBBPair.first;
    BasicBlock *OutliningCallBB = FuncBBPair.second;
    // Now compute the cost of the call sequence to the outlined function
    // 'OutlinedFunction' in BB 'OutliningCallBB':
    auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
    OutliningFuncCallCost +=
        computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);

    // Now compute the cost of the extracted/outlined function itself:
    for (BasicBlock &BB : *OutlinedFunc)
      OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
  }
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();

  InstructionCost OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty.getValue();
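  // I.e. the runtime overhead is the cost of the new call sequence(s) plus
  // whatever extra code the extractor had to add on top of the original
  // region (argument marshalling, stub blocks), plus any user-specified
  // penalty.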

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&, this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(GetBFI(*Caller));
    }
  };

  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);
    Function *Caller = CB->getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CB->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOI = std::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries)
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));

  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region)
      Region.push_back(cast<BasicBlock>(VMap[BB]));

    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
  auto GetFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = GetFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    if (llvm::all_equal(PN->incoming_values()))
      return PN->getIncomingValue(0);
    return nullptr;
  };

  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHIIt());
  BasicBlock::iterator I = PreReturn->begin();
  BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
    RetPhi->insertBefore(Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After splitting the incoming values, the old phi may become trivial.
    // Keeping the trivial phi can introduce a definition inside the outlined
    // region that is live-out, causing overhead that could otherwise be
    // avoided (loads, stores, argument passing, etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  for (auto *E : ClonedOI->ReturnBlockPreds)
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
}

bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  auto ComputeRegionCost =
      [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
    InstructionCost Cost = 0;
    for (BasicBlock *BB : Region)
      Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile time.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);

  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    InstructionCost CurrentOutlinedRegionCost =
        ComputeRegionCost(RegionInfo.Region);

    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI,
                     LookupAC(*RegionInfo.EntryBlock->getParent()),
                     /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

    LLVM_DEBUG({
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    });

    // Do not extract regions that have live exit variables.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
      CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS->setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted to the out-of-line function).
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           llvm::is_contained(ClonedOI->Entries, BB);
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
      ClonedOI->NonReturnBlock, ClonedFuncTTI);
  for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
    if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
    }

  // Extract the body of the if.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
                    /* AllowVarargs */ true)
          .extractCodeRegion(CEAC);

  if (OutlinedFunc) {
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
  if (F.hasAddressTaken())
    return {false, nullptr};

  // Let the inliner handle it.
  if (F.hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F.hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  if (PSI.isFunctionEntryCold(&F))
    return {false, nullptr};

  if (F.users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(&F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI.hasProfileSummary() && F.hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);

      LLVM_DEBUG({
        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
               << "\n";
      });

      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
        LLVM_DEBUG({
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        });

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall through to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
  Cloner.normalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  if (tryPartialInline(Cloner))
    return {true, OutlinedFunction};

  return {false, nullptr};
}

bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  auto OutliningCosts = computeOutliningCosts(Cloner);

  InstructionCost SizeCost = std::get<0>(OutliningCosts);
  InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);

  assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
         "Expected valid costs");

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI)
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

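  // Scale the (non-weighted) runtime overhead of the outlined call sequence by
  // the estimated relative frequency of the call block: the overhead only
  // matters when the outlined path is actually taken, and with
  // RelativeToEntryFreq == 0 it is ignored entirely.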
  BlockFrequency WeightedRcost =
      BlockFrequency(NonWeightedRcost.getValue()) * RelativeToEntryFreq;

  // If the call sequence(s) to the outlined function(s) are larger than the
  // sum of the original outlined region size(s), outlining does not increase
  // the chances of inlining the function (the inliner uses the size increase
  // to model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    if (isLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct the remark before doing the inlining, as after successful
    // inlining the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV) {
      if (auto It = CallSiteToProfCountMap.find(User);
          It != CallSiteToProfCountMap.end()) {
        uint64_t CallSiteCount = It->second;
        CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
      }
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats.
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
          CalleeEntryCountV, CalleeEntryCount->getType()));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
    return FAM.getCachedResult<AssumptionAnalysis>(F);
  };

  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };

  ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
                         GetTLI, PSI, GetBFI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}